Branch data Line data Source code
1 : : /* Loop Vectorization
2 : : Copyright (C) 2003-2025 Free Software Foundation, Inc.
3 : : Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 : : Ira Rosen <irar@il.ibm.com>
5 : :
6 : : This file is part of GCC.
7 : :
8 : : GCC is free software; you can redistribute it and/or modify it under
9 : : the terms of the GNU General Public License as published by the Free
10 : : Software Foundation; either version 3, or (at your option) any later
11 : : version.
12 : :
13 : : GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 : : WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 : : FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 : : for more details.
17 : :
18 : : You should have received a copy of the GNU General Public License
19 : : along with GCC; see the file COPYING3. If not see
20 : : <http://www.gnu.org/licenses/>. */
21 : :
22 : : #define INCLUDE_ALGORITHM
23 : : #include "config.h"
24 : : #include "system.h"
25 : : #include "coretypes.h"
26 : : #include "backend.h"
27 : : #include "target.h"
28 : : #include "rtl.h"
29 : : #include "tree.h"
30 : : #include "gimple.h"
31 : : #include "cfghooks.h"
32 : : #include "tree-pass.h"
33 : : #include "ssa.h"
34 : : #include "optabs-tree.h"
35 : : #include "memmodel.h"
36 : : #include "optabs.h"
37 : : #include "diagnostic-core.h"
38 : : #include "fold-const.h"
39 : : #include "stor-layout.h"
40 : : #include "cfganal.h"
41 : : #include "gimplify.h"
42 : : #include "gimple-iterator.h"
43 : : #include "gimplify-me.h"
44 : : #include "tree-ssa-loop-ivopts.h"
45 : : #include "tree-ssa-loop-manip.h"
46 : : #include "tree-ssa-loop-niter.h"
47 : : #include "tree-ssa-loop.h"
48 : : #include "cfgloop.h"
49 : : #include "tree-scalar-evolution.h"
50 : : #include "tree-vectorizer.h"
51 : : #include "gimple-fold.h"
52 : : #include "cgraph.h"
53 : : #include "tree-cfg.h"
54 : : #include "tree-if-conv.h"
55 : : #include "internal-fn.h"
56 : : #include "tree-vector-builder.h"
57 : : #include "vec-perm-indices.h"
58 : : #include "tree-eh.h"
59 : : #include "case-cfn-macros.h"
60 : : #include "langhooks.h"
61 : : #include "opts.h"
62 : :
63 : : /* Loop Vectorization Pass.
64 : :
65 : : This pass tries to vectorize loops.
66 : :
67 : : For example, the vectorizer transforms the following simple loop:
68 : :
69 : : short a[N]; short b[N]; short c[N]; int i;
70 : :
71 : : for (i=0; i<N; i++){
72 : : a[i] = b[i] + c[i];
73 : : }
74 : :
75 : : as if it were manually vectorized by rewriting the source code into:
76 : :
77 : : typedef int __attribute__((mode(V8HI))) v8hi;
78 : : short a[N]; short b[N]; short c[N]; int i;
79 : : v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
80 : : v8hi va, vb, vc;
81 : :
82 : : for (i=0; i<N/8; i++){
83 : : vb = pb[i];
84 : : vc = pc[i];
85 : : va = vb + vc;
86 : : pa[i] = va;
87 : : }
88 : :
89 : : The main entry to this pass is vectorize_loops(), in which
90 : : the vectorizer applies a set of analyses on a given set of loops,
91 : : followed by the actual vectorization transformation for the loops that
92 : : had successfully passed the analysis phase.
93 : : Throughout this pass we make a distinction between two types of
94 : : data: scalars (which are represented by SSA_NAMES), and memory references
95 : : ("data-refs"). These two types of data require different handling both
96 : : during analysis and transformation. The types of data-refs that the
97 : : vectorizer currently supports are ARRAY_REFS whose base is an array DECL
98 : : (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
99 : : accesses are required to have a simple (consecutive) access pattern.
100 : :
101 : : Analysis phase:
102 : : ===============
103 : : The driver for the analysis phase is vect_analyze_loop().
104 : : It applies a set of analyses, some of which rely on the scalar evolution
105 : : analyzer (scev) developed by Sebastian Pop.
106 : :
107 : : During the analysis phase the vectorizer records some information
108 : : per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
109 : : loop, as well as general information about the loop as a whole, which is
110 : : recorded in a "loop_vec_info" struct attached to each loop.
111 : :
112 : : Transformation phase:
113 : : =====================
114 : : The loop transformation phase scans all the stmts in the loop, and
115 : : creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
116 : : the loop that needs to be vectorized. It inserts the vector code sequence
117 : : just before the scalar stmt S, and records a pointer to the vector code
118 : : in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
119 : : attached to S). This pointer will be used for the vectorization of following
120 : : stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
121 : : otherwise, we rely on dead code elimination for removing it.
122 : :
123 : : For example, say stmt S1 was vectorized into stmt VS1:
124 : :
125 : : VS1: vb = px[i];
126 : : S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
127 : : S2: a = b;
128 : :
129 : : To vectorize stmt S2, the vectorizer first finds the stmt that defines
130 : : the operand 'b' (S1), and gets the relevant vector def 'vb' from the
131 : : vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
132 : : resulting sequence would be:
133 : :
134 : : VS1: vb = px[i];
135 : : S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
136 : : VS2: va = vb;
137 : : S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
138 : :
139 : : Operands that are not SSA_NAMEs are data-refs that appear in
140 : : load/store operations (like 'x[i]' in S1), and are handled differently.
141 : :
142 : : Target modeling:
143 : : =================
144 : : Currently the only target-specific information that is used is the
145 : : size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
146 : : Targets that can support different vector sizes will, for now, need to
147 : : specify a single value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
148 : : flexibility will be added in the future.
149 : :
150 : : Since we only vectorize operations whose vector form can be
151 : : expressed using existing tree codes, to verify that an operation is
152 : : supported, the vectorizer checks the relevant optab at the relevant
153 : : machine_mode (e.g., optab_handler (add_optab, V8HImode)); a short
154 : : sketch follows this comment. If the value found is CODE_FOR_nothing,
155 : : then there's no target support, and we can't vectorize the stmt.
156 : :
157 : : For additional information on this project see:
158 : : http://gcc.gnu.org/projects/tree-ssa/vectorization.html
159 : : */
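/* A minimal sketch of the target-support check described above, using the
   GCC-internal names optab_handler, add_optab, V8HImode and CODE_FOR_nothing
   from the example (the helper name is made up for illustration):

     static bool
     target_supports_v8hi_add_p (void)
     {
       // CODE_FOR_nothing means the target has no instruction pattern
       // for V8HI addition, so such stmts cannot be vectorized.
       return optab_handler (add_optab, V8HImode) != CODE_FOR_nothing;
     }

   The real checks are performed per stmt during the analysis phase.  */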
160 : :
161 : : static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
162 : : unsigned *);
163 : : static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
164 : : gphi **);
165 : :
166 : :
167 : : /* Function vect_is_simple_iv_evolution.
168 : :
169 : : FORNOW: A simple evolution of an induction variable in the loop is
170 : : considered a polynomial evolution. */
171 : :
172 : : static bool
173 : 658934 : vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn,
174 : : stmt_vec_info stmt_info)
175 : : {
176 : 658934 : tree init_expr;
177 : 658934 : tree step_expr;
178 : 658934 : tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
179 : 658934 : basic_block bb;
180 : :
181 : : /* When there is no evolution in this loop, the evolution function
182 : : is not "simple". */
183 : 658934 : if (evolution_part == NULL_TREE)
184 : : return false;
185 : :
186 : : /* When the evolution is a polynomial of degree >= 2
187 : : the evolution function is not "simple". */
188 : 700965 : if (tree_is_chrec (evolution_part))
189 : : return false;
190 : :
191 : 609559 : step_expr = evolution_part;
192 : 609559 : init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
193 : :
194 : 609559 : if (dump_enabled_p ())
195 : 36430 : dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
196 : : step_expr, init_expr);
197 : :
198 : 609559 : STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = init_expr;
199 : 609559 : STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = step_expr;
200 : :
201 : 609559 : if (TREE_CODE (step_expr) != INTEGER_CST
202 : 48667 : && (TREE_CODE (step_expr) != SSA_NAME
203 : 40913 : || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
204 : 40751 : && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
205 : 6660 : || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
206 : 111 : && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
207 : 111 : || !flag_associative_math)))
208 : 651647 : && (TREE_CODE (step_expr) != REAL_CST
209 : 443 : || !flag_associative_math))
210 : : {
211 : 42031 : if (dump_enabled_p ())
212 : 2711 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
213 : : "step unknown.\n");
214 : 42031 : return false;
215 : : }
216 : :
217 : : return true;
218 : : }
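/* For illustration, a minimal sketch using SCEV's {base, +, step} chrec
   notation: in

     for (i = 0; i < n; i++)
       {
         a[i] = x;     // i has access function {0, +, 1}_loop
         x = x + c;    // x has access function {x_0, +, c}_loop
       }

   both loop-header PHIs have a simple (linear) evolution, assuming c is
   defined outside the loop.  An IV updated as j = j * 2 has no such linear
   evolution part and is rejected here, though it may still be recognized
   as a nonlinear IV by the function below.  */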
219 : :
220 : : /* Function vect_is_nonlinear_iv_evolution
221 : :
222 : : Only support nonlinear induction for integer types:
223 : : 1. neg
224 : : 2. mul by constant
225 : : 3. lshift/rshift by constant.
226 : :
227 : : For neg induction, return a fake step as integer -1. */
228 : : static bool
229 : 89061 : vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
230 : : gphi* loop_phi_node)
231 : : {
232 : 89061 : tree init_expr, ev_expr, result, op1, op2;
233 : 89061 : gimple* def;
234 : :
235 : 89061 : if (gimple_phi_num_args (loop_phi_node) != 2)
236 : : return false;
237 : :
238 : 89061 : init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
239 : 89061 : ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
240 : :
241 : : /* Support nonlinear induction only for integer type. */
242 : 89061 : if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
243 : : return false;
244 : :
245 : 66255 : result = PHI_RESULT (loop_phi_node);
246 : :
247 : 66255 : if (TREE_CODE (ev_expr) != SSA_NAME
248 : 64149 : || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
249 : 66255 : || !is_gimple_assign (def))
250 : : return false;
251 : :
252 : 59391 : enum tree_code t_code = gimple_assign_rhs_code (def);
253 : 59391 : tree step;
254 : 59391 : switch (t_code)
255 : : {
256 : 1554 : case NEGATE_EXPR:
257 : 1554 : if (gimple_assign_rhs1 (def) != result)
258 : : return false;
259 : 1554 : step = build_int_cst (TREE_TYPE (init_expr), -1);
260 : 1554 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
261 : 1554 : break;
262 : :
263 : 9718 : case RSHIFT_EXPR:
264 : 9718 : case LSHIFT_EXPR:
265 : 9718 : case MULT_EXPR:
266 : 9718 : op1 = gimple_assign_rhs1 (def);
267 : 9718 : op2 = gimple_assign_rhs2 (def);
268 : 9718 : if (TREE_CODE (op2) != INTEGER_CST
269 : 6177 : || op1 != result)
270 : : return false;
271 : 6046 : step = op2;
272 : 6046 : if (t_code == LSHIFT_EXPR)
273 : 186 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
274 : 5860 : else if (t_code == RSHIFT_EXPR)
275 : 5255 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
276 : : /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
277 : : else
278 : 605 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
279 : : break;
280 : :
281 : : default:
282 : : return false;
283 : : }
284 : :
285 : 7600 : STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = init_expr;
286 : 7600 : STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = step;
287 : :
288 : 7600 : return true;
289 : : }
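/* For illustration, the update forms recognized above, all of integer type
   (a minimal sketch; x stands for the loop-header PHI result and the steps
   are compile-time constants):

     x = -x;        // NEGATE_EXPR -> vect_step_op_neg, fake step -1
     x = x * 3;     // MULT_EXPR   -> vect_step_op_mul, step 3
     x = x << 1;    // LSHIFT_EXPR -> vect_step_op_shl, step 1
     x = x >> 2;    // RSHIFT_EXPR -> vect_step_op_shr, step 2

   In each case the shifted/multiplied operand must be the PHI result
   itself and the other operand a constant.  */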
290 : :
291 : : /* Returns true if PHI is a first-order recurrence. A first-order
292 : : recurrence is a non-reduction recurrence relation in which the value of
293 : : the recurrence in the current loop iteration equals a value defined in
294 : : the previous iteration. */
295 : :
296 : : static bool
297 : 20972 : vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
298 : : gphi *phi)
299 : : {
300 : : /* A nested cycle isn't vectorizable as first order recurrence. */
301 : 20972 : if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
302 : : return false;
303 : :
304 : : /* Ensure the loop latch definition is from within the loop. */
305 : 20806 : edge latch = loop_latch_edge (loop);
306 : 20806 : tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
307 : 20806 : if (TREE_CODE (ldef) != SSA_NAME
308 : 18419 : || SSA_NAME_IS_DEFAULT_DEF (ldef)
309 : 18391 : || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
310 : 37917 : || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
311 : 4005 : return false;
312 : :
313 : 16801 : tree def = gimple_phi_result (phi);
314 : :
315 : : /* Ensure every use_stmt of the phi node is dominated by the latch
316 : : definition. */
317 : 16801 : imm_use_iterator imm_iter;
318 : 16801 : use_operand_p use_p;
319 : 19084 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
320 : 18591 : if (!is_gimple_debug (USE_STMT (use_p))
321 : 36188 : && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
322 : 10654 : || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
323 : : USE_STMT (use_p))))
324 : 16308 : return false;
325 : :
326 : : /* First-order recurrence autovectorization needs shuffle vector. */
327 : 493 : tree scalar_type = TREE_TYPE (def);
328 : 493 : tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
329 : 493 : if (!vectype)
330 : : return false;
331 : :
332 : : return true;
333 : : }
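/* For illustration (a minimal sketch): the loop-header PHI for 't' in

     t = a[0];
     for (i = 1; i < n; i++)
       {
         b[i] = a[i] - t;
         t = a[i];
       }

   is a first-order recurrence: each iteration consumes the value of 't'
   produced in the previous iteration, and 't' is not a reduction.  */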
334 : :
335 : : /* Function vect_analyze_scalar_cycles_1.
336 : :
337 : : Examine the cross iteration def-use cycles of scalar variables
338 : : in LOOP. LOOP_VINFO represents the loop that is now being
339 : : considered for vectorization (can be LOOP, or an outer-loop
340 : : enclosing LOOP). */
342 : :
343 : : static void
344 : 321367 : vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
345 : : {
346 : 321367 : basic_block bb = loop->header;
347 : 321367 : auto_vec<stmt_vec_info, 64> worklist;
348 : 321367 : gphi_iterator gsi;
349 : :
350 : 321367 : DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
351 : :
352 : : /* First - identify all inductions. Reduction detection assumes that all the
353 : : inductions have been identified, therefore, this order must not be
354 : : changed. */
355 : 1154704 : for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
356 : : {
357 : 833337 : gphi *phi = gsi.phi ();
358 : 833337 : tree access_fn = NULL;
359 : 833337 : tree def = PHI_RESULT (phi);
360 : 833337 : stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
361 : :
362 : : /* Skip virtual phi's. The data dependences that are associated with
363 : : virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
364 : 1666674 : if (virtual_operand_p (def))
365 : 258215 : continue;
366 : :
367 : : /* Skip already analyzed inner loop PHIs of double reductions. */
368 : 659828 : if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_vinfo)))
369 : 894 : continue;
370 : :
371 : 658934 : if (dump_enabled_p ())
372 : 38216 : dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
373 : : (gimple *) phi);
374 : :
375 : 658934 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
376 : :
377 : : /* Analyze the evolution function. */
378 : 658934 : access_fn = analyze_scalar_evolution (loop, def);
379 : 658934 : if (dump_enabled_p ())
380 : 38216 : dump_printf_loc (MSG_NOTE, vect_location,
381 : : "Access function of PHI: %T\n", access_fn);
382 : 658934 : if (access_fn)
383 : 658934 : STRIP_NOPS (access_fn);
384 : :
385 : 742746 : if ((!access_fn
386 : 658934 : || !vect_is_simple_iv_evolution (loop->num, access_fn, stmt_vinfo)
387 : 567528 : || (LOOP_VINFO_LOOP (loop_vinfo) != loop
388 : 9792 : && (TREE_CODE (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo))
389 : : != INTEGER_CST)))
390 : : /* Only handle nonlinear iv for same loop. */
391 : 750346 : && (LOOP_VINFO_LOOP (loop_vinfo) != loop
392 : 89061 : || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo, phi)))
393 : : {
394 : 83812 : worklist.safe_push (stmt_vinfo);
395 : 83812 : continue;
396 : : }
397 : :
398 : 575122 : gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
399 : : != NULL_TREE);
400 : 575122 : gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
401 : :
402 : 575122 : if (dump_enabled_p ())
403 : 33823 : dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
404 : 575122 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
405 : :
406 : : /* Mark if we have a non-linear IV. */
407 : 575122 : LOOP_VINFO_NON_LINEAR_IV (loop_vinfo)
408 : 575122 : = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_vinfo) != vect_step_op_add;
409 : : }
410 : :
411 : :
412 : : /* Second - identify all reductions and nested cycles. */
413 : 405179 : while (worklist.length () > 0)
414 : : {
415 : 83812 : stmt_vec_info stmt_vinfo = worklist.pop ();
416 : 83812 : gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
417 : 83812 : tree def = PHI_RESULT (phi);
418 : :
419 : 83812 : if (dump_enabled_p ())
420 : 4393 : dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
421 : : (gimple *) phi);
422 : :
423 : 167624 : gcc_assert (!virtual_operand_p (def)
424 : : && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
425 : :
426 : 83812 : gphi *double_reduc;
427 : 83812 : stmt_vec_info reduc_stmt_info
428 : 83812 : = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc);
429 : 83812 : if (reduc_stmt_info && double_reduc)
430 : : {
431 : 983 : stmt_vec_info inner_phi_info
432 : 983 : = loop_vinfo->lookup_stmt (double_reduc);
433 : : /* ??? Pass down flag we're the inner loop of a double reduc. */
434 : 983 : stmt_vec_info inner_reduc_info
435 : 983 : = vect_is_simple_reduction (loop_vinfo, inner_phi_info, NULL);
436 : 983 : if (inner_reduc_info)
437 : : {
438 : 894 : STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
439 : 894 : STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
440 : 894 : STMT_VINFO_REDUC_DEF (inner_phi_info) = inner_reduc_info;
441 : 894 : STMT_VINFO_REDUC_DEF (inner_reduc_info) = inner_phi_info;
442 : 894 : if (dump_enabled_p ())
443 : 118 : dump_printf_loc (MSG_NOTE, vect_location,
444 : : "Detected double reduction.\n");
445 : :
446 : 894 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
447 : 894 : STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
448 : 894 : STMT_VINFO_DEF_TYPE (inner_phi_info) = vect_nested_cycle;
449 : : /* Make it accessible for SLP vectorization. */
450 : 894 : LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt_info);
451 : : }
452 : 89 : else if (dump_enabled_p ())
453 : 12 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
454 : : "Unknown def-use cycle pattern.\n");
455 : : }
456 : 82829 : else if (reduc_stmt_info)
457 : : {
458 : 61857 : if (loop != LOOP_VINFO_LOOP (loop_vinfo))
459 : : {
460 : 2185 : if (dump_enabled_p ())
461 : 361 : dump_printf_loc (MSG_NOTE, vect_location,
462 : : "Detected vectorizable nested cycle.\n");
463 : :
464 : 2185 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
465 : : }
466 : : else
467 : : {
468 : 59672 : STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
469 : 59672 : STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
470 : 59672 : if (dump_enabled_p ())
471 : 3460 : dump_printf_loc (MSG_NOTE, vect_location,
472 : : "Detected reduction.\n");
473 : :
474 : 59672 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
475 : 59672 : STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
476 : 59672 : LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt_info);
477 : : }
478 : : }
479 : 20972 : else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
480 : 487 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
481 : : else
482 : 20485 : if (dump_enabled_p ())
483 : 368 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
484 : : "Unknown def-use cycle pattern.\n");
485 : : }
486 : 321367 : }
487 : :
488 : :
489 : : /* Function vect_analyze_scalar_cycles.
490 : :
491 : : Examine the cross iteration def-use cycles of scalar variables, by
492 : : analyzing the loop-header PHIs of scalar variables. Classify each
493 : : cycle as one of the following: invariant, induction, reduction, unknown.
494 : : We do that for the loop represented by LOOP_VINFO, and also for its
495 : : inner loop, if it exists.
496 : : Examples for scalar cycles:
497 : :
498 : : Example1: reduction:
499 : :
500 : : loop1:
501 : : for (i=0; i<N; i++)
502 : : sum += a[i];
503 : :
504 : : Example2: induction:
505 : :
506 : : loop2:
507 : : for (i=0; i<N; i++)
508 : : a[i] = i; */
509 : :
510 : : static void
511 : 316389 : vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
512 : : {
513 : 316389 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
514 : :
515 : 316389 : vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
516 : :
517 : : /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
518 : : Reductions in such inner-loop therefore have different properties than
519 : : the reductions in the nest that gets vectorized:
520 : : 1. When vectorized, they are executed in the same order as in the original
521 : : scalar loop, so we can't change the order of computation when
522 : : vectorizing them.
523 : : 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
524 : : current checks are too strict. */
525 : :
526 : 316389 : if (loop->inner)
527 : 4978 : vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
528 : 316389 : }
529 : :
530 : : /* Function vect_get_loop_niters.
531 : :
532 : : Determine the number of iterations the loop executes and place it
533 : : in NUMBER_OF_ITERATIONS. Place the number of latch iterations
534 : : in NUMBER_OF_ITERATIONSM1. Place the condition under which the
535 : : niter information holds in ASSUMPTIONS.
536 : :
537 : : Return the loop exit conditions. */
538 : :
539 : :
540 : : static vec<gcond *>
541 : 263387 : vect_get_loop_niters (class loop *loop, const_edge main_exit, tree *assumptions,
542 : : tree *number_of_iterations, tree *number_of_iterationsm1)
543 : : {
544 : 263387 : auto_vec<edge> exits = get_loop_exit_edges (loop);
545 : 263387 : vec<gcond *> conds;
546 : 526774 : conds.create (exits.length ());
547 : 263387 : class tree_niter_desc niter_desc;
548 : 263387 : tree niter_assumptions, niter, may_be_zero;
549 : :
550 : 263387 : *assumptions = boolean_true_node;
551 : 263387 : *number_of_iterationsm1 = chrec_dont_know;
552 : 263387 : *number_of_iterations = chrec_dont_know;
553 : :
554 : 263387 : DUMP_VECT_SCOPE ("get_loop_niters");
555 : :
556 : 263387 : if (exits.is_empty ())
557 : 0 : return conds;
558 : :
559 : 263387 : if (dump_enabled_p ())
560 : 13758 : dump_printf_loc (MSG_NOTE, vect_location, "Loop has %d exits.\n",
561 : : exits.length ());
562 : :
563 : : edge exit;
564 : : unsigned int i;
565 : 635903 : FOR_EACH_VEC_ELT (exits, i, exit)
566 : : {
567 : 372516 : gcond *cond = get_loop_exit_condition (exit);
568 : 372516 : if (cond)
569 : 362479 : conds.safe_push (cond);
570 : :
571 : 372516 : if (dump_enabled_p ())
572 : 14728 : dump_printf_loc (MSG_NOTE, vect_location, "Analyzing exit %d...\n", i);
573 : :
574 : 372516 : if (exit != main_exit)
575 : 150005 : continue;
576 : :
577 : 263387 : may_be_zero = NULL_TREE;
578 : 263387 : if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
579 : 263387 : || chrec_contains_undetermined (niter_desc.niter))
580 : 40876 : continue;
581 : :
582 : 222511 : niter_assumptions = niter_desc.assumptions;
583 : 222511 : may_be_zero = niter_desc.may_be_zero;
584 : 222511 : niter = niter_desc.niter;
585 : :
586 : 222511 : if (may_be_zero && integer_zerop (may_be_zero))
587 : : may_be_zero = NULL_TREE;
588 : :
589 : 13147 : if (may_be_zero)
590 : : {
591 : 13147 : if (COMPARISON_CLASS_P (may_be_zero))
592 : : {
593 : : /* Try to combine may_be_zero with assumptions, this can simplify
594 : : computation of niter expression. */
595 : 13147 : if (niter_assumptions && !integer_nonzerop (niter_assumptions))
596 : 1228 : niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
597 : : niter_assumptions,
598 : : fold_build1 (TRUTH_NOT_EXPR,
599 : : boolean_type_node,
600 : : may_be_zero));
601 : : else
602 : 11919 : niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
603 : : build_int_cst (TREE_TYPE (niter), 0),
604 : : rewrite_to_non_trapping_overflow (niter));
605 : :
606 : 222511 : may_be_zero = NULL_TREE;
607 : : }
608 : 0 : else if (integer_nonzerop (may_be_zero))
609 : : {
610 : 0 : *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
611 : 0 : *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
612 : 0 : continue;
613 : : }
614 : : else
615 : 0 : continue;
616 : : }
617 : :
618 : : /* Loop assumptions are based off the normal exit. */
619 : 222511 : *assumptions = niter_assumptions;
620 : 222511 : *number_of_iterationsm1 = niter;
621 : :
622 : : /* We want the number of loop header executions which is the number
623 : : of latch executions plus one.
624 : : ??? For UINT_MAX latch executions this number overflows to zero
625 : : for loops like do { n++; } while (n != 0); */
626 : 222511 : if (niter && !chrec_contains_undetermined (niter))
627 : : {
628 : 222511 : niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter),
629 : : unshare_expr (niter),
630 : : build_int_cst (TREE_TYPE (niter), 1));
631 : 222511 : if (TREE_CODE (niter) == INTEGER_CST
632 : 119943 : && TREE_CODE (*number_of_iterationsm1) != INTEGER_CST)
633 : : {
634 : : /* If we manage to fold niter + 1 into INTEGER_CST even when
635 : : niter is some complex expression, ensure back
636 : : *number_of_iterationsm1 is an INTEGER_CST as well. See
637 : : PR113210. */
638 : 4 : *number_of_iterationsm1
639 : 4 : = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), niter,
640 : : build_minus_one_cst (TREE_TYPE (niter)));
641 : : }
642 : : }
643 : 222511 : *number_of_iterations = niter;
644 : : }
645 : :
646 : 263387 : if (dump_enabled_p ())
647 : 13758 : dump_printf_loc (MSG_NOTE, vect_location, "All loop exits successfully analyzed.\n");
648 : :
649 : 263387 : return conds;
650 : 263387 : }
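/* For illustration (a minimal worked example): a loop whose latch runs 7
   times executes its header 8 times, so NUMBER_OF_ITERATIONSM1 is 7 and
   NUMBER_OF_ITERATIONS is 8; as noted above, for a latch that runs
   UINT_MAX times the +1 wraps the header count to zero.  */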
651 : :
652 : : /* Determine the main loop exit for the vectorizer. */
653 : :
654 : : edge
655 : 495925 : vec_init_loop_exit_info (class loop *loop)
656 : : {
657 : : /* Before we begin we must first determine which exit is the main one and
658 : : which are auxiliary exits. */
659 : 495925 : auto_vec<edge> exits = get_loop_exit_edges (loop);
660 : 495925 : if (exits.length () == 1)
661 : 316545 : return exits[0];
662 : :
663 : : /* If we have multiple exits we only support a counting IV at the moment.
664 : : Analyze all exits and return the last one we can analyze. */
665 : 179380 : class tree_niter_desc niter_desc;
666 : 179380 : edge candidate = NULL;
667 : 1170924 : for (edge exit : exits)
668 : : {
669 : 642686 : if (!get_loop_exit_condition (exit))
670 : 149973 : continue;
671 : :
672 : 492713 : if (number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
673 : 492713 : && !chrec_contains_undetermined (niter_desc.niter))
674 : : {
675 : 137080 : tree may_be_zero = niter_desc.may_be_zero;
676 : 137080 : if ((integer_zerop (may_be_zero)
677 : : /* As we are handling may_be_zero that's not false by
678 : : rewriting niter to may_be_zero ? 0 : niter we require
679 : : an empty latch. */
680 : 655789 : || (single_pred_p (loop->latch)
681 : 12745 : && exit->src == single_pred (loop->latch)
682 : 4263 : && (integer_nonzerop (may_be_zero)
683 : 4263 : || COMPARISON_CLASS_P (may_be_zero))))
684 : 141343 : && (!candidate
685 : 6655 : || dominated_by_p (CDI_DOMINATORS, exit->src,
686 : 6655 : candidate->src)))
687 : : candidate = exit;
688 : : }
689 : : }
690 : :
691 : 179380 : return candidate;
692 : 179380 : }
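/* For illustration (a minimal sketch): in

     for (i = 0; i < n; i++)
       if (a[i] == key)
         break;

   the loop has two exits.  The early exit's iteration count is data
   dependent and cannot be analyzed, while the counted exit (i < n) can,
   so the counted exit ends up as the candidate returned above.  */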
693 : :
694 : : /* Function bb_in_loop_p
695 : :
696 : : Used as predicate for dfs order traversal of the loop bbs. */
697 : :
698 : : static bool
699 : 1305201 : bb_in_loop_p (const_basic_block bb, const void *data)
700 : : {
701 : 1305201 : const class loop *const loop = (const class loop *)data;
702 : 1305201 : if (flow_bb_inside_loop_p (loop, bb))
703 : : return true;
704 : : return false;
705 : : }
706 : :
707 : :
708 : : /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
709 : : stmt_vec_info structs for all the stmts in LOOP_IN. */
710 : :
711 : 411411 : _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
712 : : : vec_info (vec_info::loop, shared),
713 : 411411 : loop (loop_in),
714 : 411411 : num_itersm1 (NULL_TREE),
715 : 411411 : num_iters (NULL_TREE),
716 : 411411 : num_iters_unchanged (NULL_TREE),
717 : 411411 : num_iters_assumptions (NULL_TREE),
718 : 411411 : vector_costs (nullptr),
719 : 411411 : scalar_costs (nullptr),
720 : 411411 : th (0),
721 : 411411 : versioning_threshold (0),
722 : 411411 : vectorization_factor (0),
723 : 411411 : main_loop_edge (nullptr),
724 : 411411 : skip_main_loop_edge (nullptr),
725 : 411411 : skip_this_loop_edge (nullptr),
726 : 411411 : reusable_accumulators (),
727 : 411411 : suggested_unroll_factor (1),
728 : 411411 : max_vectorization_factor (0),
729 : 411411 : mask_skip_niters (NULL_TREE),
730 : 411411 : mask_skip_niters_pfa_offset (NULL_TREE),
731 : 411411 : rgroup_compare_type (NULL_TREE),
732 : 411411 : simd_if_cond (NULL_TREE),
733 : 411411 : partial_vector_style (vect_partial_vectors_none),
734 : 411411 : unaligned_dr (NULL),
735 : 411411 : peeling_for_alignment (0),
736 : 411411 : ptr_mask (0),
737 : 411411 : max_spec_read_amount (0),
738 : 411411 : nonlinear_iv (false),
739 : 411411 : ivexpr_map (NULL),
740 : 411411 : scan_map (NULL),
741 : 411411 : slp_unrolling_factor (1),
742 : 411411 : inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
743 : 411411 : vectorizable (false),
744 : 411411 : can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
745 : 411411 : must_use_partial_vectors_p (false),
746 : 411411 : using_partial_vectors_p (false),
747 : 411411 : using_decrementing_iv_p (false),
748 : 411411 : using_select_vl_p (false),
749 : 411411 : epil_using_partial_vectors_p (false),
750 : 411411 : allow_mutual_alignment (false),
751 : 411411 : partial_load_store_bias (0),
752 : 411411 : peeling_for_gaps (false),
753 : 411411 : peeling_for_niter (false),
754 : 411411 : early_breaks (false),
755 : 411411 : user_unroll (false),
756 : 411411 : no_data_dependencies (false),
757 : 411411 : has_mask_store (false),
758 : 411411 : scalar_loop_scaling (profile_probability::uninitialized ()),
759 : 411411 : scalar_loop (NULL),
760 : 411411 : main_loop_info (NULL),
761 : 411411 : orig_loop_info (NULL),
762 : 411411 : epilogue_vinfo (NULL),
763 : 411411 : drs_advanced_by (NULL_TREE),
764 : 411411 : vec_loop_iv_exit (NULL),
765 : 411411 : vec_epilogue_loop_iv_exit (NULL),
766 : 411411 : scalar_loop_iv_exit (NULL)
767 : : {
768 : : /* CHECKME: We want to visit all BBs before their successors (except for
769 : : latch blocks, for which this assertion wouldn't hold). In the simple
770 : : case of the loop forms we allow, a dfs order of the BBs would be the same
771 : : as reversed postorder traversal, so we are safe. */
772 : :
773 : 411411 : bbs = XCNEWVEC (basic_block, loop->num_nodes);
774 : 822822 : nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p, bbs,
775 : 411411 : loop->num_nodes, loop);
776 : 411411 : gcc_assert (nbbs == loop->num_nodes);
777 : :
778 : 1488059 : for (unsigned int i = 0; i < nbbs; i++)
779 : : {
780 : 1076648 : basic_block bb = bbs[i];
781 : 1076648 : gimple_stmt_iterator si;
782 : :
783 : 2186672 : for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
784 : : {
785 : 1110024 : gimple *phi = gsi_stmt (si);
786 : 1110024 : gimple_set_uid (phi, 0);
787 : 1110024 : add_stmt (phi);
788 : : }
789 : :
790 : 9239353 : for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
791 : : {
792 : 7086057 : gimple *stmt = gsi_stmt (si);
793 : 7086057 : gimple_set_uid (stmt, 0);
794 : 7086057 : if (is_gimple_debug (stmt))
795 : 2686531 : continue;
796 : 4399526 : add_stmt (stmt);
797 : : /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
798 : : third argument is the #pragma omp simd if (x) condition, when 0,
799 : : loop shouldn't be vectorized, when non-zero constant, it should
800 : : be vectorized normally, otherwise versioned with vectorized loop
801 : : done if the condition is non-zero at runtime. */
802 : 4399526 : if (loop_in->simduid
803 : 43319 : && is_gimple_call (stmt)
804 : 4262 : && gimple_call_internal_p (stmt)
805 : 4135 : && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
806 : 4131 : && gimple_call_num_args (stmt) >= 3
807 : 103 : && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
808 : 4399629 : && (loop_in->simduid
809 : 103 : == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
810 : : {
811 : 103 : tree arg = gimple_call_arg (stmt, 2);
812 : 103 : if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
813 : 103 : simd_if_cond = arg;
814 : : else
815 : 0 : gcc_assert (integer_nonzerop (arg));
816 : : }
817 : : }
818 : : }
819 : 411411 : }
820 : :
821 : : /* Free all levels of rgroup CONTROLS. */
822 : :
823 : : void
824 : 1063110 : release_vec_loop_controls (vec<rgroup_controls> *controls)
825 : : {
826 : 1063110 : rgroup_controls *rgc;
827 : 1063110 : unsigned int i;
828 : 1063155 : FOR_EACH_VEC_ELT (*controls, i, rgc)
829 : 45 : rgc->controls.release ();
830 : 1063110 : controls->release ();
831 : 1063110 : }
832 : :
833 : : /* Free all memory used by the _loop_vec_info, as well as all the
834 : : stmt_vec_info structs of all the stmts in the loop. */
835 : :
836 : 411411 : _loop_vec_info::~_loop_vec_info ()
837 : : {
838 : 411411 : free (bbs);
839 : :
840 : 411411 : release_vec_loop_controls (&masks.rgc_vec);
841 : 411411 : release_vec_loop_controls (&lens);
842 : 415217 : delete ivexpr_map;
843 : 411733 : delete scan_map;
844 : 411411 : delete scalar_costs;
845 : 411411 : delete vector_costs;
846 : 549716 : for (auto reduc_info : reduc_infos)
847 : 131890 : delete reduc_info;
848 : :
849 : : /* When we release an epilogue vinfo that we do not intend to use,
850 : : avoid clearing AUX of the main loop, which should continue to
851 : : point to the main loop vinfo since otherwise we'll leak that. */
852 : 411411 : if (loop->aux == this)
853 : 59215 : loop->aux = NULL;
854 : 822822 : }
855 : :
856 : : /* Return an invariant or register for EXPR and emit necessary
857 : : computations in the LOOP_VINFO loop preheader. */
858 : :
859 : : tree
860 : 19641 : cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
861 : : {
862 : 19641 : if (is_gimple_reg (expr)
863 : 19641 : || is_gimple_min_invariant (expr))
864 : 6479 : return expr;
865 : :
866 : 13162 : if (! loop_vinfo->ivexpr_map)
867 : 3806 : loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
868 : 13162 : tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
869 : 13162 : if (! cached)
870 : : {
871 : 8474 : gimple_seq stmts = NULL;
872 : 8474 : cached = force_gimple_operand (unshare_expr (expr),
873 : : &stmts, true, NULL_TREE);
874 : 8474 : if (stmts)
875 : : {
876 : 8334 : edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
877 : 8334 : gsi_insert_seq_on_edge_immediate (e, stmts);
878 : : }
879 : : }
880 : 13162 : return cached;
881 : : }
882 : :
883 : : /* Return true if we can use CMP_TYPE as the comparison type to produce
884 : : all masks required to mask LOOP_VINFO. */
885 : :
886 : : static bool
887 : 129 : can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
888 : : {
889 : 129 : rgroup_controls *rgm;
890 : 129 : unsigned int i;
891 : 181 : FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
892 : 181 : if (rgm->type != NULL_TREE
893 : 181 : && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
894 : : cmp_type, rgm->type,
895 : : OPTIMIZE_FOR_SPEED))
896 : : return false;
897 : : return true;
898 : : }
899 : :
900 : : /* Calculate the maximum number of scalars per iteration for every
901 : : rgroup in LOOP_VINFO. */
902 : :
903 : : static unsigned int
904 : 31 : vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
905 : : {
906 : 31 : unsigned int res = 1;
907 : 31 : unsigned int i;
908 : 31 : rgroup_controls *rgm;
909 : 216 : FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
910 : 185 : res = MAX (res, rgm->max_nscalars_per_iter);
911 : 31 : return res;
912 : : }
913 : :
914 : : /* Calculate the minimum precision necessary to represent:
915 : :
916 : : MAX_NITERS * FACTOR
917 : :
918 : : as an unsigned integer, where MAX_NITERS is the maximum number of
919 : : loop header iterations for the original scalar form of LOOP_VINFO. */
920 : :
921 : : static unsigned
922 : 31 : vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
923 : : {
924 : 31 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
925 : :
926 : : /* Get the maximum number of iterations that is representable
927 : : in the counter type. */
928 : 31 : tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
929 : 31 : widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
930 : :
931 : : /* Get a more refined estimate for the number of iterations. */
932 : 31 : widest_int max_back_edges;
933 : 31 : if (max_loop_iterations (loop, &max_back_edges))
934 : 31 : max_ni = wi::smin (max_ni, max_back_edges + 1);
935 : :
936 : : /* Work out how many bits we need to represent the limit. */
937 : 31 : return wi::min_precision (max_ni * factor, UNSIGNED);
938 : 31 : }
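/* For example (a minimal worked case): if the loop is known to run at most
   1000 header iterations and FACTOR is 2, the product 2000 needs
   wi::min_precision (2000, UNSIGNED) == 11 bits.  */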
939 : :
940 : : /* True if the loop needs peeling or partial vectors when vectorized. */
941 : :
942 : : static bool
943 : 117060 : vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
944 : : {
945 : 117060 : unsigned HOST_WIDE_INT const_vf;
946 : 117060 : HOST_WIDE_INT max_niter
947 : 117060 : = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
948 : :
949 : 117060 : unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
950 : 117060 : if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
951 : 15093 : th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
952 : : (loop_vinfo));
953 : :
954 : 117060 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
955 : 54120 : && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
956 : : {
957 : : /* Work out the (constant) number of iterations that need to be
958 : : peeled for reasons other than niters. */
959 : 54084 : unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
960 : 54084 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
961 : 387 : peel_niter += 1;
962 : 115964 : if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
963 : : LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
964 : : return true;
965 : : }
966 : 62976 : else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
967 : : /* ??? When peeling for gaps but not alignment, we could
968 : : try to check whether the (variable) niters is known to be
969 : : VF * N + 1. That's something of a niche case though. */
970 : 62743 : || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
971 : 61870 : || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
972 : 124846 : || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
973 : 123740 : < (unsigned) exact_log2 (const_vf))
974 : : /* In case of versioning, check if the maximum number of
975 : : iterations is greater than th. If they are identical,
976 : : the epilogue is unnecessary. */
977 : 60800 : && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
978 : 4356 : || ((unsigned HOST_WIDE_INT) max_niter
979 : : /* We'd like to use LOOP_VINFO_VERSIONING_THRESHOLD
980 : : but that's only computed later based on our result.
981 : : The following is the most conservative approximation. */
982 : 4356 : > (std::max ((unsigned HOST_WIDE_INT) th,
983 : 4356 : const_vf) / const_vf) * const_vf))))
984 : 61880 : return true;
985 : :
986 : : return false;
987 : : }
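/* For illustration (a minimal worked example): with a known iteration count
   of 100, a vectorization factor of 8 and no peeling for alignment or gaps,
   100 is not a multiple of 8, so 4 iterations are left over and the function
   returns true (an epilogue or partial vectors is needed); with 96
   iterations it would return false.  */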
988 : :
989 : : /* Each statement in LOOP_VINFO can be masked where necessary. Check
990 : : whether we can actually generate the masks required. Return true if so,
991 : : storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
992 : :
993 : : static bool
994 : 31 : vect_verify_full_masking (loop_vec_info loop_vinfo)
995 : : {
996 : 31 : unsigned int min_ni_width;
997 : :
998 : : /* Use a normal loop if there are no statements that need masking.
999 : : This only happens in rare degenerate cases: it means that the loop
1000 : : has no loads, no stores, and no live-out values. */
1001 : 31 : if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1002 : : return false;
1003 : :
1004 : : /* Produce the rgroup controls. */
1005 : 113 : for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1006 : : {
1007 : 41 : vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1008 : 41 : tree vectype = mask.first;
1009 : 41 : unsigned nvectors = mask.second;
1010 : :
1011 : 51 : if (masks->rgc_vec.length () < nvectors)
1012 : 34 : masks->rgc_vec.safe_grow_cleared (nvectors, true);
1013 : 41 : rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
1014 : : /* The number of scalars per iteration and the number of vectors are
1015 : : both compile-time constants. */
1016 : 41 : unsigned int nscalars_per_iter
1017 : 41 : = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1018 : 41 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1019 : :
1020 : 41 : if (rgm->max_nscalars_per_iter < nscalars_per_iter)
1021 : : {
1022 : 41 : rgm->max_nscalars_per_iter = nscalars_per_iter;
1023 : 41 : rgm->type = truth_type_for (vectype);
1024 : 41 : rgm->factor = 1;
1025 : : }
1026 : : }
1027 : :
1028 : 31 : unsigned int max_nscalars_per_iter
1029 : 31 : = vect_get_max_nscalars_per_iter (loop_vinfo);
1030 : :
1031 : : /* Work out how many bits we need to represent the limit. */
1032 : 31 : min_ni_width
1033 : 31 : = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1034 : :
1035 : : /* Find a scalar mode for which WHILE_ULT is supported. */
1036 : 31 : opt_scalar_int_mode cmp_mode_iter;
1037 : 31 : tree cmp_type = NULL_TREE;
1038 : 31 : tree iv_type = NULL_TREE;
1039 : 31 : widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1040 : 31 : unsigned int iv_precision = UINT_MAX;
1041 : :
1042 : 31 : if (iv_limit != -1)
1043 : 31 : iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1044 : : UNSIGNED);
1045 : :
1046 : 248 : FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1047 : : {
1048 : 217 : unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1049 : 217 : if (cmp_bits >= min_ni_width
1050 : 217 : && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1051 : : {
1052 : 129 : tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1053 : 129 : if (this_type
1054 : 129 : && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1055 : : {
1056 : : /* Although we could stop as soon as we find a valid mode,
1057 : : there are at least two reasons why that's not always the
1058 : : best choice:
1059 : :
1060 : : - An IV that's Pmode or wider is more likely to be reusable
1061 : : in address calculations than an IV that's narrower than
1062 : : Pmode.
1063 : :
1064 : : - Doing the comparison in IV_PRECISION or wider allows
1065 : : a natural 0-based IV, whereas using a narrower comparison
1066 : : type requires mitigations against wrap-around.
1067 : :
1068 : : Conversely, if the IV limit is variable, doing the comparison
1069 : : in a wider type than the original type can introduce
1070 : : unnecessary extensions, so picking the widest valid mode
1071 : : is not always a good choice either.
1072 : :
1073 : : Here we prefer the first IV type that's Pmode or wider,
1074 : : and the first comparison type that's IV_PRECISION or wider.
1075 : : (The comparison type must be no wider than the IV type,
1076 : : to avoid extensions in the vector loop.)
1077 : :
1078 : : ??? We might want to try continuing beyond Pmode for ILP32
1079 : : targets if CMP_BITS < IV_PRECISION. */
1080 : 0 : iv_type = this_type;
1081 : 0 : if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1082 : : cmp_type = this_type;
1083 : 0 : if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1084 : : break;
1085 : : }
1086 : : }
1087 : : }
1088 : :
1089 : 31 : if (!cmp_type)
1090 : : {
1091 : 31 : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
1092 : 31 : return false;
1093 : : }
1094 : :
1095 : 0 : LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1096 : 0 : LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1097 : 0 : LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
1098 : 0 : return true;
1099 : 31 : }
1100 : :
1101 : : /* Each statement in LOOP_VINFO can be masked where necessary. Check
1102 : : whether we can actually generate AVX512 style masks. Return true if so,
1103 : : storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE. */
1104 : :
1105 : : static bool
1106 : 31 : vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
1107 : : {
1108 : : /* Produce a differently organized rgc_vec and check differently
1109 : : whether we can produce the masks. */
1110 : :
1111 : : /* Use a normal loop if there are no statements that need masking.
1112 : : This only happens in rare degenerate cases: it means that the loop
1113 : : has no loads, no stores, and no live-out values. */
1114 : 31 : if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1115 : : return false;
1116 : :
1117 : : /* For the decrementing IV we need to represent all values in
1118 : : [0, niter + niter_skip] where niter_skip is the elements we
1119 : : skip in the first iteration for prologue peeling. */
1120 : 31 : tree iv_type = NULL_TREE;
1121 : 31 : widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1122 : 31 : unsigned int iv_precision = UINT_MAX;
1123 : 31 : if (iv_limit != -1)
1124 : 31 : iv_precision = wi::min_precision (iv_limit, UNSIGNED);
1125 : :
1126 : : /* First compute the type for the IV we use to track the remaining
1127 : : scalar iterations. */
1128 : 31 : opt_scalar_int_mode cmp_mode_iter;
1129 : 58 : FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1130 : : {
1131 : 58 : unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1132 : 58 : if (cmp_bits >= iv_precision
1133 : 58 : && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1134 : : {
1135 : 31 : iv_type = build_nonstandard_integer_type (cmp_bits, true);
1136 : 31 : if (iv_type)
1137 : : break;
1138 : : }
1139 : : }
1140 : 31 : if (!iv_type)
1141 : : return false;
1142 : :
1143 : : /* Produce the rgroup controls. */
1144 : 113 : for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1145 : : {
1146 : 41 : vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1147 : 41 : tree vectype = mask.first;
1148 : 41 : unsigned nvectors = mask.second;
1149 : :
1150 : : /* The number of scalars per iteration and the number of vectors are
1151 : : both compile-time constants. */
1152 : 41 : unsigned int nscalars_per_iter
1153 : 41 : = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1154 : 41 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1155 : :
1156 : : /* We index the rgroup_controls vector with nscalars_per_iter
1157 : : which we keep constant and instead have a varying nvectors,
1158 : : remembering the vector mask with the fewest nV. */
1159 : 51 : if (masks->rgc_vec.length () < nscalars_per_iter)
1160 : 33 : masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true);
1161 : 41 : rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];
1162 : :
1163 : 41 : if (!rgm->type || rgm->factor > nvectors)
1164 : : {
1165 : 40 : rgm->type = truth_type_for (vectype);
1166 : 40 : rgm->compare_type = NULL_TREE;
1167 : 40 : rgm->max_nscalars_per_iter = nscalars_per_iter;
1168 : 40 : rgm->factor = nvectors;
1169 : 40 : rgm->bias_adjusted_ctrl = NULL_TREE;
1170 : : }
1171 : : }
1172 : :
1173 : : /* There is no fixed compare type we are going to use but we have to
1174 : : be able to get at one for each mask group. */
1175 : 31 : unsigned int min_ni_width
1176 : 31 : = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);
1177 : :
1178 : 31 : bool ok = true;
1179 : 138 : for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1180 : : {
1181 : 45 : tree mask_type = rgc.type;
1182 : 45 : if (!mask_type)
1183 : 10 : continue;
1184 : :
1185 : : /* For now vect_get_loop_mask only supports integer mode masks
1186 : : when we need to split it. */
1187 : 35 : if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_INT
1188 : 35 : || TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
1189 : : {
1190 : : ok = false;
1191 : : break;
1192 : : }
1193 : :
1194 : : /* If iv_type is usable as compare type use that - we can elide the
1195 : : saturation in that case. */
1196 : 35 : if (TYPE_PRECISION (iv_type) >= min_ni_width)
1197 : : {
1198 : 35 : tree cmp_vectype
1199 : 35 : = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
1200 : 35 : if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1201 : 6 : rgc.compare_type = cmp_vectype;
1202 : : }
1203 : 35 : if (!rgc.compare_type)
1204 : 66 : FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1205 : : {
1206 : 66 : unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1207 : 66 : if (cmp_bits >= min_ni_width
1208 : 66 : && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1209 : : {
1210 : 66 : tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
1211 : 66 : if (!cmp_type)
1212 : 0 : continue;
1213 : :
1214 : : /* Check whether we can produce the mask with cmp_type. */
1215 : 66 : tree cmp_vectype
1216 : 66 : = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
1217 : 66 : if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1218 : : {
1219 : 29 : rgc.compare_type = cmp_vectype;
1220 : 29 : break;
1221 : : }
1222 : : }
1223 : : }
1224 : 35 : if (!rgc.compare_type)
1225 : : {
1226 : : ok = false;
1227 : : break;
1228 : : }
1229 : : }
1230 : 31 : if (!ok)
1231 : : {
1232 : 0 : release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
1233 : 0 : return false;
1234 : : }
1235 : :
1236 : 31 : LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
1237 : 31 : LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1238 : 31 : LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
1239 : 31 : return true;
1240 : 31 : }
1241 : :
1242 : : /* Check whether we can use vector access with length based on precision
1243 : : comparison. So far, to keep it simple, we only allow the case that the
1244 : : precision of the target-supported length is larger than the precision
1245 : : required by loop niters. */
1246 : :
1247 : : static bool
1248 : 0 : vect_verify_loop_lens (loop_vec_info loop_vinfo)
1249 : : {
1250 : 0 : if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1251 : : return false;
1252 : :
1253 : 0 : if (!VECTOR_MODE_P (loop_vinfo->vector_mode))
1254 : : return false;
1255 : :
1256 : 0 : machine_mode len_load_mode, len_store_mode;
1257 : 0 : if (!get_len_load_store_mode (loop_vinfo->vector_mode, true)
1258 : 0 : .exists (&len_load_mode))
1259 : 0 : return false;
1260 : 0 : if (!get_len_load_store_mode (loop_vinfo->vector_mode, false)
1261 : 0 : .exists (&len_store_mode))
1262 : 0 : return false;
1263 : :
1264 : 0 : signed char partial_load_bias = internal_len_load_store_bias
1265 : 0 : (IFN_LEN_LOAD, len_load_mode);
1266 : :
1267 : 0 : signed char partial_store_bias = internal_len_load_store_bias
1268 : 0 : (IFN_LEN_STORE, len_store_mode);
1269 : :
1270 : 0 : gcc_assert (partial_load_bias == partial_store_bias);
1271 : :
1272 : 0 : if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1273 : : return false;
1274 : :
1275 : : /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1276 : : len_loads with a length of zero. In order to avoid that we prohibit
1277 : : more than one loop length here. */
1278 : 0 : if (partial_load_bias == -1
1279 : 0 : && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1280 : : return false;
1281 : :
1282 : 0 : LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1283 : :
1284 : 0 : unsigned int max_nitems_per_iter = 1;
1285 : 0 : unsigned int i;
1286 : 0 : rgroup_controls *rgl;
1287 : : /* Find the maximum number of items per iteration for every rgroup. */
1288 : 0 : FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1289 : : {
1290 : 0 : unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1291 : 0 : max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1292 : : }
1293 : :
1294 : : /* Work out how many bits we need to represent the length limit. */
1295 : 0 : unsigned int min_ni_prec
1296 : 0 : = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1297 : :
1298 : : /* Now use the maximum of the precisions below for one suitable IV type:
1299 : : - the IV's natural precision
1300 : : - the precision needed to hold: the maximum number of scalar
1301 : : iterations multiplied by the scale factor (min_ni_prec above)
1302 : : - the Pmode precision
1303 : :
1304 : : If min_ni_prec is less than the precision of the current niters,
1305 : : we prefer to still use the niters type. Prefer to use Pmode or a
1306 : : wider IV to avoid narrow conversions. */
1307 : :
1308 : 0 : unsigned int ni_prec
1309 : 0 : = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1310 : 0 : min_ni_prec = MAX (min_ni_prec, ni_prec);
1311 : 0 : min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1312 : :
1313 : 0 : tree iv_type = NULL_TREE;
1314 : 0 : opt_scalar_int_mode tmode_iter;
1315 : 0 : FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1316 : : {
1317 : 0 : scalar_mode tmode = tmode_iter.require ();
1318 : 0 : unsigned int tbits = GET_MODE_BITSIZE (tmode);
1319 : :
1320 : : /* ??? Do we really want to construct one IV whose precision exceeds
1321 : : BITS_PER_WORD? */
1322 : 0 : if (tbits > BITS_PER_WORD)
1323 : : break;
1324 : :
1325 : : /* Find the first available standard integral type. */
1326 : 0 : if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1327 : : {
1328 : 0 : iv_type = build_nonstandard_integer_type (tbits, true);
1329 : 0 : break;
1330 : : }
1331 : : }
1332 : :
1333 : 0 : if (!iv_type)
1334 : : {
1335 : 0 : if (dump_enabled_p ())
1336 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1337 : : "can't vectorize with length-based partial vectors"
1338 : : " because there is no suitable iv type.\n");
1339 : 0 : return false;
1340 : : }
1341 : :
1342 : 0 : LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1343 : 0 : LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1344 : 0 : LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;
1345 : :
1346 : 0 : return true;
1347 : : }
1348 : :
1349 : : /* Calculate the cost of one scalar iteration of the loop. */
1350 : : static void
1351 : 281765 : vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1352 : : {
1353 : 281765 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1354 : 281765 : basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1355 : 281765 : int nbbs = loop->num_nodes, factor;
1356 : 281765 : int innerloop_iters, i;
1357 : :
1358 : 281765 : DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1359 : :
1360 : : /* Gather costs for statements in the scalar loop. */
1361 : :
1362 : : /* FORNOW. */
1363 : 281765 : innerloop_iters = 1;
1364 : 281765 : if (loop->inner)
1365 : 1266 : innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1366 : :
1367 : 1003602 : for (i = 0; i < nbbs; i++)
1368 : : {
1369 : 721837 : gimple_stmt_iterator si;
1370 : 721837 : basic_block bb = bbs[i];
1371 : :
1372 : 721837 : if (bb->loop_father == loop->inner)
1373 : : factor = innerloop_iters;
1374 : : else
1375 : 719305 : factor = 1;
1376 : :
1377 : 5755546 : for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1378 : : {
1379 : 4311872 : gimple *stmt = gsi_stmt (si);
1380 : 4311872 : stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1381 : :
1382 : 4311872 : if (!is_gimple_assign (stmt)
1383 : : && !is_gimple_call (stmt)
1384 : : && !is_a<gcond *> (stmt))
1385 : 1568859 : continue;
1386 : :
1387 : : /* Skip stmts that are not vectorized inside the loop. */
1388 : 2743013 : stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1389 : 2743013 : if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1390 : 1165655 : && (!STMT_VINFO_LIVE_P (vstmt_info)
1391 : 59 : || !VECTORIZABLE_CYCLE_DEF
1392 : : (STMT_VINFO_DEF_TYPE (vstmt_info))))
1393 : 1165655 : continue;
1394 : :
1395 : 1577358 : vect_cost_for_stmt kind;
1396 : 1577358 : if (STMT_VINFO_DATA_REF (stmt_info))
1397 : : {
1398 : 668854 : if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1399 : : kind = scalar_load;
1400 : : else
1401 : 236338 : kind = scalar_store;
1402 : : }
1403 : 908504 : else if (vect_nop_conversion_p (stmt_info))
1404 : 40248 : continue;
1405 : : else
1406 : : kind = scalar_stmt;
1407 : :
1408 : : /* We are using vect_prologue here to avoid scaling twice
1409 : : by the inner loop factor. */
1410 : 1537110 : record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1411 : : factor, kind, stmt_info, 0, vect_prologue);
1412 : : }
1413 : : }
1414 : :
1415 : : /* Now accumulate cost. */
1416 : 281765 : loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1417 : 281765 : add_stmt_costs (loop_vinfo->scalar_costs,
1418 : : &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1419 : 281765 : loop_vinfo->scalar_costs->finish_cost (nullptr);
1420 : 281765 : }
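/* Illustrative, self-contained sketch (not the GCC API) of the accumulation
   performed above: each counted scalar statement contributes a per-kind cost,
   scaled by the inner-loop factor when it sits inside the inner loop.  The
   statement kinds and cost values below are hypothetical.  */

enum sketch_stmt_kind { SKETCH_LOAD, SKETCH_STORE, SKETCH_OTHER };

struct sketch_stmt
{
  sketch_stmt_kind kind;
  bool in_inner_loop;
};

static unsigned
sketch_scalar_iteration_cost (const sketch_stmt *stmts, unsigned n,
                              unsigned inner_loop_factor)
{
  unsigned cost = 0;
  for (unsigned i = 0; i < n; ++i)
    {
      /* Memory accesses are assumed to cost more than plain statements.  */
      unsigned kind_cost = stmts[i].kind == SKETCH_OTHER ? 1 : 2;
      unsigned factor = stmts[i].in_inner_loop ? inner_loop_factor : 1;
      cost += kind_cost * factor;
    }
  return cost;
}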
1421 : :
1422 : : /* Function vect_analyze_loop_form.
1423 : :
1424 : : Verify that certain CFG restrictions hold, including:
1425 : : - the loop has a pre-header
1426 : : - the loop has a single entry
1427 : : - nested loops can have only a single exit.
1428 : : - the loop exit condition is simple enough
1429 : : - the number of iterations can be analyzed, i.e., a countable loop. The
1430 : : niter could be analyzed under some assumptions. */
1431 : :
1432 : : opt_result
1433 : 463049 : vect_analyze_loop_form (class loop *loop, gimple *loop_vectorized_call,
1434 : : vect_loop_form_info *info)
1435 : : {
1436 : 463049 : DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1437 : :
1438 : 463049 : edge exit_e = vec_init_loop_exit_info (loop);
1439 : 463049 : if (!exit_e)
1440 : 57795 : return opt_result::failure_at (vect_location,
1441 : : "not vectorized:"
1442 : : " could not determine main exit from"
1443 : : " loop with multiple exits.\n");
1444 : 405254 : if (loop_vectorized_call)
1445 : : {
1446 : 25828 : tree arg = gimple_call_arg (loop_vectorized_call, 1);
1447 : 25828 : class loop *scalar_loop = get_loop (cfun, tree_to_shwi (arg));
1448 : 25828 : edge scalar_exit_e = vec_init_loop_exit_info (scalar_loop);
1449 : 25828 : if (!scalar_exit_e)
1450 : 0 : return opt_result::failure_at (vect_location,
1451 : : "not vectorized:"
1452 : : " could not determine main exit from"
1453 : : " loop with multiple exits.\n");
1454 : : }
1455 : :
1456 : 405254 : info->loop_exit = exit_e;
1457 : 405254 : if (dump_enabled_p ())
1458 : 15135 : dump_printf_loc (MSG_NOTE, vect_location,
1459 : : "using as main loop exit: %d -> %d [AUX: %p]\n",
1460 : 15135 : exit_e->src->index, exit_e->dest->index, exit_e->aux);
1461 : :
1462 : : /* Check if we have any control flow that doesn't leave the loop. */
1463 : 405254 : basic_block *bbs = get_loop_body (loop);
1464 : 1361475 : for (unsigned i = 0; i < loop->num_nodes; i++)
1465 : 1065022 : if (EDGE_COUNT (bbs[i]->succs) != 1
1466 : 1065022 : && (EDGE_COUNT (bbs[i]->succs) != 2
1467 : 628854 : || !loop_exits_from_bb_p (bbs[i]->loop_father, bbs[i])))
1468 : : {
1469 : 108801 : free (bbs);
1470 : 108801 : return opt_result::failure_at (vect_location,
1471 : : "not vectorized:"
1472 : : " unsupported control flow in loop.\n");
1473 : : }
1474 : 296453 : free (bbs);
1475 : :
1476 : : /* Different restrictions apply when we are considering an inner-most loop,
1477 : : vs. an outer (nested) loop.
1478 : : (FORNOW. May want to relax some of these restrictions in the future). */
1479 : :
1480 : 296453 : info->inner_loop_cond = NULL;
1481 : 296453 : if (!loop->inner)
1482 : : {
1483 : : /* Inner-most loop. */
1484 : :
1485 : 274969 : if (empty_block_p (loop->header))
1486 : 3 : return opt_result::failure_at (vect_location,
1487 : : "not vectorized: empty loop.\n");
1488 : : }
1489 : : else
1490 : : {
1491 : 21484 : class loop *innerloop = loop->inner;
1492 : 21484 : edge entryedge;
1493 : :
1494 : : /* Nested loop. We currently require that the loop is doubly-nested,
1495 : : contains a single inner loop with a single exit to the block
1496 : : with the single exit condition in the outer loop.
1497 : : Vectorizable outer-loops look like this:
1498 : :
1499 : : (pre-header)
1500 : : |
1501 : : header <---+
1502 : : | |
1503 : : inner-loop |
1504 : : | |
1505 : : tail ------+
1506 : : |
1507 : : (exit-bb)
1508 : :
1509 : : The inner-loop also has the properties expected of inner-most loops
1510 : : as described above. */
1511 : :
1512 : 21484 : if ((loop->inner)->inner || (loop->inner)->next)
1513 : 3024 : return opt_result::failure_at (vect_location,
1514 : : "not vectorized:"
1515 : : " multiple nested loops.\n");
1516 : :
1517 : 18460 : entryedge = loop_preheader_edge (innerloop);
1518 : 18460 : if (entryedge->src != loop->header
1519 : 18114 : || !single_exit (innerloop)
1520 : 29616 : || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1521 : 7586 : return opt_result::failure_at (vect_location,
1522 : : "not vectorized:"
1523 : : " unsupported outerloop form.\n");
1524 : :
1525 : : /* Analyze the inner-loop. */
1526 : 10874 : vect_loop_form_info inner;
1527 : 10874 : opt_result res = vect_analyze_loop_form (loop->inner, NULL, &inner);
1528 : 10874 : if (!res)
1529 : : {
1530 : 1169 : if (dump_enabled_p ())
1531 : 5 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1532 : : "not vectorized: Bad inner loop.\n");
1533 : 1169 : return res;
1534 : : }
1535 : :
1536 : : /* We don't support analyzing the niter under assumptions for the
1537 : : inner loop. */
1538 : 9705 : if (!integer_onep (inner.assumptions))
1539 : 303 : return opt_result::failure_at (vect_location,
1540 : : "not vectorized: Bad inner loop.\n");
1541 : :
1542 : 9402 : if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1543 : 1084 : return opt_result::failure_at (vect_location,
1544 : : "not vectorized: inner-loop count not"
1545 : : " invariant.\n");
1546 : :
1547 : 8318 : if (dump_enabled_p ())
1548 : 943 : dump_printf_loc (MSG_NOTE, vect_location,
1549 : : "Considering outer-loop vectorization.\n");
1550 : 8318 : info->inner_loop_cond = inner.conds[0];
1551 : 10874 : }
1552 : :
1553 : 283284 : if (EDGE_COUNT (loop->header->preds) != 2)
1554 : 0 : return opt_result::failure_at (vect_location,
1555 : : "not vectorized:"
1556 : : " too many incoming edges.\n");
1557 : :
1558 : : /* We assume that the latch is empty. */
1559 : 283284 : basic_block latch = loop->latch;
1560 : 283284 : do
1561 : : {
1562 : 283284 : if (!empty_block_p (latch)
1563 : 283284 : || !gimple_seq_empty_p (phi_nodes (latch)))
1564 : 19853 : return opt_result::failure_at (vect_location,
1565 : : "not vectorized: latch block not "
1566 : : "empty.\n");
1567 : 263431 : latch = single_pred (latch);
1568 : : }
1569 : 526862 : while (single_succ_p (latch));
1570 : :
1571 : : /* Make sure there is no abnormal exit. */
1572 : 263431 : auto_vec<edge> exits = get_loop_exit_edges (loop);
1573 : 1162820 : for (edge e : exits)
1574 : : {
1575 : 372571 : if (e->flags & EDGE_ABNORMAL)
1576 : 44 : return opt_result::failure_at (vect_location,
1577 : : "not vectorized:"
1578 : : " abnormal loop exit edge.\n");
1579 : : }
1580 : :
1581 : 263387 : info->conds
1582 : 263387 : = vect_get_loop_niters (loop, exit_e, &info->assumptions,
1583 : : &info->number_of_iterations,
1584 : 263387 : &info->number_of_iterationsm1);
1585 : 263387 : if (info->conds.is_empty ())
1586 : 36 : return opt_result::failure_at
1587 : 36 : (vect_location,
1588 : : "not vectorized: complicated exit condition.\n");
1589 : :
1590 : : /* Determine what the primary and alternate exit conds are. */
1591 : 625830 : for (unsigned i = 0; i < info->conds.length (); i++)
1592 : : {
1593 : 362479 : gcond *cond = info->conds[i];
1594 : 362479 : if (exit_e->src == gimple_bb (cond))
1595 : 263351 : std::swap (info->conds[0], info->conds[i]);
1596 : : }
1597 : :
1598 : 263351 : if (integer_zerop (info->assumptions)
1599 : 263351 : || !info->number_of_iterations
1600 : 526702 : || chrec_contains_undetermined (info->number_of_iterations))
1601 : 40840 : return opt_result::failure_at
1602 : 40840 : (info->conds[0],
1603 : : "not vectorized: number of iterations cannot be computed.\n");
1604 : :
1605 : 222511 : if (integer_zerop (info->number_of_iterations))
1606 : 14 : return opt_result::failure_at
1607 : 14 : (info->conds[0],
1608 : : "not vectorized: number of iterations = 0.\n");
1609 : :
1610 : 222497 : if (!(tree_fits_shwi_p (info->number_of_iterations)
1611 : 119922 : && tree_to_shwi (info->number_of_iterations) > 0))
1612 : : {
1613 : 102575 : if (dump_enabled_p ())
1614 : : {
1615 : 2323 : dump_printf_loc (MSG_NOTE, vect_location,
1616 : : "Symbolic number of iterations is ");
1617 : 2323 : dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1618 : 2323 : dump_printf (MSG_NOTE, "\n");
1619 : : }
1620 : : }
1621 : :
1622 : 222497 : if (!integer_onep (info->assumptions))
1623 : : {
1624 : 10820 : if (dump_enabled_p ())
1625 : : {
1626 : 65 : dump_printf_loc (MSG_NOTE, vect_location,
1627 : : "Loop to be versioned with niter assumption ");
1628 : 65 : dump_generic_expr (MSG_NOTE, TDF_SLIM, info->assumptions);
1629 : 65 : dump_printf (MSG_NOTE, "\n");
1630 : : }
1631 : : }
1632 : :
1633 : 222497 : return opt_result::success ();
1634 : 263431 : }
1635 : :
1636 : : /* Create a loop_vec_info for LOOP with SHARED and the
1637 : : vect_analyze_loop_form result. */
1638 : :
1639 : : loop_vec_info
1640 : 411411 : vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1641 : : const vect_loop_form_info *info,
1642 : : loop_vec_info orig_loop_info)
1643 : : {
1644 : 411411 : loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1645 : 411411 : LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1646 : 411411 : LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1647 : 411411 : LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1648 : 411411 : LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_info;
1649 : 411411 : if (orig_loop_info && LOOP_VINFO_EPILOGUE_P (orig_loop_info))
1650 : 166 : LOOP_VINFO_MAIN_LOOP_INFO (loop_vinfo)
1651 : 166 : = LOOP_VINFO_MAIN_LOOP_INFO (orig_loop_info);
1652 : : else
1653 : 411245 : LOOP_VINFO_MAIN_LOOP_INFO (loop_vinfo) = orig_loop_info;
1654 : : /* Also record the assumptions for versioning. */
1655 : 411411 : if (!integer_onep (info->assumptions) && !orig_loop_info)
1656 : 21743 : LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1657 : :
1658 : 1864196 : for (gcond *cond : info->conds)
1659 : : {
1660 : 629963 : stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (cond);
1661 : : /* Mark the statement as a condition. */
1662 : 629963 : STMT_VINFO_DEF_TYPE (loop_cond_info) = vect_condition_def;
1663 : : }
1664 : :
1665 : 629963 : for (unsigned i = 1; i < info->conds.length (); i ++)
1666 : 218552 : LOOP_VINFO_LOOP_CONDS (loop_vinfo).safe_push (info->conds[i]);
1667 : 411411 : LOOP_VINFO_LOOP_IV_COND (loop_vinfo) = info->conds[0];
1668 : :
1669 : 411411 : LOOP_VINFO_IV_EXIT (loop_vinfo) = info->loop_exit;
1670 : :
1671 : : /* Check to see if we're vectorizing multiple exits. */
1672 : 411411 : LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
1673 : 411411 : = !LOOP_VINFO_LOOP_CONDS (loop_vinfo).is_empty ();
1674 : :
1675 : 411411 : if (info->inner_loop_cond)
1676 : : {
1677 : : /* If we have an estimate on the number of iterations of the inner
1678 : : loop use that to limit the scale for costing, otherwise use
1679 : : --param vect-inner-loop-cost-factor literally. */
1680 : 8417 : widest_int nit;
1681 : 8417 : if (estimated_stmt_executions (loop->inner, &nit))
1682 : 7180 : LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1683 : 7180 : = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1684 : 8417 : }
1685 : :
1686 : 411411 : return loop_vinfo;
1687 : : }
1688 : :
1689 : :
1690 : :
1691 : : /* Return true if we know that the iteration count is smaller than the
1692 : : vectorization factor. Return false if it isn't, or if we can't be sure
1693 : : either way. */
1694 : :
1695 : : static bool
1696 : 109485 : vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
1697 : : {
1698 : 109485 : unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1699 : :
1700 : 109485 : HOST_WIDE_INT max_niter;
1701 : 109485 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1702 : 52755 : max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1703 : : else
1704 : 56730 : max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1705 : :
1706 : 109485 : if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1707 : 9847 : return true;
1708 : :
1709 : : return false;
1710 : : }
1711 : :
1712 : : /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1713 : : is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1714 : : definitely no, or -1 if it's worth retrying. */
1715 : :
1716 : : static int
1717 : 109493 : vect_analyze_loop_costing (loop_vec_info loop_vinfo,
1718 : : unsigned *suggested_unroll_factor)
1719 : : {
1720 : 109493 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1721 : 109493 : unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1722 : :
1723 : : /* Only loops that can handle partially-populated vectors can have iteration
1724 : : counts less than the vectorization factor. */
1725 : 109493 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
1726 : 109493 : && vect_known_niters_smaller_than_vf (loop_vinfo))
1727 : : {
1728 : 9837 : if (dump_enabled_p ())
1729 : 232 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1730 : : "not vectorized: iteration count smaller than "
1731 : : "vectorization factor.\n");
1732 : 9837 : return 0;
1733 : : }
1734 : :
1735 : : /* If we know the number of iterations we can do better: for the
1736 : : epilogue we can also decide whether the main loop leaves us
1737 : : with enough iterations, preferring a smaller vector epilogue that
1738 : : is then also possibly used for the case where we skip the vector loop. */
1739 : 99656 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1740 : : {
1741 : 44036 : widest_int scalar_niters
1742 : 44036 : = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
1743 : 44036 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1744 : : {
1745 : 2637 : loop_vec_info orig_loop_vinfo
1746 : : = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1747 : 2637 : loop_vec_info main_loop_vinfo
1748 : : = LOOP_VINFO_MAIN_LOOP_INFO (loop_vinfo);
1749 : 2637 : unsigned lowest_vf
1750 : 2637 : = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
1751 : 2637 : int prolog_peeling = 0;
1752 : 2637 : if (!vect_use_loop_mask_for_alignment_p (main_loop_vinfo))
1753 : 2637 : prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (main_loop_vinfo);
1754 : 2637 : if (prolog_peeling >= 0
1755 : 2637 : && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
1756 : : lowest_vf))
1757 : : {
1758 : 5264 : unsigned gap
1759 : 2632 : = LOOP_VINFO_PEELING_FOR_GAPS (main_loop_vinfo) ? 1 : 0;
1760 : 5264 : scalar_niters = ((scalar_niters - gap - prolog_peeling)
1761 : 5264 : % lowest_vf + gap);
1762 : : }
1763 : : }
1764 : : /* Reject vectorizing for a single scalar iteration, even if
1765 : : we could in principle implement that using partial vectors. */
1766 : 44036 : unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
1767 : 44036 : if (scalar_niters <= peeling_gap + 1)
1768 : : {
1769 : 786 : if (dump_enabled_p ())
1770 : 168 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1771 : : "not vectorized: loop only has a single "
1772 : : "scalar iteration.\n");
1773 : 786 : return 0;
1774 : : }
1775 : :
1776 : 43250 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1777 : : {
1778 : : /* Check that the loop processes at least one full vector. */
1779 : 43239 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1780 : 43239 : if (known_lt (scalar_niters, vf))
1781 : : {
1782 : 361 : if (dump_enabled_p ())
1783 : 293 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1784 : : "loop does not have enough iterations "
1785 : : "to support vectorization.\n");
1786 : 401 : return 0;
1787 : : }
1788 : :
1789 : : /* If we need to peel an extra epilogue iteration to handle data
1790 : : accesses with gaps, check that there are enough scalar iterations
1791 : : available.
1792 : :
1793 : : The check above is redundant with this one when peeling for gaps,
1794 : : but the distinction is useful for diagnostics. */
1795 : 42878 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1796 : 43165 : && known_le (scalar_niters, vf))
1797 : : {
1798 : 40 : if (dump_enabled_p ())
1799 : 9 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1800 : : "loop does not have enough iterations "
1801 : : "to support peeling for gaps.\n");
1802 : 40 : return 0;
1803 : : }
1804 : : }
1805 : 44036 : }
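      /* Worked example of the epilogue adjustment above (illustrative numbers
	 only): with 103 scalar iterations, no peeling for gaps, 3 iterations
	 peeled into the prologue and a main-loop VF of 16, the epilogue
	 analysis sees (103 - 0 - 3) % 16 + 0 = 4 remaining scalar iterations,
	 so a VF-4 (or partial-vector) epilogue can still pass the full-vector
	 check above while a VF-8 epilogue cannot.  */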
1806 : :
1807 : : /* If using the "very cheap" model, reject cases in which we'd keep
1808 : : a copy of the scalar code (even if we might be able to vectorize it). */
1809 : 98469 : if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
1810 : 98469 : && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1811 : 49132 : || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)))
1812 : : {
1813 : 710 : if (dump_enabled_p ())
1814 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1815 : : "some scalar iterations would need to be peeled\n");
1816 : 710 : return 0;
1817 : : }
1818 : :
1819 : 97759 : int min_profitable_iters, min_profitable_estimate;
1820 : 97759 : vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1821 : : &min_profitable_estimate,
1822 : : suggested_unroll_factor);
1823 : :
1824 : 97759 : if (min_profitable_iters < 0)
1825 : : {
1826 : 25385 : if (dump_enabled_p ())
1827 : 22 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1828 : : "not vectorized: vectorization not profitable.\n");
1829 : 25385 : if (dump_enabled_p ())
1830 : 22 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1831 : : "not vectorized: vector version will never be "
1832 : : "profitable.\n");
1833 : 25385 : return -1;
1834 : : }
1835 : :
1836 : 72374 : int min_scalar_loop_bound = (param_min_vect_loop_bound
1837 : 72374 : * assumed_vf);
1838 : :
1839 : : /* Use the cost model only if it is more conservative than user specified
1840 : : threshold. */
1841 : 72374 : unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1842 : : min_profitable_iters);
1843 : :
1844 : 72374 : LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1845 : :
1846 : 36621 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1847 : 108995 : && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1848 : : {
1849 : 399 : if (dump_enabled_p ())
1850 : 1 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1851 : : "not vectorized: vectorization not profitable.\n");
1852 : 399 : if (dump_enabled_p ())
1853 : 1 : dump_printf_loc (MSG_NOTE, vect_location,
1854 : : "not vectorized: iteration count smaller than user "
1855 : : "specified loop bound parameter or minimum profitable "
1856 : : "iterations (whichever is more conservative).\n");
1857 : 399 : return 0;
1858 : : }
1859 : :
1860 : : /* The static profitability threshold min_profitable_estimate includes
1861 : : the cost of having to check at runtime whether the scalar loop
1862 : : should be used instead. If it turns out that we don't need or want
1863 : : such a check, the threshold we should use for the static estimate
1864 : : is simply the point at which the vector loop becomes more profitable
1865 : : than the scalar loop. */
1866 : 71975 : if (min_profitable_estimate > min_profitable_iters
1867 : 15327 : && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1868 : 14865 : && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1869 : 262 : && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1870 : 72237 : && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1871 : : {
1872 : 8 : if (dump_enabled_p ())
1873 : 3 : dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1874 : : " choice between the scalar and vector loops\n");
1875 : 8 : min_profitable_estimate = min_profitable_iters;
1876 : : }
1877 : :
1878 : : /* If the vector loop needs multiple iterations to be beneficial then
1879 : : things are probably too close to call, and the conservative thing
1880 : : would be to stick with the scalar code. */
1881 : 71975 : if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
1882 : 71975 : && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
1883 : : {
1884 : 8299 : if (dump_enabled_p ())
1885 : 177 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1886 : : "one iteration of the vector loop would be"
1887 : : " more expensive than the equivalent number of"
1888 : : " iterations of the scalar loop\n");
1889 : 8299 : return 0;
1890 : : }
1891 : :
1892 : 63676 : HOST_WIDE_INT estimated_niter;
1893 : :
1894 : : /* If we are vectorizing an epilogue then we know the maximum number of
1895 : : scalar iterations it will cover is at least one lower than the
1896 : : vectorization factor of the main loop. */
1897 : 63676 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1898 : 10557 : estimated_niter
1899 : 10557 : = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1900 : : else
1901 : : {
1902 : 53119 : estimated_niter = estimated_stmt_executions_int (loop);
1903 : 53119 : if (estimated_niter == -1)
1904 : 20206 : estimated_niter = likely_max_stmt_executions_int (loop);
1905 : : }
1906 : 30763 : if (estimated_niter != -1
1907 : 62066 : && ((unsigned HOST_WIDE_INT) estimated_niter
1908 : 62066 : < MAX (th, (unsigned) min_profitable_estimate)))
1909 : : {
1910 : 4260 : if (dump_enabled_p ())
1911 : 28 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1912 : : "not vectorized: estimated iteration count too "
1913 : : "small.\n");
1914 : 4260 : if (dump_enabled_p ())
1915 : 28 : dump_printf_loc (MSG_NOTE, vect_location,
1916 : : "not vectorized: estimated iteration count smaller "
1917 : : "than specified loop bound parameter or minimum "
1918 : : "profitable iterations (whichever is more "
1919 : : "conservative).\n");
1920 : 4260 : return -1;
1921 : : }
1922 : :
1923 : : return 1;
1924 : : }
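/* Standalone sketch (not part of GCC) of the final decision made by
   vect_analyze_loop_costing above: once the cost model has produced a
   threshold TH and a static estimate MIN_PROFITABLE_ESTIMATE, the loop is
   only vectorized when its known or estimated iteration count reaches the
   relevant bound.  The return values mirror the function above
   (1 = vectorize, 0 = definitely not, -1 = worth retrying); parameter names
   are hypothetical and -1 stands for an unknown count.  */

static int
sketch_costing_verdict (int min_profitable_iters, int min_profitable_estimate,
                        unsigned th, long known_niter, long estimated_niter)
{
  if (min_profitable_iters < 0)
    return -1;          /* The vector version can never be profitable.  */
  if (known_niter != -1 && (unsigned long) known_niter < th)
    return 0;           /* Known iteration count below the threshold.  */
  unsigned long bound = th;
  if (min_profitable_estimate > 0 && (unsigned) min_profitable_estimate > bound)
    bound = (unsigned) min_profitable_estimate;
  if (estimated_niter != -1 && (unsigned long) estimated_niter < bound)
    return -1;          /* Estimated count too small; perhaps retry.  */
  return 1;
}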
1925 : :
1926 : : static opt_result
1927 : 220067 : vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1928 : : vec<data_reference_p> *datarefs)
1929 : : {
1930 : 674020 : for (unsigned i = 0; i < loop->num_nodes; i++)
1931 : 996902 : for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1932 : 3753816 : !gsi_end_p (gsi); gsi_next (&gsi))
1933 : : {
1934 : 3299863 : gimple *stmt = gsi_stmt (gsi);
1935 : 3299863 : if (is_gimple_debug (stmt))
1936 : 1214301 : continue;
1937 : 2085690 : opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
1938 : : NULL, 0);
1939 : 2085690 : if (!res)
1940 : : {
1941 : 44626 : if (is_gimple_call (stmt) && loop->safelen)
1942 : : {
1943 : 399 : tree fndecl = gimple_call_fndecl (stmt), op;
1944 : 399 : if (fndecl == NULL_TREE
1945 : 399 : && gimple_call_internal_p (stmt, IFN_MASK_CALL))
1946 : : {
1947 : 0 : fndecl = gimple_call_arg (stmt, 0);
1948 : 0 : gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
1949 : 0 : fndecl = TREE_OPERAND (fndecl, 0);
1950 : 0 : gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
1951 : : }
1952 : 399 : if (fndecl != NULL_TREE)
1953 : : {
1954 : 362 : cgraph_node *node = cgraph_node::get (fndecl);
1955 : 362 : if (node != NULL && node->simd_clones != NULL)
1956 : : {
1957 : 129 : unsigned int j, n = gimple_call_num_args (stmt);
1958 : 539 : for (j = 0; j < n; j++)
1959 : : {
1960 : 282 : op = gimple_call_arg (stmt, j);
1961 : 282 : if (DECL_P (op)
1962 : 282 : || (REFERENCE_CLASS_P (op)
1963 : 0 : && get_base_address (op)))
1964 : : break;
1965 : : }
1966 : 129 : op = gimple_call_lhs (stmt);
1967 : : /* Ignore #pragma omp declare simd functions
1968 : : if they don't have data references in the
1969 : : call stmt itself. */
1970 : 257 : if (j == n
1971 : 129 : && !(op
1972 : 118 : && (DECL_P (op)
1973 : 118 : || (REFERENCE_CLASS_P (op)
1974 : 0 : && get_base_address (op)))))
1975 : 128 : continue;
1976 : : }
1977 : : }
1978 : : }
1979 : 44498 : return res;
1980 : : }
1981 : : /* If dependence analysis will give up due to the limit on the
1982 : : number of datarefs stop here and fail fatally. */
1983 : 3581377 : if (datarefs->length ()
1984 : 1540313 : > (unsigned)param_loop_max_datarefs_for_datadeps)
1985 : 0 : return opt_result::failure_at (stmt, "exceeded param "
1986 : : "loop-max-datarefs-for-datadeps\n");
1987 : : }
1988 : 175569 : return opt_result::success ();
1989 : : }
1990 : :
1991 : : /* Determine if operating on full vectors for LOOP_VINFO might leave
1992 : : some scalar iterations still to do. If so, decide how we should
1993 : : handle those scalar iterations. The possibilities are:
1994 : :
1995 : : (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
1996 : : In this case:
1997 : :
1998 : : LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
1999 : : LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2000 : : LOOP_VINFO_PEELING_FOR_NITER == false
2001 : :
2002 : : (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2003 : : to handle the remaining scalar iterations. In this case:
2004 : :
2005 : : LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2006 : : LOOP_VINFO_PEELING_FOR_NITER == true
2007 : :
2008 : : There are two choices:
2009 : :
2010 : : (2a) Consider vectorizing the epilogue loop at the same VF as the
2011 : : main loop, but using partial vectors instead of full vectors.
2012 : : In this case:
2013 : :
2014 : : LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2015 : :
2016 : : (2b) Consider vectorizing the epilogue loop at lower VFs only.
2017 : : In this case:
2018 : :
2019 : : LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2020 : : */
2021 : :
2022 : : opt_result
2023 : 117060 : vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo)
2024 : : {
2025 : : /* Determine whether there would be any scalar iterations left over. */
2026 : 117060 : bool need_peeling_or_partial_vectors_p
2027 : 117060 : = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2028 : :
2029 : : /* Decide whether to vectorize the loop with partial vectors. */
2030 : 117060 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2031 : 117060 : LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2032 : 117060 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2033 : 37 : && LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo))
2034 : 0 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2035 : 117060 : else if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2036 : 37 : && need_peeling_or_partial_vectors_p)
2037 : : {
2038 : : /* For partial-vector-usage=1, try to push the handling of partial
2039 : : vectors to the epilogue, with the main loop continuing to operate
2040 : : on full vectors.
2041 : :
2042 : : If we are unrolling we also do not want to use partial vectors. This
2043 : : is to avoid the overhead of generating multiple masks and also to
2044 : : avoid having to execute entire iterations of FALSE masked instructions
2045 : : when dealing with one or fewer full iterations.
2046 : :
2047 : : ??? We could then end up failing to use partial vectors if we
2048 : : decide to peel iterations into a prologue, and if the main loop
2049 : : then ends up processing fewer than VF iterations. */
2050 : 32 : if ((param_vect_partial_vector_usage == 1
2051 : 10 : || loop_vinfo->suggested_unroll_factor > 1)
2052 : 22 : && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2053 : 46 : && !vect_known_niters_smaller_than_vf (loop_vinfo))
2054 : 4 : LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2055 : : else
2056 : 28 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2057 : : }
2058 : :
2059 : 117060 : if (LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo)
2060 : 0 : && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2061 : 0 : return opt_result::failure_at (vect_location,
2062 : : "not vectorized: loop needs but cannot "
2063 : : "use partial vectors\n");
2064 : :
2065 : 117060 : if (dump_enabled_p ())
2066 : 12668 : dump_printf_loc (MSG_NOTE, vect_location,
2067 : : "operating on %s vectors%s.\n",
2068 : 12668 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2069 : : ? "partial" : "full",
2070 : 12668 : LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2071 : : ? " for epilogue loop" : "");
2072 : :
2073 : 117060 : LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2074 : 234120 : = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2075 : 117060 : && need_peeling_or_partial_vectors_p);
2076 : :
2077 : : /* We set LOOP_VINFO_USING_SELECT_VL_P as true before loop vectorization
2078 : : analysis that we don't know whether the loop is vectorized by partial
2079 : : vectors (More details see tree-vect-loop-manip.cc).
2080 : :
2081 : : However, SELECT_VL vectorizaton style should only applied on partial
2082 : : vectorization since SELECT_VL is the GIMPLE IR that calculates the
2083 : : number of elements to be process for each iteration.
2084 : :
2085 : : After loop vectorization analysis, Clear LOOP_VINFO_USING_SELECT_VL_P
2086 : : if it is not partial vectorized loop. */
2087 : 117060 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2088 : 117032 : LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
2089 : :
2090 : 117060 : return opt_result::success ();
2091 : : }
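/* Standalone sketch (not part of GCC) of the decision made above: given
   whether partial vectors are possible or required and whether leftover
   scalar iterations are expected, choose between operating on partial
   vectors (case (1) in the comment above), peeling an epilogue that itself
   uses partial vectors (case (2a)), or peeling a conventional epilogue
   (case (2b)).  PARAM_USAGE models --param vect-partial-vector-usage and the
   flag names are hypothetical.  */

struct sketch_pv_decision
{
  bool use_partial_vectors;
  bool epilogue_uses_partial_vectors;
  bool peel_for_niter;
};

static sketch_pv_decision
sketch_partial_vectors_and_peeling (bool can_use_partial, bool must_use_partial,
                                    bool need_peeling_or_partial,
                                    int param_usage, bool unrolling,
                                    bool is_epilogue, bool niters_below_vf)
{
  sketch_pv_decision d = { false, false, false };
  if (can_use_partial && must_use_partial)
    d.use_partial_vectors = true;
  else if (can_use_partial && need_peeling_or_partial)
    {
      if ((param_usage == 1 || unrolling) && !is_epilogue && !niters_below_vf)
        d.epilogue_uses_partial_vectors = true;   /* Case (2a).  */
      else
        d.use_partial_vectors = true;             /* Case (1).  */
    }
  /* Full vectors plus leftover iterations means a scalar epilogue.  */
  d.peel_for_niter = !d.use_partial_vectors && need_peeling_or_partial;
  return d;
}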
2092 : :
2093 : : /* Function vect_analyze_loop_2.
2094 : :
2095 : : Apply a set of analyses on the loop specified by LOOP_VINFO; the different
2096 : : analyses record information in various members of LOOP_VINFO. FATAL
2097 : : indicates whether some analysis hit a fatal error. If the pointer
2098 : : SUGGESTED_UNROLL_FACTOR is non-NULL, it is intended to be filled with the
2099 : : suggested unroll factor that has been worked out, while a NULL pointer
2100 : : indicates that the previously suggested unroll factor is being applied.
2101 : : SINGLE_LANE_SLP_DONE_FOR_SUGGESTED_UF holds whether single-lane
2102 : : SLP was forced when the suggested unroll factor was worked out. */
2103 : : static opt_result
2104 : 410711 : vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2105 : : unsigned *suggested_unroll_factor,
2106 : : bool& single_lane_slp_done_for_suggested_uf)
2107 : : {
2108 : 410711 : opt_result ok = opt_result::success ();
2109 : 410711 : int res;
2110 : 410711 : unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2111 : 410711 : loop_vec_info orig_loop_vinfo = NULL;
2112 : :
2113 : : /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2114 : : loop_vec_info of the first vectorized loop. */
2115 : 410711 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2116 : 17508 : orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2117 : : else
2118 : : orig_loop_vinfo = loop_vinfo;
2119 : 17508 : gcc_assert (orig_loop_vinfo);
2120 : :
2121 : : /* The first group of checks is independent of the vector size. */
2122 : 410711 : fatal = true;
2123 : :
2124 : 410711 : if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2125 : 410711 : && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2126 : 5 : return opt_result::failure_at (vect_location,
2127 : : "not vectorized: simd if(0)\n");
2128 : :
2129 : : /* Find all data references in the loop (which correspond to vdefs/vuses)
2130 : : and analyze their evolution in the loop. */
2131 : :
2132 : 410706 : loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2133 : :
2134 : : /* Gather the data references and count stmts in the loop. */
2135 : 410706 : if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2136 : : {
2137 : 220067 : opt_result res
2138 : 220067 : = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2139 : : &LOOP_VINFO_DATAREFS (loop_vinfo));
2140 : 220067 : if (!res)
2141 : : {
2142 : 44498 : if (dump_enabled_p ())
2143 : 1465 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2144 : : "not vectorized: loop contains function "
2145 : : "calls or data references that cannot "
2146 : : "be analyzed\n");
2147 : 44498 : return res;
2148 : : }
2149 : 175569 : loop_vinfo->shared->save_datarefs ();
2150 : : }
2151 : : else
2152 : 190639 : loop_vinfo->shared->check_datarefs ();
2153 : :
2154 : : /* Analyze the data references and also adjust the minimal
2155 : : vectorization factor according to the loads and stores. */
2156 : :
2157 : 366208 : ok = vect_analyze_data_refs (loop_vinfo, &fatal);
2158 : 366208 : if (!ok)
2159 : : {
2160 : 49819 : if (dump_enabled_p ())
2161 : 972 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2162 : : "bad data references.\n");
2163 : 49819 : return ok;
2164 : : }
2165 : :
2166 : : /* Check if we are applying unroll factor now. */
2167 : 316389 : bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2168 : 316389 : gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2169 : :
2170 : : /* When single-lane SLP was forced and we are applying suggested unroll
2171 : : factor, keep that decision here. */
2172 : 632778 : bool force_single_lane = (applying_suggested_uf
2173 : 316389 : && single_lane_slp_done_for_suggested_uf);
2174 : :
2175 : : /* Classify all cross-iteration scalar data-flow cycles.
2176 : : Cross-iteration cycles caused by virtual phis are analyzed separately. */
2177 : 316389 : vect_analyze_scalar_cycles (loop_vinfo);
2178 : :
2179 : 316389 : vect_pattern_recog (loop_vinfo);
2180 : :
2181 : : /* Analyze the access patterns of the data-refs in the loop (consecutive,
2182 : : complex, etc.). FORNOW: Only handle consecutive access pattern. */
2183 : :
2184 : 316389 : ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2185 : 316389 : if (!ok)
2186 : : {
2187 : 6767 : if (dump_enabled_p ())
2188 : 262 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2189 : : "bad data access.\n");
2190 : 6767 : return ok;
2191 : : }
2192 : :
2193 : : /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2194 : :
2195 : 309622 : ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2196 : 309622 : if (!ok)
2197 : : {
2198 : 13531 : if (dump_enabled_p ())
2199 : 304 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2200 : : "unexpected pattern.\n");
2201 : 13531 : return ok;
2202 : : }
2203 : :
2204 : : /* While the rest of the analysis below depends on it in some way. */
2205 : 296091 : fatal = false;
2206 : :
2207 : : /* Analyze data dependences between the data-refs in the loop
2208 : : and adjust the maximum vectorization factor according to
2209 : : the dependences.
2210 : : FORNOW: fail at the first data dependence that we encounter. */
2211 : :
2212 : 296091 : ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2213 : 296091 : if (!ok)
2214 : : {
2215 : 14326 : if (dump_enabled_p ())
2216 : 372 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2217 : : "bad data dependence.\n");
2218 : 14326 : return ok;
2219 : : }
2220 : 281765 : LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2221 : :
2222 : : /* Compute the scalar iteration cost. */
2223 : 281765 : vect_compute_single_scalar_iteration_cost (loop_vinfo);
2224 : :
2225 : 281765 : bool saved_can_use_partial_vectors_p
2226 : : = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2227 : :
2228 : : /* This is the point where we can re-start analysis with single-lane
2229 : : SLP forced. */
2230 : 401909 : start_over:
2231 : :
2232 : : /* Check the SLP opportunities in the loop, analyze and build
2233 : : SLP trees. */
2234 : 803818 : ok = vect_analyze_slp (loop_vinfo, loop_vinfo->stmt_vec_infos.length (),
2235 : : force_single_lane);
2236 : 401909 : if (!ok)
2237 : 26498 : return ok;
2238 : :
2239 : : /* If there are any SLP instances mark them as pure_slp. */
2240 : 375411 : if (!vect_make_slp_decision (loop_vinfo))
2241 : 38460 : return opt_result::failure_at (vect_location, "no stmts to vectorize.\n");
2242 : :
2243 : 336951 : if (dump_enabled_p ())
2244 : 17667 : dump_printf_loc (MSG_NOTE, vect_location, "Loop contains only SLP stmts\n");
2245 : :
2246 : : /* Determine the vectorization factor from the SLP decision. */
2247 : 336951 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2248 : 336951 : = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
2249 : 336951 : if (dump_enabled_p ())
2250 : : {
2251 : 17667 : dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
2252 : 17667 : dump_dec (MSG_NOTE, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
2253 : 17667 : dump_printf (MSG_NOTE, "\n");
2254 : : }
2255 : :
2256 : : /* Optimize the SLP graph with the vectorization factor fixed. */
2257 : 336951 : vect_optimize_slp (loop_vinfo);
2258 : :
2259 : : /* Gather the loads reachable from the SLP graph entries. */
2260 : 336951 : vect_gather_slp_loads (loop_vinfo);
2261 : :
2262 : : /* We don't expect to have to roll back to anything other than an empty
2263 : : set of rgroups. */
2264 : 336951 : gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2265 : :
2266 : : /* Apply the suggested unrolling factor; this was determined by the backend
2267 : : during finish_cost the first time we ran the analysis for this
2268 : : vector mode. */
2269 : 336951 : if (applying_suggested_uf)
2270 : 240 : LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2271 : :
2272 : : /* Now the vectorization factor is final. */
2273 : 336951 : poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2274 : 336951 : gcc_assert (known_ne (vectorization_factor, 0U));
2275 : :
2276 : 336951 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2277 : : {
2278 : 13478 : dump_printf_loc (MSG_NOTE, vect_location,
2279 : : "vectorization_factor = ");
2280 : 13478 : dump_dec (MSG_NOTE, vectorization_factor);
2281 : 13478 : dump_printf (MSG_NOTE, ", niters = %wd\n",
2282 : 13478 : LOOP_VINFO_INT_NITERS (loop_vinfo));
2283 : : }
2284 : :
2285 : 336951 : if (max_vf != MAX_VECTORIZATION_FACTOR
2286 : 336951 : && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2287 : 41 : return opt_result::failure_at (vect_location, "bad data dependence.\n");
2288 : :
2289 : 336910 : loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2290 : :
2291 : : /* Analyze the alignment of the data-refs in the loop.
2292 : : Fail if a data reference is found that cannot be vectorized. */
2293 : :
2294 : 336910 : ok = vect_analyze_data_refs_alignment (loop_vinfo);
2295 : 336910 : if (!ok)
2296 : : {
2297 : 0 : if (dump_enabled_p ())
2298 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2299 : : "bad data alignment.\n");
2300 : 0 : return ok;
2301 : : }
2302 : :
2303 : : /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2304 : : It is important to call pruning after vect_analyze_data_ref_accesses,
2305 : : since we use grouping information gathered by interleaving analysis. */
2306 : 336910 : ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2307 : 336910 : if (!ok)
2308 : 16256 : return ok;
2309 : :
2310 : : /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2311 : : vectorization, since we do not want to add extra peeling or
2312 : : add versioning for alignment. */
2313 : 320654 : if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2314 : : /* This pass will decide on using loop versioning and/or loop peeling in
2315 : : order to enhance the alignment of data references in the loop. */
2316 : 306114 : ok = vect_enhance_data_refs_alignment (loop_vinfo);
2317 : 320654 : if (!ok)
2318 : 0 : return ok;
2319 : :
2320 : : /* Analyze operations in the SLP instances. We can't simply
2321 : : remove unsupported SLP instances as this makes the above
2322 : : SLP kind detection invalid and might also affect the VF. */
2323 : 320654 : if (! vect_slp_analyze_operations (loop_vinfo))
2324 : : {
2325 : 210360 : ok = opt_result::failure_at (vect_location,
2326 : : "unsupported SLP instances\n");
2327 : 210360 : goto again;
2328 : : }
2329 : :
2330 : : /* For now, we don't expect to mix both masking and length approaches for one
2331 : : loop; disable it if both are recorded. */
2332 : 110294 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2333 : 31 : && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2334 : 110325 : && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2335 : : {
2336 : 0 : if (dump_enabled_p ())
2337 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2338 : : "can't vectorize a loop with partial vectors"
2339 : : " because we don't expect to mix different"
2340 : : " approaches with partial vectors for the"
2341 : : " same loop.\n");
2342 : 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2343 : : }
2344 : :
2345 : : /* If we still have the option of using partial vectors,
2346 : : check whether we can generate the necessary loop controls. */
2347 : 110294 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
2348 : : {
2349 : 31 : if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
2350 : : {
2351 : 31 : if (!vect_verify_full_masking (loop_vinfo)
2352 : 31 : && !vect_verify_full_masking_avx512 (loop_vinfo))
2353 : 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2354 : : }
2355 : : else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
2356 : 0 : if (!vect_verify_loop_lens (loop_vinfo))
2357 : 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2358 : : }
2359 : :
2360 : : /* If we're vectorizing a loop that uses length "controls" and
2361 : : can iterate more than once, we apply the decrementing IV approach
2362 : : in the loop control. */
2363 : 110294 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2364 : 31 : && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
2365 : 0 : && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
2366 : 110294 : && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2367 : 0 : && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
2368 : : LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
2369 : 0 : LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
2370 : :
2371 : : /* If a loop uses length controls and has a decrementing loop control IV,
2372 : : we will normally pass that IV through a MIN_EXPR to calculate the
2373 : : basis for the length controls. E.g. in a loop that processes one
2374 : : element per scalar iteration, the number of elements would be
2375 : : MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
2376 : :
2377 : : This MIN_EXPR approach allows us to use pointer IVs with an invariant
2378 : : step, since only the final iteration of the vector loop can have
2379 : : inactive lanes.
2380 : :
2381 : : However, some targets have a dedicated instruction for calculating the
2382 : : preferred length, given the total number of elements that still need to
2383 : : be processed. This is encapsulated in the SELECT_VL internal function.
2384 : :
2385 : : If the target supports SELECT_VL, we can use it instead of MIN_EXPR
2386 : : to determine the basis for the length controls. However, unlike the
2387 : : MIN_EXPR calculation, the SELECT_VL calculation can decide to make
2388 : : lanes inactive in any iteration of the vector loop, not just the last
2389 : : iteration. This SELECT_VL approach therefore requires us to use pointer
2390 : : IVs with variable steps.
2391 : :
2392 : : Once we've decided how many elements should be processed by one
2393 : : iteration of the vector loop, we need to populate the rgroup controls.
2394 : : If a loop has multiple rgroups, we need to make sure that those rgroups
2395 : : "line up" (that is, they must be consistent about which elements are
2396 : : active and which aren't). This is done by vect_adjust_loop_lens_control.
2397 : :
2398 : : In principle, it would be possible to use vect_adjust_loop_lens_control
2399 : : on either the result of a MIN_EXPR or the result of a SELECT_VL.
2400 : : However:
2401 : :
2402 : : (1) In practice, it only makes sense to use SELECT_VL when a vector
2403 : : operation will be controlled directly by the result. It is not
2404 : : worth using SELECT_VL if it would only be the input to other
2405 : : calculations.
2406 : :
2407 : : (2) If we use SELECT_VL for an rgroup that has N controls, each associated
2408 : : pointer IV will need N updates by a variable amount (N-1 updates
2409 : : within the iteration and 1 update to move to the next iteration).
2410 : :
2411 : : Because of this, we prefer to use the MIN_EXPR approach whenever there
2412 : : is more than one length control.
2413 : :
2414 : : In addition, SELECT_VL always operates to a granularity of 1 unit.
2415 : : If we wanted to use it to control an SLP operation on N consecutive
2416 : : elements, we would need to make the SELECT_VL inputs measure scalar
2417 : : iterations (rather than elements) and then multiply the SELECT_VL
2418 : : result by N. But using SELECT_VL this way is inefficient because
2419 : : of (1) above.
2420 : :
2421 : : 2. We don't apply SELECT_VL to a single rgroup when both (1) and (2)
2422 : : below are satisfied:
2423 : :
2424 : : (1) LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
2425 : : (2) LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
2426 : :
2427 : : This is because SELECT_VL (with its variable step) makes SCEV analysis fail,
2428 : : and we would then lose the benefit of subsequent unroll optimizations. We
2429 : : prefer the MIN_EXPR approach in this situation. */
2430 : 110294 : if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
2431 : : {
2432 : 0 : tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
2433 : 0 : if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
2434 : : OPTIMIZE_FOR_SPEED)
2435 : 0 : && LOOP_VINFO_LENS (loop_vinfo).length () == 1
2436 : 0 : && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1
2437 : 0 : && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2438 : : || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
2439 : 0 : LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
2440 : :
2441 : : /* If any of the SLP instances cover more than a single lane
2442 : : we cannot use .SELECT_VL at the moment, even if the number
2443 : : of lanes is uniform throughout the SLP graph. */
2444 : 0 : if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
2445 : 0 : for (slp_instance inst : LOOP_VINFO_SLP_INSTANCES (loop_vinfo))
2446 : 0 : if (SLP_TREE_LANES (SLP_INSTANCE_TREE (inst)) != 1
2447 : 0 : && !(SLP_INSTANCE_KIND (inst) == slp_inst_kind_store
2448 : 0 : && SLP_INSTANCE_TREE (inst)->ldst_lanes))
2449 : : {
2450 : 0 : LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
2451 : 0 : break;
2452 : : }
2453 : : }
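  /* Illustration of the MIN_EXPR / SELECT_VL distinction described above
     (example numbers only): with VF == 4 and 10 scalar iterations left, the
     MIN_EXPR scheme yields lengths 4, 4, 2 -- only the final iteration is
     partial, so pointer IVs can keep an invariant step -- whereas SELECT_VL
     is free to return, say, 4, 3, 3, so any iteration may be partial and the
     pointer IVs must instead be bumped by the variable length.  */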
2454 : :
2455 : : /* Decide whether this loop_vinfo should use partial vectors or peeling,
2456 : : assuming that the loop will be used as a main loop. We will redo
2457 : : this analysis later if we instead decide to use the loop as an
2458 : : epilogue loop. */
2459 : 110294 : ok = vect_determine_partial_vectors_and_peeling (loop_vinfo);
2460 : 110294 : if (!ok)
2461 : 0 : return ok;
2462 : :
2463 : : /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2464 : : to be able to handle fewer than VF scalars, or needs to have a lower VF
2465 : : than the main loop. */
2466 : 110294 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2467 : 12289 : && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2468 : : {
2469 : 12283 : poly_uint64 unscaled_vf
2470 : 12283 : = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
2471 : : orig_loop_vinfo->suggested_unroll_factor);
2472 : 12283 : if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
2473 : 277 : return opt_result::failure_at (vect_location,
2474 : : "Vectorization factor too high for"
2475 : : " epilogue loop.\n");
2476 : : }
2477 : :
2478 : : /* If the epilogue needs peeling for gaps but the main loop doesn't,
2479 : : give up on the epilogue. */
2480 : 110017 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2481 : 12012 : && LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2482 : 58 : && (LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo)
2483 : : != LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)))
2484 : 4 : return opt_result::failure_at (vect_location,
2485 : : "Epilogue loop requires peeling for gaps "
2486 : : "but main loop does not.\n");
2487 : :
2488 : : /* If an epilogue loop is required, make sure we can create one. */
2489 : 110013 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2490 : 108807 : || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2491 : 32624 : || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
2492 : : {
2493 : 78421 : if (dump_enabled_p ())
2494 : 4955 : dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2495 : 78421 : if (!vect_can_advance_ivs_p (loop_vinfo)
2496 : 156322 : || !slpeel_can_duplicate_loop_p (loop,
2497 : : LOOP_VINFO_IV_EXIT (loop_vinfo),
2498 : 77901 : LOOP_VINFO_IV_EXIT (loop_vinfo)))
2499 : : {
2500 : 520 : ok = opt_result::failure_at (vect_location,
2501 : : "not vectorized: can't create required "
2502 : : "epilog loop\n");
2503 : 520 : goto again;
2504 : : }
2505 : : }
2506 : :
2507 : : /* Check the costings of the loop make vectorizing worthwhile. */
2508 : 109493 : res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
2509 : 109493 : if (res < 0)
2510 : : {
2511 : 29645 : ok = opt_result::failure_at (vect_location,
2512 : : "Loop costings may not be worthwhile.\n");
2513 : 29645 : goto again;
2514 : : }
2515 : 79848 : if (!res)
2516 : 20432 : return opt_result::failure_at (vect_location,
2517 : : "Loop costings not worthwhile.\n");
2518 : :
2519 : : /* During peeling, we need to check that the number of loop iterations is
2520 : : enough for both the peeled prolog loop and the vector loop. This check
2521 : : can be merged with the threshold check of loop versioning, so
2522 : : increase the threshold for this case if necessary.
2523 : :
2524 : : If we are analyzing an epilogue we still want to check what its
2525 : : versioning threshold would be. If we decide to vectorize the epilogues we
2526 : : will want to use the lowest versioning threshold of all epilogues and main
2527 : : loop. This will enable us to enter a vectorized epilogue even when
2528 : : versioning the loop. We can't simply check whether the epilogue requires
2529 : : versioning though since we may have skipped some versioning checks when
2530 : : analyzing the epilogue. For instance, checks for alias versioning will be
2531 : : skipped when dealing with epilogues as we assume we already checked them
2532 : : for the main loop. So instead we always check the 'orig_loop_vinfo'. */
2533 : 59416 : if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2534 : : {
2535 : 5610 : poly_uint64 niters_th = 0;
2536 : 5610 : unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2537 : :
2538 : 5610 : if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2539 : : {
2540 : : /* Niters for peeled prolog loop. */
2541 : 5610 : if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2542 : : {
2543 : 125 : dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2544 : 125 : tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2545 : 125 : niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2546 : : }
2547 : : else
2548 : 5485 : niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2549 : : }
2550 : :
2551 : : /* Niters for at least one iteration of vectorized loop. */
2552 : 5610 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2553 : 5606 : niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2554 : : /* One additional iteration because of peeling for gap. */
2555 : 5610 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2556 : 55 : niters_th += 1;
2557 : :
2558 : : /* Use the same condition as vect_transform_loop to decide when to use
2559 : : the cost to determine a versioning threshold. */
2560 : 5610 : if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2561 : 5610 : && ordered_p (th, niters_th))
2562 : 3821 : niters_th = ordered_max (poly_uint64 (th), niters_th);
2563 : :
2564 : 5610 : LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2565 : : }
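      /* Worked example of the versioning threshold computed above
	 (illustrative numbers only): with 3 iterations peeled for alignment,
	 VF == 8 and one extra iteration peeled for gaps,
	 niters_th = 3 + 8 + 1 = 12; if the runtime profitability check is
	 also needed and its threshold th is larger, the versioning threshold
	 becomes th instead.  */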
2566 : :
2567 : 59416 : gcc_assert (known_eq (vectorization_factor,
2568 : : LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2569 : :
2570 : 59416 : single_lane_slp_done_for_suggested_uf = force_single_lane;
2571 : :
2572 : : /* Ok to vectorize! */
2573 : 59416 : LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2574 : 59416 : return opt_result::success ();
2575 : :
2576 : 240525 : again:
2577 : : /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2578 : 240525 : gcc_assert (!ok);
2579 : :
2580 : : /* Try again with single-lane SLP. */
2581 : 240525 : if (force_single_lane)
2582 : 119316 : return ok;
2583 : :
2584 : : /* If we are applying suggested unroll factor, we don't need to
2585 : : re-try any more as we want to keep the SLP mode fixed. */
2586 : 121209 : if (applying_suggested_uf)
2587 : 6 : return ok;
2588 : :
2589 : : /* Likewise if the grouped loads or stores in the SLP cannot be handled
2590 : : via interleaving or lane instructions. */
2591 : : slp_instance instance;
2592 : : slp_tree node;
2593 : : unsigned i, j;
2594 : 477668 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2595 : : {
2596 : 357524 : if (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance)) != vect_internal_def)
2597 : 0 : continue;
2598 : :
2599 : 357524 : stmt_vec_info vinfo;
2600 : 357524 : vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2601 : 357524 : if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2602 : 354911 : continue;
2603 : 2613 : vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2604 : 2613 : unsigned int size = DR_GROUP_SIZE (vinfo);
2605 : 2613 : tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
2606 : 2613 : if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST
2607 : 4540 : && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2608 : 5218 : && ! vect_grouped_store_supported (vectype, size))
2609 : 678 : return opt_result::failure_at (vinfo->stmt,
2610 : : "unsupported grouped store\n");
2611 : 359782 : FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2612 : : {
2613 : 2100 : vinfo = SLP_TREE_REPRESENTATIVE (node);
2614 : 2100 : if (STMT_VINFO_GROUPED_ACCESS (vinfo))
2615 : : {
2616 : 1815 : vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2617 : 1815 : bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2618 : 1815 : size = DR_GROUP_SIZE (vinfo);
2619 : 1815 : vectype = SLP_TREE_VECTYPE (node);
2620 : 1815 : if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
2621 : 1815 : && ! vect_grouped_load_supported (vectype, single_element_p,
2622 : : size))
2623 : 381 : return opt_result::failure_at (vinfo->stmt,
2624 : : "unsupported grouped load\n");
2625 : : }
2626 : : }
2627 : : }
2628 : :
2629 : : /* Roll back state appropriately. Force single-lane SLP this time. */
2630 : 120144 : force_single_lane = true;
2631 : 120144 : if (dump_enabled_p ())
2632 : 3253 : dump_printf_loc (MSG_NOTE, vect_location,
2633 : : "re-trying with single-lane SLP\n");
2634 : :
2635 : : /* Reset the vectorization factor. */
2636 : 120144 : LOOP_VINFO_VECT_FACTOR (loop_vinfo) = 0;
2637 : : /* Free the SLP instances. */
2638 : 476593 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2639 : 356449 : vect_free_slp_instance (instance);
2640 : 120144 : LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2641 : : /* Reset SLP type to loop_vect on all stmts. */
2642 : 466968 : for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2643 : : {
2644 : 346824 : basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2645 : 346824 : for (gimple_stmt_iterator si = gsi_start_phis (bb);
2646 : 616990 : !gsi_end_p (si); gsi_next (&si))
2647 : : {
2648 : 270166 : stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2649 : 270166 : STMT_SLP_TYPE (stmt_info) = not_vect;
2650 : 270166 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2651 : 270166 : || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2652 : : {
2653 : : /* vectorizable_reduction adjusts reduction stmt def-types;
2654 : : restore them to that of the PHI. */
2655 : 16466 : STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2656 : 16466 : = STMT_VINFO_DEF_TYPE (stmt_info);
2657 : 16466 : STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2658 : : (STMT_VINFO_REDUC_DEF (stmt_info)))
2659 : 16466 : = STMT_VINFO_DEF_TYPE (stmt_info);
2660 : : }
2661 : : }
2662 : 693648 : for (gimple_stmt_iterator si = gsi_start_bb (bb);
2663 : 2088410 : !gsi_end_p (si); gsi_next (&si))
2664 : : {
2665 : 1741586 : if (is_gimple_debug (gsi_stmt (si)))
2666 : 633229 : continue;
2667 : 1108357 : stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2668 : 1108357 : STMT_SLP_TYPE (stmt_info) = not_vect;
2669 : 1108357 : if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2670 : : {
2671 : 209872 : stmt_vec_info pattern_stmt_info
2672 : : = STMT_VINFO_RELATED_STMT (stmt_info);
2673 : 209872 : if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
2674 : 0 : STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
2675 : :
2676 : 209872 : gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2677 : 209872 : STMT_SLP_TYPE (pattern_stmt_info) = not_vect;
2678 : 209872 : for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2679 : 426625 : !gsi_end_p (pi); gsi_next (&pi))
2680 : 216753 : STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2681 : 216753 : = not_vect;
2682 : : }
2683 : : }
2684 : : }
2685 : : /* Free optimized alias test DDRS. */
2686 : 120144 : LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2687 : 120144 : LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2688 : 120144 : LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2689 : : /* Reset target cost data. */
2690 : 120144 : delete loop_vinfo->vector_costs;
2691 : 120144 : loop_vinfo->vector_costs = nullptr;
2692 : : /* Reset accumulated rgroup information. */
2693 : 120144 : LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
2694 : 120144 : release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
2695 : 120144 : release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
2696 : : /* Reset assorted flags. */
2697 : 120144 : LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2698 : 120144 : LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2699 : 120144 : LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2700 : 120144 : LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2701 : 120144 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2702 : 120144 : = saved_can_use_partial_vectors_p;
2703 : 120144 : LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2704 : 120144 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2705 : 120144 : if (loop_vinfo->scan_map)
2706 : 122 : loop_vinfo->scan_map->empty ();
2707 : :
2708 : 120144 : goto start_over;
2709 : : }
2710 : :
2711 : : /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2712 : : to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2713 : : OLD_LOOP_VINFO is better unless something specifically indicates
2714 : : otherwise.
2715 : :
2716 : : Note that this deliberately isn't a partial order. */
2717 : :
2718 : : static bool
2719 : 0 : vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2720 : : loop_vec_info old_loop_vinfo)
2721 : : {
2722 : 0 : struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2723 : 0 : gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2724 : :
2725 : 0 : poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2726 : 0 : poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2727 : :
2728 : : /* Always prefer a VF of loop->simdlen over any other VF. */
2729 : 0 : if (loop->simdlen)
2730 : : {
2731 : 0 : bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2732 : 0 : bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2733 : 0 : if (new_simdlen_p != old_simdlen_p)
2734 : : return new_simdlen_p;
2735 : : }
2736 : :
2737 : 0 : const auto *old_costs = old_loop_vinfo->vector_costs;
2738 : 0 : const auto *new_costs = new_loop_vinfo->vector_costs;
2739 : 0 : if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
2740 : 0 : return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
2741 : :
2742 : 0 : return new_costs->better_main_loop_than_p (old_costs);
2743 : : }
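     : : /* Editorial illustration (not part of the GCC sources): the simdlen
     : :    preference above matters for loops annotated with an OpenMP simdlen
     : :    clause, e.g.
     : :
     : :      #pragma omp simd simdlen(8)
     : :      for (int i = 0; i < n; i++)
     : :        a[i] = b[i] + c[i];
     : :
     : :    Here any candidate whose vectorization factor equals 8 is preferred
     : :    over candidates with a different VF, before costs are compared.  */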
2744 : :
2745 : : /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2746 : : true if we should. */
2747 : :
2748 : : static bool
2749 : 0 : vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2750 : : loop_vec_info old_loop_vinfo)
2751 : : {
2752 : 0 : if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2753 : : return false;
2754 : :
2755 : 0 : if (dump_enabled_p ())
2756 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
2757 : : "***** Preferring vector mode %s to vector mode %s\n",
2758 : 0 : GET_MODE_NAME (new_loop_vinfo->vector_mode),
2759 : 0 : GET_MODE_NAME (old_loop_vinfo->vector_mode));
2760 : : return true;
2761 : : }
2762 : :
2763 : : /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if ORIG_LOOP_VINFO is
2764 : : not NULL. When MASKED_P is not -1 override the default
2765 : : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P with it.
2766 : : Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance MODE_I to the next
2767 : : mode useful to analyze.
2768 : : Return the loop_vinfo on success and wrapped null on failure. */
2769 : :
2770 : : static opt_loop_vec_info
2771 : 410471 : vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
2772 : : const vect_loop_form_info *loop_form_info,
2773 : : loop_vec_info orig_loop_vinfo,
2774 : : const vector_modes &vector_modes, unsigned &mode_i,
2775 : : int masked_p,
2776 : : machine_mode &autodetected_vector_mode,
2777 : : bool &fatal)
2778 : : {
2779 : 410471 : loop_vec_info loop_vinfo
2780 : 410471 : = vect_create_loop_vinfo (loop, shared, loop_form_info, orig_loop_vinfo);
2781 : :
2782 : 410471 : machine_mode vector_mode = vector_modes[mode_i];
2783 : 410471 : loop_vinfo->vector_mode = vector_mode;
2784 : 410471 : if (masked_p != -1)
2785 : 4 : loop_vinfo->can_use_partial_vectors_p = masked_p;
2786 : 410471 : unsigned int suggested_unroll_factor = 1;
2787 : 410471 : bool single_lane_slp_done_for_suggested_uf = false;
2788 : :
2789 : : /* Run the main analysis. */
2790 : 410471 : opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
2791 : : &suggested_unroll_factor,
2792 : : single_lane_slp_done_for_suggested_uf);
2793 : 410471 : if (dump_enabled_p ())
2794 : 19078 : dump_printf_loc (MSG_NOTE, vect_location,
2795 : : "***** Analysis %s with vector mode %s\n",
2796 : 19078 : res ? "succeeded" : "failed",
2797 : 19078 : GET_MODE_NAME (loop_vinfo->vector_mode));
2798 : :
2799 : 410471 : auto user_unroll = LOOP_VINFO_LOOP (loop_vinfo)->unroll;
2800 : 410471 : if (res && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2801 : : /* Check to see if the user wants to unroll or if the target wants to. */
2802 : 462927 : && (suggested_unroll_factor > 1 || user_unroll > 1))
2803 : : {
2804 : 252 : if (suggested_unroll_factor == 1)
2805 : : {
2806 : 40 : int assumed_vf = vect_vf_for_cost (loop_vinfo);
2807 : 40 : suggested_unroll_factor = user_unroll / assumed_vf;
2808 : 40 : if (suggested_unroll_factor > 1)
2809 : : {
2810 : 28 : if (dump_enabled_p ())
2811 : 20 : dump_printf_loc (MSG_NOTE, vect_location,
2812 : : "setting unroll factor to %d based on user requested "
2813 : : "unroll factor %d and suggested vectorization "
2814 : : "factor: %d\n",
2815 : : suggested_unroll_factor, user_unroll, assumed_vf);
2816 : : }
2817 : : }
2818 : :
2819 : 252 : if (suggested_unroll_factor > 1)
2820 : : {
2821 : 240 : if (dump_enabled_p ())
2822 : 44 : dump_printf_loc (MSG_NOTE, vect_location,
2823 : : "***** Re-trying analysis for unrolling"
2824 : : " with unroll factor %d and %s slp.\n",
2825 : : suggested_unroll_factor,
2826 : : single_lane_slp_done_for_suggested_uf
2827 : : ? "single-lane" : "");
2828 : 240 : loop_vec_info unroll_vinfo
2829 : 240 : = vect_create_loop_vinfo (loop, shared, loop_form_info, NULL);
2830 : 240 : unroll_vinfo->vector_mode = vector_mode;
2831 : 240 : unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
2832 : 240 : opt_result new_res
2833 : 240 : = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
2834 : : single_lane_slp_done_for_suggested_uf);
2835 : 240 : if (new_res)
2836 : : {
2837 : 194 : delete loop_vinfo;
2838 : 194 : loop_vinfo = unroll_vinfo;
2839 : : }
2840 : : else
2841 : 46 : delete unroll_vinfo;
2842 : : }
2843 : :
2844 : : /* Record that we have honored a user unroll factor. */
2845 : 252 : LOOP_VINFO_USER_UNROLL (loop_vinfo) = user_unroll > 1;
2846 : : }
2847 : :
2848 : : /* Remember the autodetected vector mode. */
2849 : 410471 : if (vector_mode == VOIDmode)
2850 : 212092 : autodetected_vector_mode = loop_vinfo->vector_mode;
2851 : :
2852 : : /* Advance mode_i, first skipping modes that would result in the
2853 : : same analysis result. */
2854 : 1861861 : while (mode_i + 1 < vector_modes.length ()
2855 : 1304721 : && vect_chooses_same_modes_p (loop_vinfo,
2856 : 579026 : vector_modes[mode_i + 1]))
2857 : : {
2858 : 315224 : if (dump_enabled_p ())
2859 : 15707 : dump_printf_loc (MSG_NOTE, vect_location,
2860 : : "***** The result for vector mode %s would"
2861 : : " be the same\n",
2862 : 15707 : GET_MODE_NAME (vector_modes[mode_i + 1]));
2863 : 315224 : mode_i += 1;
2864 : : }
2865 : 410471 : if (mode_i + 1 < vector_modes.length ()
2866 : 674273 : && vect_chooses_same_modes_p (autodetected_vector_mode,
2867 : 263802 : vector_modes[mode_i + 1]))
2868 : : {
2869 : 347 : if (dump_enabled_p ())
2870 : 9 : dump_printf_loc (MSG_NOTE, vect_location,
2871 : : "***** Skipping vector mode %s, which would"
2872 : : " repeat the analysis for %s\n",
2873 : 9 : GET_MODE_NAME (vector_modes[mode_i + 1]),
2874 : 9 : GET_MODE_NAME (autodetected_vector_mode));
2875 : 347 : mode_i += 1;
2876 : : }
2877 : 410471 : mode_i++;
2878 : :
2879 : 410471 : if (!res)
2880 : : {
2881 : 351249 : delete loop_vinfo;
2882 : 351249 : if (fatal)
2883 : 65085 : gcc_checking_assert (orig_loop_vinfo == NULL);
2884 : 351249 : return opt_loop_vec_info::propagate_failure (res);
2885 : : }
2886 : :
2887 : 59222 : return opt_loop_vec_info::success (loop_vinfo);
2888 : : }
2889 : :
2890 : : /* Function vect_analyze_loop.
2891 : :
2892 : : Apply a set of analyses on LOOP, and create a loop_vec_info struct
2893 : : for it. The different analyses will record information in the
2894 : : loop_vec_info struct. */
2895 : : opt_loop_vec_info
2896 : 473617 : vect_analyze_loop (class loop *loop, gimple *loop_vectorized_call,
2897 : : vec_info_shared *shared)
2898 : : {
2899 : 473617 : DUMP_VECT_SCOPE ("analyze_loop_nest");
2900 : :
2901 : 473617 : if (loop_outer (loop)
2902 : 473617 : && loop_vec_info_for_loop (loop_outer (loop))
2903 : 474111 : && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2904 : 494 : return opt_loop_vec_info::failure_at (vect_location,
2905 : : "outer-loop already vectorized.\n");
2906 : :
2907 : 473123 : if (!find_loop_nest (loop, &shared->loop_nest))
2908 : 22223 : return opt_loop_vec_info::failure_at
2909 : 22223 : (vect_location,
2910 : : "not vectorized: loop nest containing two or more consecutive inner"
2911 : : " loops cannot be vectorized\n");
2912 : :
2913 : : /* Analyze the loop form. */
2914 : 450900 : vect_loop_form_info loop_form_info;
2915 : 450900 : opt_result res = vect_analyze_loop_form (loop, loop_vectorized_call,
2916 : : &loop_form_info);
2917 : 450900 : if (!res)
2918 : : {
2919 : 238808 : if (dump_enabled_p ())
2920 : 1703 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2921 : : "bad loop form.\n");
2922 : 238808 : return opt_loop_vec_info::propagate_failure (res);
2923 : : }
2924 : 212092 : if (!integer_onep (loop_form_info.assumptions))
2925 : : {
2926 : : /* We consider vectorizing this loop by versioning it under
2927 : : some assumptions. In order to do this, we need to clear
2928 : : existing information computed by scev and niter analyzer. */
2929 : 10517 : scev_reset_htab ();
2930 : 10517 : free_numbers_of_iterations_estimates (loop);
2931 : : /* Also set flag for this loop so that following scev and niter
2932 : : analysis are done under the assumptions. */
2933 : 10517 : loop_constraint_set (loop, LOOP_C_FINITE);
2934 : : }
2935 : : else
2936 : : /* Clear the existing niter information to make sure the nonwrapping flag
2937 : : will be calculated and set appropriately. */
2938 : 201575 : free_numbers_of_iterations_estimates (loop);
2939 : :
2940 : 212092 : auto_vector_modes vector_modes;
2941 : : /* Autodetect first vector size we try. */
2942 : 212092 : vector_modes.safe_push (VOIDmode);
2943 : 212092 : unsigned int autovec_flags
2944 : 424184 : = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2945 : 212092 : loop->simdlen != 0);
2946 : 212092 : bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2947 : 212092 : && !unlimited_cost_model (loop));
2948 : 212092 : machine_mode autodetected_vector_mode = VOIDmode;
2949 : 212092 : opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2950 : 212092 : unsigned int mode_i = 0;
2951 : 212092 : unsigned HOST_WIDE_INT simdlen = loop->simdlen;
2952 : :
2953 : : /* Keep track of the VF for each mode. Initialize all to 0 which indicates
2954 : : a mode has not been analyzed. */
2955 : 212092 : auto_vec<poly_uint64, 8> cached_vf_per_mode;
2956 : 2137556 : for (unsigned i = 0; i < vector_modes.length (); ++i)
2957 : 856686 : cached_vf_per_mode.safe_push (0);
2958 : :
2959 : : /* First determine the main loop vectorization mode, either the first
2960 : : one that works, starting with auto-detecting the vector mode and then
2961 : : following the targets order of preference, or the one with the
2962 : : lowest cost if pick_lowest_cost_p. */
2963 : 573834 : while (1)
2964 : : {
2965 : 392963 : bool fatal;
2966 : 392963 : unsigned int last_mode_i = mode_i;
2967 : : /* Set cached VF to -1 prior to analysis, which indicates a mode has
2968 : : failed. */
2969 : 392963 : cached_vf_per_mode[last_mode_i] = -1;
2970 : 392963 : opt_loop_vec_info loop_vinfo
2971 : 392963 : = vect_analyze_loop_1 (loop, shared, &loop_form_info,
2972 : : NULL, vector_modes, mode_i, -1,
2973 : : autodetected_vector_mode, fatal);
2974 : 392963 : if (fatal)
2975 : : break;
2976 : :
2977 : 327878 : if (loop_vinfo)
2978 : : {
2979 : : /* Analysis has been successful so update the VF value. The
2980 : : VF should always be a multiple of unroll_factor and we want to
2981 : : capture the original VF here. */
2982 : 52456 : cached_vf_per_mode[last_mode_i]
2983 : 52456 : = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2984 : 52456 : loop_vinfo->suggested_unroll_factor);
2985 : : /* Once we hit the desired simdlen for the first time,
2986 : : discard any previous attempts. */
2987 : 52456 : if (simdlen
2988 : 52456 : && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
2989 : : {
2990 : 47 : delete first_loop_vinfo;
2991 : : first_loop_vinfo = opt_loop_vec_info::success (NULL);
2992 : : simdlen = 0;
2993 : : }
2994 : 52409 : else if (pick_lowest_cost_p
2995 : 0 : && first_loop_vinfo
2996 : 52409 : && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
2997 : : {
2998 : : /* Pick loop_vinfo over first_loop_vinfo. */
2999 : 0 : delete first_loop_vinfo;
3000 : 0 : first_loop_vinfo = opt_loop_vec_info::success (NULL);
3001 : : }
3002 : 52456 : if (first_loop_vinfo == NULL)
3003 : : first_loop_vinfo = loop_vinfo;
3004 : : else
3005 : : {
3006 : 2 : delete loop_vinfo;
3007 : 2 : loop_vinfo = opt_loop_vec_info::success (NULL);
3008 : : }
3009 : :
3010 : : /* Commit to first_loop_vinfo if we have no reason to try
3011 : : alternatives. */
3012 : 52456 : if (!simdlen && !pick_lowest_cost_p)
3013 : : break;
3014 : : }
3015 : 275431 : if (mode_i == vector_modes.length ()
3016 : 275431 : || autodetected_vector_mode == VOIDmode)
3017 : : break;
3018 : :
3019 : : /* Try the next biggest vector size. */
3020 : 180871 : if (dump_enabled_p ())
3021 : 3764 : dump_printf_loc (MSG_NOTE, vect_location,
3022 : : "***** Re-trying analysis with vector mode %s\n",
3023 : 3764 : GET_MODE_NAME (vector_modes[mode_i]));
3024 : 180871 : }
3025 : 212092 : if (!first_loop_vinfo)
3026 : 159643 : return opt_loop_vec_info::propagate_failure (res);
3027 : :
3028 : 52449 : if (dump_enabled_p ())
3029 : 8969 : dump_printf_loc (MSG_NOTE, vect_location,
3030 : : "***** Choosing vector mode %s\n",
3031 : 8969 : GET_MODE_NAME (first_loop_vinfo->vector_mode));
3032 : :
3033 : : /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3034 : : enabled, SIMDUID is not set, it is the innermost loop and we have
3035 : : either already found the loop's SIMDLEN or there was no SIMDLEN to
3036 : : begin with.
3037 : : TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3038 : 52449 : bool vect_epilogues = (!simdlen
3039 : 52447 : && loop->inner == NULL
3040 : 51939 : && param_vect_epilogues_nomask
3041 : 50904 : && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3042 : : /* No code motion support for multiple epilogues so for now
3043 : : not supported when multiple exits. */
3044 : 25164 : && !LOOP_VINFO_EARLY_BREAKS (first_loop_vinfo)
3045 : 24752 : && !loop->simduid
3046 : 75788 : && loop_cost_model (loop) > VECT_COST_MODEL_VERY_CHEAP);
3047 : 52449 : if (!vect_epilogues)
3048 : 39966 : return first_loop_vinfo;
3049 : :
3050 : : /* Now analyze first_loop_vinfo for epilogue vectorization. */
3051 : :
3052 : : /* For epilogues start the analysis from the first mode. The motivation
3053 : : behind starting from the beginning comes from cases where the VECTOR_MODES
3054 : : array may contain length-agnostic and length-specific modes. Their
3055 : : ordering is not guaranteed, so we could end up picking a mode for the main
3056 : : loop that is after the epilogue's optimal mode. */
3057 : 12483 : int masked_p = -1;
3058 : 12483 : if (!unlimited_cost_model (loop)
3059 : 12483 : && (first_loop_vinfo->vector_costs->suggested_epilogue_mode (masked_p)
3060 : : != VOIDmode))
3061 : : {
3062 : 4 : vector_modes[0]
3063 : 4 : = first_loop_vinfo->vector_costs->suggested_epilogue_mode (masked_p);
3064 : 4 : cached_vf_per_mode[0] = 0;
3065 : : }
3066 : : else
3067 : 12479 : vector_modes[0] = autodetected_vector_mode;
3068 : 12483 : mode_i = 0;
3069 : :
3070 : 24976 : bool supports_partial_vectors = (param_vect_partial_vector_usage != 0
3071 : 12483 : || masked_p == 1);
3072 : : machine_mode mask_mode;
3073 : : if (supports_partial_vectors
3074 : 29 : && !partial_vectors_supported_p ()
3075 : 29 : && !(VECTOR_MODE_P (first_loop_vinfo->vector_mode)
3076 : 29 : && targetm.vectorize.get_mask_mode
3077 : 12493 : (first_loop_vinfo->vector_mode).exists (&mask_mode)
3078 : 29 : && SCALAR_INT_MODE_P (mask_mode)))
3079 : 19 : supports_partial_vectors = false;
3080 : 12483 : poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3081 : :
3082 : 12483 : loop_vec_info orig_loop_vinfo = first_loop_vinfo;
3083 : 12667 : do
3084 : : {
3085 : : /* Let the user override what the target suggests. */
3086 : 12575 : if (OPTION_SET_P (param_vect_partial_vector_usage))
3087 : 34 : masked_p = -1;
3088 : :
3089 : 43098 : while (1)
3090 : : {
3091 : : /* If the target does not support partial vectors we can shorten the
3092 : : number of modes to analyze for the epilogue as we know we can't
3093 : : pick a mode that would lead to a VF at least as big as the
3094 : : FIRST_VINFO_VF. */
3095 : 56413 : if (!supports_partial_vectors
3096 : 43098 : && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3097 : : {
3098 : 13340 : mode_i++;
3099 : 26680 : if (mode_i == vector_modes.length ())
3100 : : break;
3101 : 25565 : continue;
3102 : : }
3103 : : /* We would need an exhaustive search to find all modes we
3104 : : skipped but that would lead to the same result as the
3105 : : analysis they were skipped for, and against which we could
3106 : : check cached_vf_per_mode.
3107 : : Check for the autodetected mode, which is the common
3108 : : situation on x86, which does not perform cost comparison. */
3109 : 42008 : if (!supports_partial_vectors
3110 : 29734 : && maybe_ge (cached_vf_per_mode[0], first_vinfo_vf)
3111 : 58986 : && vect_chooses_same_modes_p (autodetected_vector_mode,
3112 : 29228 : vector_modes[mode_i]))
3113 : : {
3114 : 12250 : mode_i++;
3115 : 24500 : if (mode_i == vector_modes.length ())
3116 : : break;
3117 : 12250 : continue;
3118 : : }
3119 : :
3120 : 17508 : if (dump_enabled_p ())
3121 : 3044 : dump_printf_loc (MSG_NOTE, vect_location,
3122 : : "***** Re-trying epilogue analysis with vector "
3123 : 3044 : "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3124 : :
3125 : 17508 : bool fatal;
3126 : 17508 : opt_loop_vec_info loop_vinfo
3127 : 17508 : = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3128 : : orig_loop_vinfo,
3129 : : vector_modes, mode_i, masked_p,
3130 : : autodetected_vector_mode, fatal);
3131 : 17508 : if (fatal)
3132 : : break;
3133 : :
3134 : 17508 : if (loop_vinfo)
3135 : : {
3136 : 6766 : if (pick_lowest_cost_p
3137 : 0 : && orig_loop_vinfo->epilogue_vinfo
3138 : 6766 : && vect_joust_loop_vinfos (loop_vinfo,
3139 : 0 : orig_loop_vinfo->epilogue_vinfo))
3140 : : {
3141 : 0 : gcc_assert (vect_epilogues);
3142 : 0 : delete orig_loop_vinfo->epilogue_vinfo;
3143 : 0 : orig_loop_vinfo->epilogue_vinfo = nullptr;
3144 : : }
3145 : 6766 : if (!orig_loop_vinfo->epilogue_vinfo)
3146 : 6766 : orig_loop_vinfo->epilogue_vinfo = loop_vinfo;
3147 : : else
3148 : : {
3149 : 0 : delete loop_vinfo;
3150 : 0 : loop_vinfo = opt_loop_vec_info::success (NULL);
3151 : : }
3152 : :
3153 : : /* For now only allow one epilogue loop, but allow
3154 : : pick_lowest_cost_p to replace it, so commit to the
3155 : : first epilogue if we have no reason to try alternatives. */
3156 : 6766 : if (!pick_lowest_cost_p)
3157 : : break;
3158 : : }
3159 : :
3160 : : /* Revert to the default from the suggested preferred
3161 : : epilogue vectorization mode. */
3162 : 10742 : masked_p = -1;
3163 : 21484 : if (mode_i == vector_modes.length ())
3164 : : break;
3165 : : }
3166 : :
3167 : 12575 : orig_loop_vinfo = orig_loop_vinfo->epilogue_vinfo;
3168 : 12575 : if (!orig_loop_vinfo)
3169 : : break;
3170 : :
3171 : : /* When we selected a first vectorized epilogue, see if the target
3172 : : suggests to have another one. */
3173 : 6766 : masked_p = -1;
3174 : 6766 : if (!unlimited_cost_model (loop)
3175 : 3962 : && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (orig_loop_vinfo)
3176 : 10724 : && (orig_loop_vinfo->vector_costs->suggested_epilogue_mode (masked_p)
3177 : : != VOIDmode))
3178 : : {
3179 : 184 : vector_modes[0]
3180 : 92 : = orig_loop_vinfo->vector_costs->suggested_epilogue_mode (masked_p);
3181 : 92 : cached_vf_per_mode[0] = 0;
3182 : 92 : mode_i = 0;
3183 : : }
3184 : : else
3185 : : break;
3186 : 92 : }
3187 : : while (1);
3188 : :
3189 : 12483 : if (first_loop_vinfo->epilogue_vinfo)
3190 : : {
3191 : 6678 : poly_uint64 lowest_th
3192 : 6678 : = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3193 : 6678 : loop_vec_info epilog_vinfo = first_loop_vinfo->epilogue_vinfo;
3194 : 6766 : do
3195 : : {
3196 : 6766 : poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (epilog_vinfo);
3197 : 6766 : gcc_assert (!LOOP_REQUIRES_VERSIONING (epilog_vinfo)
3198 : : || maybe_ne (lowest_th, 0U));
3199 : : /* Keep track of the known smallest versioning threshold. */
3200 : 6766 : if (ordered_p (lowest_th, th))
3201 : 6766 : lowest_th = ordered_min (lowest_th, th);
3202 : 6766 : epilog_vinfo = epilog_vinfo->epilogue_vinfo;
3203 : : }
3204 : 6766 : while (epilog_vinfo);
3205 : 6678 : LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3206 : 6678 : if (dump_enabled_p ())
3207 : 1308 : dump_printf_loc (MSG_NOTE, vect_location,
3208 : : "***** Choosing epilogue vector mode %s\n",
3209 : 1308 : GET_MODE_NAME
3210 : : (first_loop_vinfo->epilogue_vinfo->vector_mode));
3211 : : }
3212 : :
3213 : 12483 : return first_loop_vinfo;
3214 : 662992 : }
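     : : /* Editorial sketch (not from the GCC sources): conceptually, when an
     : :    epilogue loop_vinfo is chosen above, a loop such as
     : :
     : :      for (int i = 0; i < n; i++)
     : :        a[i] = b[i] + c[i];
     : :
     : :    is later emitted as a main vector loop processing VF elements per
     : :    iteration plus a vectorized epilogue (a smaller VF or partial
     : :    vectors) covering the remaining n % VF iterations instead of a
     : :    purely scalar tail.  */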
3215 : :
3216 : : /* Return true if there is an in-order reduction function for CODE, storing
3217 : : it in *REDUC_FN if so. */
3218 : :
3219 : : static bool
3220 : 4941 : fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3221 : : {
3222 : : /* We support MINUS_EXPR by negating the operand. This also preserves an
3223 : : initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
3224 : : (-0.0) = -0.0. */
3225 : 4941 : if (code == PLUS_EXPR || code == MINUS_EXPR)
3226 : : {
3227 : 4265 : *reduc_fn = IFN_FOLD_LEFT_PLUS;
3228 : 0 : return true;
3229 : : }
3230 : : return false;
3231 : : }
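     : : /* Editorial example (not in the GCC sources): without
     : :    -fassociative-math, a loop like
     : :
     : :      double s = 0.0;
     : :      for (int i = 0; i < n; i++)
     : :        s -= a[i];
     : :
     : :    is handled as an in-order reduction: MINUS_EXPR is mapped to
     : :    IFN_FOLD_LEFT_PLUS above and the operand is negated, which keeps an
     : :    initial -0.0 intact as described in the comment.  */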
3232 : :
3233 : : /* Function reduction_fn_for_scalar_code
3234 : :
3235 : : Input:
3236 : : CODE - tree_code of a reduction operation.
3237 : :
3238 : : Output:
3239 : : REDUC_FN - the corresponding internal function to be used to reduce the
3240 : : vector of partial results into a single scalar result, or IFN_LAST
3241 : : if the operation is a supported reduction operation, but does not have
3242 : : such an internal function.
3243 : :
3244 : : Return FALSE if CODE currently cannot be vectorized as reduction. */
3245 : :
3246 : : bool
3247 : 1996231 : reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3248 : : {
3249 : 1996231 : if (code.is_tree_code ())
3250 : 1996177 : switch (tree_code (code))
3251 : : {
3252 : 14234 : case MAX_EXPR:
3253 : 14234 : *reduc_fn = IFN_REDUC_MAX;
3254 : 14234 : return true;
3255 : :
3256 : 50709 : case MIN_EXPR:
3257 : 50709 : *reduc_fn = IFN_REDUC_MIN;
3258 : 50709 : return true;
3259 : :
3260 : 1076053 : case PLUS_EXPR:
3261 : 1076053 : *reduc_fn = IFN_REDUC_PLUS;
3262 : 1076053 : return true;
3263 : :
3264 : 254353 : case BIT_AND_EXPR:
3265 : 254353 : *reduc_fn = IFN_REDUC_AND;
3266 : 254353 : return true;
3267 : :
3268 : 283722 : case BIT_IOR_EXPR:
3269 : 283722 : *reduc_fn = IFN_REDUC_IOR;
3270 : 283722 : return true;
3271 : :
3272 : 43330 : case BIT_XOR_EXPR:
3273 : 43330 : *reduc_fn = IFN_REDUC_XOR;
3274 : 43330 : return true;
3275 : :
3276 : 273776 : case MULT_EXPR:
3277 : 273776 : case MINUS_EXPR:
3278 : 273776 : *reduc_fn = IFN_LAST;
3279 : 273776 : return true;
3280 : :
3281 : : default:
3282 : : return false;
3283 : : }
3284 : : else
3285 : 54 : switch (combined_fn (code))
3286 : : {
3287 : 30 : CASE_CFN_FMAX:
3288 : 30 : *reduc_fn = IFN_REDUC_FMAX;
3289 : 30 : return true;
3290 : :
3291 : 24 : CASE_CFN_FMIN:
3292 : 24 : *reduc_fn = IFN_REDUC_FMIN;
3293 : 24 : return true;
3294 : :
3295 : : default:
3296 : : return false;
3297 : : }
3298 : : }
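     : : /* Editorial example (not in the GCC sources): for a loop computing a
     : :    maximum, e.g.
     : :
     : :      int m = INT_MIN;
     : :      for (int i = 0; i < n; i++)
     : :        m = m < a[i] ? a[i] : m;   // folded to MAX_EXPR at GIMPLE
     : :
     : :    the detected reduction code is MAX_EXPR, so the epilogue reduces the
     : :    vector of partial maxima with IFN_REDUC_MAX as selected above.  */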
3299 : :
3300 : : /* If there is a neutral value X such that a reduction would not be affected
3301 : : by the introduction of additional X elements, return that X, otherwise
3302 : : return null. CODE is the code of the reduction and SCALAR_TYPE is type
3303 : : of the scalar elements. If the reduction has just a single initial value
3304 : : then INITIAL_VALUE is that value, otherwise it is null.
3305 : : If AS_INITIAL is TRUE the value is supposed to be used as initial value.
3306 : : In that case no signed zero is returned. */
3307 : :
3308 : : tree
3309 : 72534 : neutral_op_for_reduction (tree scalar_type, code_helper code,
3310 : : tree initial_value, bool as_initial)
3311 : : {
3312 : 72534 : if (code.is_tree_code ())
3313 : 72480 : switch (tree_code (code))
3314 : : {
3315 : 10435 : case DOT_PROD_EXPR:
3316 : 10435 : case SAD_EXPR:
3317 : 10435 : case MINUS_EXPR:
3318 : 10435 : case BIT_IOR_EXPR:
3319 : 10435 : case BIT_XOR_EXPR:
3320 : 10435 : return build_zero_cst (scalar_type);
3321 : 56234 : case WIDEN_SUM_EXPR:
3322 : 56234 : case PLUS_EXPR:
3323 : 56234 : if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
3324 : 19 : return build_real (scalar_type, dconstm0);
3325 : : else
3326 : 56215 : return build_zero_cst (scalar_type);
3327 : :
3328 : 1990 : case MULT_EXPR:
3329 : 1990 : return build_one_cst (scalar_type);
3330 : :
3331 : 1343 : case BIT_AND_EXPR:
3332 : 1343 : return build_all_ones_cst (scalar_type);
3333 : :
3334 : : case MAX_EXPR:
3335 : : case MIN_EXPR:
3336 : : return initial_value;
3337 : :
3338 : 376 : default:
3339 : 376 : return NULL_TREE;
3340 : : }
3341 : : else
3342 : 54 : switch (combined_fn (code))
3343 : : {
3344 : : CASE_CFN_FMIN:
3345 : : CASE_CFN_FMAX:
3346 : : return initial_value;
3347 : :
3348 : 0 : default:
3349 : 0 : return NULL_TREE;
3350 : : }
3351 : : }
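     : : /* Editorial note (not in the GCC sources): the neutral value is what
     : :    extra vector lanes can be filled with without changing the result,
     : :    e.g. 0 for PLUS_EXPR/BIT_IOR_EXPR/BIT_XOR_EXPR, 1 for MULT_EXPR,
     : :    all-ones for BIT_AND_EXPR, and the initial value itself for
     : :    MIN_EXPR/MAX_EXPR, exactly as returned above.  For instance, summing
     : :    { a0, a1, a2, 0 } gives the same result as summing { a0, a1, a2 }.  */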
3352 : :
3353 : : /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3354 : : STMT is printed with a message MSG. */
3355 : :
3356 : : static void
3357 : 491 : report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3358 : : {
3359 : 491 : dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3360 : 491 : }
3361 : :
3362 : : /* Return true if we need an in-order reduction for operation CODE
3363 : : on type TYPE. For integral types this depends on whether the
3364 : : operation may trap on overflow. */
3365 : :
3366 : : bool
3367 : 6361606 : needs_fold_left_reduction_p (tree type, code_helper code)
3368 : : {
3369 : : /* CHECKME: check for !flag_finite_math_only too? */
3370 : 6361606 : if (SCALAR_FLOAT_TYPE_P (type))
3371 : : {
3372 : 541885 : if (code.is_tree_code ())
3373 : 541835 : switch (tree_code (code))
3374 : : {
3375 : : case MIN_EXPR:
3376 : : case MAX_EXPR:
3377 : : return false;
3378 : :
3379 : 540375 : default:
3380 : 540375 : return !flag_associative_math;
3381 : : }
3382 : : else
3383 : 50 : switch (combined_fn (code))
3384 : : {
3385 : : CASE_CFN_FMIN:
3386 : : CASE_CFN_FMAX:
3387 : : return false;
3388 : :
3389 : 2 : default:
3390 : 2 : return !flag_associative_math;
3391 : : }
3392 : : }
3393 : :
3394 : 5819721 : if (INTEGRAL_TYPE_P (type))
3395 : 5818928 : return (!code.is_tree_code ()
3396 : 5818928 : || !operation_no_trapping_overflow (type, tree_code (code)));
3397 : :
3398 : 793 : if (SAT_FIXED_POINT_TYPE_P (type))
3399 : : return true;
3400 : :
3401 : : return false;
3402 : : }
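     : : /* Editorial example (not in the GCC sources): for
     : :
     : :      double s = 0.0;
     : :      for (int i = 0; i < n; i++)
     : :        s += a[i];
     : :
     : :    this returns true unless -fassociative-math is in effect, because
     : :    reassociating the FP additions could change the result; for signed
     : :    integer addition it returns true when overflow traps (e.g. with
     : :    -ftrapv), per the operation_no_trapping_overflow check above.  */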
3403 : :
3404 : : /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
3405 : : has a handled computation expression. Store the main reduction
3406 : : operation in *CODE. */
3407 : :
3408 : : static bool
3409 : 63806 : check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3410 : : tree loop_arg, code_helper *code,
3411 : : vec<std::pair<ssa_op_iter, use_operand_p> > &path,
3412 : : bool inner_loop_of_double_reduc)
3413 : : {
3414 : 63806 : auto_bitmap visited;
3415 : 63806 : tree lookfor = PHI_RESULT (phi);
3416 : 63806 : ssa_op_iter curri;
3417 : 63806 : use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3418 : 134124 : while (USE_FROM_PTR (curr) != loop_arg)
3419 : 6512 : curr = op_iter_next_use (&curri);
3420 : 63806 : curri.i = curri.numops;
3421 : 616378 : do
3422 : : {
3423 : 616378 : path.safe_push (std::make_pair (curri, curr));
3424 : 616378 : tree use = USE_FROM_PTR (curr);
3425 : 616378 : if (use == lookfor)
3426 : : break;
3427 : 552863 : gimple *def = SSA_NAME_DEF_STMT (use);
3428 : 552863 : if (gimple_nop_p (def)
3429 : 552863 : || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3430 : : {
3431 : 467913 : pop:
3432 : 467913 : do
3433 : : {
3434 : 467913 : std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3435 : 467913 : curri = x.first;
3436 : 467913 : curr = x.second;
3437 : 512542 : do
3438 : 512542 : curr = op_iter_next_use (&curri);
3439 : : /* Skip already visited or non-SSA operands (from iterating
3440 : : over PHI args). */
3441 : : while (curr != NULL_USE_OPERAND_P
3442 : 1025084 : && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3443 : 176580 : || ! bitmap_set_bit (visited,
3444 : 176580 : SSA_NAME_VERSION
3445 : : (USE_FROM_PTR (curr)))));
3446 : : }
3447 : 935826 : while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3448 : 157728 : if (curr == NULL_USE_OPERAND_P)
3449 : : break;
3450 : : }
3451 : : else
3452 : : {
3453 : 463516 : if (gimple_code (def) == GIMPLE_PHI)
3454 : 48557 : curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3455 : : else
3456 : 414959 : curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3457 : : while (curr != NULL_USE_OPERAND_P
3458 : 557146 : && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3459 : 484955 : || ! bitmap_set_bit (visited,
3460 : 484955 : SSA_NAME_VERSION
3461 : : (USE_FROM_PTR (curr)))))
3462 : 93630 : curr = op_iter_next_use (&curri);
3463 : 463516 : if (curr == NULL_USE_OPERAND_P)
3464 : 68381 : goto pop;
3465 : : }
3466 : : }
3467 : : while (1);
3468 : 63806 : if (dump_file && (dump_flags & TDF_DETAILS))
3469 : : {
3470 : 3644 : dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3471 : 3644 : unsigned i;
3472 : 3644 : std::pair<ssa_op_iter, use_operand_p> *x;
3473 : 12504 : FOR_EACH_VEC_ELT (path, i, x)
3474 : 8860 : dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3475 : 3644 : dump_printf (MSG_NOTE, "\n");
3476 : : }
3477 : :
3478 : : /* Check whether the reduction path detected is valid. */
3479 : 63806 : bool fail = path.length () == 0;
3480 : 63806 : bool neg = false;
3481 : 63806 : int sign = -1;
3482 : 63806 : *code = ERROR_MARK;
3483 : 139798 : for (unsigned i = 1; i < path.length (); ++i)
3484 : : {
3485 : 78911 : gimple *use_stmt = USE_STMT (path[i].second);
3486 : 78911 : gimple_match_op op;
3487 : 78911 : if (!gimple_extract_op (use_stmt, &op))
3488 : : {
3489 : : fail = true;
3490 : 2919 : break;
3491 : : }
3492 : 78352 : unsigned int opi = op.num_ops;
3493 : 78352 : if (gassign *assign = dyn_cast<gassign *> (use_stmt))
3494 : : {
3495 : : /* The following make sure we can compute the operand index
3496 : : easily plus it mostly disallows chaining via COND_EXPR condition
3497 : : operands. */
3498 : 124469 : for (opi = 0; opi < op.num_ops; ++opi)
3499 : 123528 : if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
3500 : : break;
3501 : : }
3502 : 3150 : else if (gcall *call = dyn_cast<gcall *> (use_stmt))
3503 : : {
3504 : 6320 : for (opi = 0; opi < op.num_ops; ++opi)
3505 : 6320 : if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
3506 : : break;
3507 : : }
3508 : 78352 : if (opi == op.num_ops)
3509 : : {
3510 : : fail = true;
3511 : : break;
3512 : : }
3513 : 77411 : op.code = canonicalize_code (op.code, op.type);
3514 : 77411 : if (op.code == MINUS_EXPR)
3515 : : {
3516 : 3844 : op.code = PLUS_EXPR;
3517 : : /* Track whether we negate the reduction value each iteration. */
3518 : 3844 : if (op.ops[1] == op.ops[opi])
3519 : 32 : neg = ! neg;
3520 : : }
3521 : 73567 : else if (op.code == IFN_COND_SUB)
3522 : : {
3523 : 2 : op.code = IFN_COND_ADD;
3524 : : /* Track whether we negate the reduction value each iteration. */
3525 : 2 : if (op.ops[2] == op.ops[opi])
3526 : 0 : neg = ! neg;
3527 : : }
3528 : : /* For an FMA the reduction code is the PLUS if the addition chain
3529 : : is the reduction. */
3530 : 73565 : else if (op.code == IFN_FMA && opi == 2)
3531 : 28 : op.code = PLUS_EXPR;
3532 : 77411 : if (CONVERT_EXPR_CODE_P (op.code)
3533 : 77411 : && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
3534 : : ;
3535 : 73995 : else if (*code == ERROR_MARK)
3536 : : {
3537 : 62098 : *code = op.code;
3538 : 62098 : sign = TYPE_SIGN (op.type);
3539 : : }
3540 : 11897 : else if (op.code != *code)
3541 : : {
3542 : : fail = true;
3543 : : break;
3544 : : }
3545 : 10681 : else if ((op.code == MIN_EXPR
3546 : 10597 : || op.code == MAX_EXPR)
3547 : 10693 : && sign != TYPE_SIGN (op.type))
3548 : : {
3549 : : fail = true;
3550 : : break;
3551 : : }
3552 : : /* Check that the op is used in only a single stmt. For the
3553 : : non-value-changing tail and the last stmt allow out-of-loop uses,
3554 : : but not when this is the inner loop of a double reduction.
3555 : : ??? We could relax this and handle arbitrary live stmts by
3556 : : forcing a scalar epilogue for example. */
3557 : 76192 : imm_use_iterator imm_iter;
3558 : 76192 : use_operand_p use_p;
3559 : 76192 : gimple *op_use_stmt;
3560 : 76192 : unsigned cnt = 0;
3561 : 79312 : bool cond_fn_p = op.code.is_internal_fn ()
3562 : 3120 : && (conditional_internal_fn_code (internal_fn (op.code))
3563 : 76192 : != ERROR_MARK);
3564 : :
3565 : 177523 : FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
3566 : : {
3567 : : /* In case of a COND_OP (mask, op1, op2, op1) reduction we should
3568 : : have op1 twice (once as definition, once as else) in the same
3569 : : operation. Enforce this. */
3570 : 101331 : if (cond_fn_p && op_use_stmt == use_stmt)
3571 : : {
3572 : 3064 : gcall *call = as_a<gcall *> (use_stmt);
3573 : 3064 : unsigned else_pos
3574 : 3064 : = internal_fn_else_index (internal_fn (op.code));
3575 : 3064 : if (gimple_call_arg (call, else_pos) != op.ops[opi])
3576 : : {
3577 : : fail = true;
3578 : : break;
3579 : : }
3580 : 15320 : for (unsigned int j = 0; j < gimple_call_num_args (call); ++j)
3581 : : {
3582 : 12256 : if (j == else_pos)
3583 : 3064 : continue;
3584 : 9192 : if (gimple_call_arg (call, j) == op.ops[opi])
3585 : 3064 : cnt++;
3586 : : }
3587 : : }
3588 : 98267 : else if (!is_gimple_debug (op_use_stmt)
3589 : 98267 : && ((*code != ERROR_MARK || inner_loop_of_double_reduc)
3590 : 1765 : || flow_bb_inside_loop_p (loop,
3591 : 1765 : gimple_bb (op_use_stmt))))
3592 : 146799 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3593 : 73404 : cnt++;
3594 : 76192 : }
3595 : :
3596 : 76192 : if (cnt != 1)
3597 : : {
3598 : : fail = true;
3599 : : break;
3600 : : }
3601 : : }
3602 : 67025 : return ! fail && ! neg && *code != ERROR_MARK;
3603 : 63806 : }
3604 : :
3605 : : bool
3606 : 21 : check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3607 : : tree loop_arg, enum tree_code code)
3608 : : {
3609 : 21 : auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3610 : 21 : code_helper code_;
3611 : 21 : return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path, false)
3612 : 21 : && code_ == code);
3613 : 21 : }
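     : : /* Editorial example (not in the GCC sources): for a reduction chained
     : :    over several statements, e.g.
     : :
     : :      for (int i = 0; i < n; i++)
     : :        s = (s + a[i]) + b[i];
     : :
     : :    the path walked above runs from the loop PHI of s through both
     : :    PLUS_EXPR statements to the latch definition, and all statements on
     : :    the path must use the same reduction code (conversions aside).  */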
3614 : :
3615 : :
3616 : :
3617 : : /* Function vect_is_simple_reduction
3618 : :
3619 : : (1) Detect a cross-iteration def-use cycle that represents a simple
3620 : : reduction computation. We look for the following pattern:
3621 : :
3622 : : loop_header:
3623 : : a1 = phi < a0, a2 >
3624 : : a3 = ...
3625 : : a2 = operation (a3, a1)
3626 : :
3627 : : or
3628 : :
3629 : : a3 = ...
3630 : : loop_header:
3631 : : a1 = phi < a0, a2 >
3632 : : a2 = operation (a3, a1)
3633 : :
3634 : : such that:
3635 : : 1. operation is commutative and associative and it is safe to
3636 : : change the order of the computation
3637 : : 2. no uses for a2 in the loop (a2 is used out of the loop)
3638 : : 3. no uses of a1 in the loop besides the reduction operation
3639 : : 4. no uses of a1 outside the loop.
3640 : :
3641 : : Conditions 1,4 are tested here.
3642 : : Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3643 : :
3644 : : (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3645 : : nested cycles.
3646 : :
3647 : : (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3648 : : reductions:
3649 : :
3650 : : a1 = phi < a0, a2 >
3651 : : inner loop (def of a3)
3652 : : a2 = phi < a3 >
3653 : :
3654 : : (4) Detect condition expressions, ie:
3655 : : for (int i = 0; i < N; i++)
3656 : : if (a[i] < val)
3657 : : ret_val = a[i];
3658 : :
3659 : : */
3660 : :
3661 : : static stmt_vec_info
3662 : 84795 : vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3663 : : gphi **double_reduc)
3664 : : {
3665 : 84795 : gphi *phi = as_a <gphi *> (phi_info->stmt);
3666 : 84795 : gimple *phi_use_stmt = NULL;
3667 : 84795 : imm_use_iterator imm_iter;
3668 : 84795 : use_operand_p use_p;
3669 : :
3670 : : /* When double_reduc is NULL we are testing the inner loop of a
3671 : : double reduction. */
3672 : 84795 : bool inner_loop_of_double_reduc = double_reduc == NULL;
3673 : 84795 : if (double_reduc)
3674 : 83812 : *double_reduc = NULL;
3675 : 84795 : STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3676 : :
3677 : 84795 : tree phi_name = PHI_RESULT (phi);
3678 : : /* ??? If there are no uses of the PHI result the inner loop reduction
3679 : : won't be detected as possibly double-reduction by vectorizable_reduction
3680 : : because that tries to walk the PHI arg from the preheader edge which
3681 : : can be constant. See PR60382. */
3682 : 84795 : if (has_zero_uses (phi_name))
3683 : : return NULL;
3684 : 84667 : class loop *loop = (gimple_bb (phi))->loop_father;
3685 : 84667 : unsigned nphi_def_loop_uses = 0;
3686 : 210043 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3687 : : {
3688 : 129204 : gimple *use_stmt = USE_STMT (use_p);
3689 : 129204 : if (is_gimple_debug (use_stmt))
3690 : 31252 : continue;
3691 : :
3692 : 97952 : if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3693 : : {
3694 : 3828 : if (dump_enabled_p ())
3695 : 30 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3696 : : "intermediate value used outside loop.\n");
3697 : :
3698 : 3828 : return NULL;
3699 : : }
3700 : :
3701 : : /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
3702 : : op1 twice (once as definition, once as else) in the same operation.
3703 : : Only count it as one. */
3704 : 94124 : if (use_stmt != phi_use_stmt)
3705 : : {
3706 : 90799 : nphi_def_loop_uses++;
3707 : 90799 : phi_use_stmt = use_stmt;
3708 : : }
3709 : : }
3710 : :
3711 : 80839 : tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3712 : 80839 : if (TREE_CODE (latch_def) != SSA_NAME)
3713 : : {
3714 : 1222 : if (dump_enabled_p ())
3715 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3716 : : "reduction: not ssa_name: %T\n", latch_def);
3717 : 1222 : return NULL;
3718 : : }
3719 : :
3720 : 79617 : stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3721 : 79617 : if (!def_stmt_info
3722 : 79617 : || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3723 : 135 : return NULL;
3724 : :
3725 : 79482 : bool nested_in_vect_loop
3726 : 79482 : = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3727 : 79482 : unsigned nlatch_def_loop_uses = 0;
3728 : 79482 : auto_vec<gphi *, 3> lcphis;
3729 : 299811 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3730 : : {
3731 : 220329 : gimple *use_stmt = USE_STMT (use_p);
3732 : 220329 : if (is_gimple_debug (use_stmt))
3733 : 61091 : continue;
3734 : 159238 : if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3735 : 88116 : nlatch_def_loop_uses++;
3736 : : else
3737 : : /* We can have more than one loop-closed PHI. */
3738 : 71122 : lcphis.safe_push (as_a <gphi *> (use_stmt));
3739 : : }
3740 : :
3741 : : /* If we are vectorizing an inner reduction we are executing that
3742 : : in the original order only in case we are not dealing with a
3743 : : double reduction. */
3744 : 79482 : if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3745 : : {
3746 : 2185 : if (dump_enabled_p ())
3747 : 361 : report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3748 : : "detected nested cycle: ");
3749 : 2185 : return def_stmt_info;
3750 : : }
3751 : :
3752 : : /* When the inner loop of a double reduction ends up with more than
3753 : : one loop-closed PHI we have failed to classify alternate such
3754 : : PHIs as double reduction, leading to wrong code. See PR103237. */
3755 : 78268 : if (inner_loop_of_double_reduc && lcphis.length () != 1)
3756 : : {
3757 : 1 : if (dump_enabled_p ())
3758 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3759 : : "unhandled double reduction\n");
3760 : 1 : return NULL;
3761 : : }
3762 : :
3763 : : /* If this isn't a nested cycle or if the nested cycle reduction value
3764 : : is used outside of the inner loop we cannot handle uses of the reduction
3765 : : value. */
3766 : 77296 : if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3767 : : {
3768 : 12362 : if (dump_enabled_p ())
3769 : 316 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3770 : : "reduction used in loop.\n");
3771 : 12362 : return NULL;
3772 : : }
3773 : :
3774 : : /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3775 : : defined in the inner loop. */
3776 : 64934 : if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3777 : : {
3778 : 1149 : tree op1 = PHI_ARG_DEF (def_stmt, 0);
3779 : 1149 : if (gimple_phi_num_args (def_stmt) != 1
3780 : 1149 : || TREE_CODE (op1) != SSA_NAME)
3781 : : {
3782 : 52 : if (dump_enabled_p ())
3783 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3784 : : "unsupported phi node definition.\n");
3785 : :
3786 : 52 : return NULL;
3787 : : }
3788 : :
3789 : : /* Verify there is an inner cycle composed of the PHI phi_use_stmt
3790 : : and the latch definition op1. */
3791 : 1097 : gimple *def1 = SSA_NAME_DEF_STMT (op1);
3792 : 1097 : if (gimple_bb (def1)
3793 : 1097 : && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3794 : 1097 : && loop->inner
3795 : 1089 : && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3796 : 1089 : && (is_gimple_assign (def1) || is_gimple_call (def1))
3797 : 1080 : && is_a <gphi *> (phi_use_stmt)
3798 : 1069 : && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
3799 : 1069 : && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
3800 : : loop_latch_edge (loop->inner)))
3801 : 2164 : && lcphis.length () == 1)
3802 : : {
3803 : 983 : if (dump_enabled_p ())
3804 : 130 : report_vect_op (MSG_NOTE, def_stmt,
3805 : : "detected double reduction: ");
3806 : :
3807 : 983 : *double_reduc = as_a <gphi *> (phi_use_stmt);
3808 : 983 : return def_stmt_info;
3809 : : }
3810 : :
3811 : 114 : return NULL;
3812 : : }
3813 : :
3814 : : /* Look for the expression computing latch_def from the loop PHI result. */
3815 : 63785 : auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3816 : 63785 : code_helper code;
3817 : 63785 : if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3818 : : path, inner_loop_of_double_reduc))
3819 : : {
3820 : 60566 : STMT_VINFO_REDUC_CODE (phi_info) = code;
3821 : 60566 : if (code == COND_EXPR && !nested_in_vect_loop)
3822 : 4145 : STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3823 : :
3824 : : /* Fill in STMT_VINFO_REDUC_IDX. */
3825 : 60566 : unsigned i;
3826 : 195628 : for (i = path.length () - 1; i >= 1; --i)
3827 : : {
3828 : 74496 : gimple *stmt = USE_STMT (path[i].second);
3829 : 74496 : stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3830 : 74496 : gimple_match_op op;
3831 : 74496 : if (!gimple_extract_op (stmt, &op))
3832 : 0 : gcc_unreachable ();
3833 : 74496 : if (gassign *assign = dyn_cast<gassign *> (stmt))
3834 : 71366 : STMT_VINFO_REDUC_IDX (stmt_info)
3835 : 71366 : = path[i].second->use - gimple_assign_rhs1_ptr (assign);
3836 : : else
3837 : : {
3838 : 3130 : gcall *call = as_a<gcall *> (stmt);
3839 : 3130 : STMT_VINFO_REDUC_IDX (stmt_info)
3840 : 3130 : = path[i].second->use - gimple_call_arg_ptr (call, 0);
3841 : : }
3842 : : }
3843 : 60566 : if (dump_enabled_p ())
3844 : 3578 : dump_printf_loc (MSG_NOTE, vect_location,
3845 : : "reduction: detected reduction\n");
3846 : :
3847 : 60566 : return def_stmt_info;
3848 : : }
3849 : :
3850 : 3219 : if (dump_enabled_p ())
3851 : 80 : dump_printf_loc (MSG_NOTE, vect_location,
3852 : : "reduction: unknown pattern\n");
3853 : :
3854 : : return NULL;
3855 : 143267 : }
3856 : :
3857 : : /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3858 : : PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3859 : : or -1 if not known. */
3860 : :
3861 : : static int
3862 : 341941 : vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
3863 : : {
3864 : 341941 : int assumed_vf = vect_vf_for_cost (loop_vinfo);
3865 : 341941 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
3866 : : {
3867 : 135755 : if (dump_enabled_p ())
3868 : 2751 : dump_printf_loc (MSG_NOTE, vect_location,
3869 : : "cost model: epilogue peel iters set to vf/2 "
3870 : : "because loop iterations are unknown.\n");
3871 : 135755 : return assumed_vf / 2;
3872 : : }
3873 : : else
3874 : : {
3875 : 206186 : int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3876 : 206186 : peel_iters_prologue = MIN (niters, peel_iters_prologue);
3877 : 206186 : int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3878 : : /* If we need to peel for gaps, but no peeling is required, we have to
3879 : : peel VF iterations. */
3880 : 206186 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
3881 : 206186 : peel_iters_epilogue = assumed_vf;
3882 : 206186 : return peel_iters_epilogue;
3883 : : }
3884 : : }
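     : : /* Editorial worked example (not in the GCC sources): with known
     : :    NITERS = 103, an assumed VF of 8 and PEEL_ITERS_PROLOGUE = 3, the
     : :    epilogue peels (103 - 3) % 8 = 4 iterations; with unknown NITERS the
     : :    estimate falls back to VF / 2 = 4 as dumped above.  */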
3885 : :
3886 : : /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3887 : : int
3888 : 260533 : vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3889 : : int *peel_iters_epilogue,
3890 : : stmt_vector_for_cost *scalar_cost_vec,
3891 : : stmt_vector_for_cost *prologue_cost_vec,
3892 : : stmt_vector_for_cost *epilogue_cost_vec)
3893 : : {
3894 : 260533 : int retval = 0;
3895 : :
3896 : 260533 : *peel_iters_epilogue
3897 : 260533 : = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
3898 : :
3899 : 260533 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3900 : : {
3901 : : /* If peeled iterations are known but number of scalar loop
3902 : : iterations are unknown, count a taken branch per peeled loop. */
3903 : 87733 : if (peel_iters_prologue > 0)
3904 : 52436 : retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3905 : : vect_prologue);
3906 : 87733 : if (*peel_iters_epilogue > 0)
3907 : 87661 : retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3908 : : vect_epilogue);
3909 : : }
3910 : :
3911 : 260533 : stmt_info_for_cost *si;
3912 : 260533 : int j;
3913 : 260533 : if (peel_iters_prologue)
3914 : 642265 : FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3915 : 532542 : retval += record_stmt_cost (prologue_cost_vec,
3916 : 532542 : si->count * peel_iters_prologue,
3917 : : si->kind, si->stmt_info, si->misalign,
3918 : : vect_prologue);
3919 : 260533 : if (*peel_iters_epilogue)
3920 : 1001315 : FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3921 : 827885 : retval += record_stmt_cost (epilogue_cost_vec,
3922 : 827885 : si->count * *peel_iters_epilogue,
3923 : : si->kind, si->stmt_info, si->misalign,
3924 : : vect_epilogue);
3925 : :
3926 : 260533 : return retval;
3927 : : }
3928 : :
3929 : : /* Function vect_estimate_min_profitable_iters
3930 : :
3931 : : Return the number of iterations required for the vector version of the
3932 : : loop to be profitable relative to the cost of the scalar version of the
3933 : : loop.
3934 : :
3935 : : *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3936 : : of iterations for vectorization. -1 value means loop vectorization
3937 : : is not profitable. This returned value may be used for dynamic
3938 : : profitability check.
3939 : :
3940 : : *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3941 : : for static check against estimated number of iterations. */
3942 : :
3943 : : static void
3944 : 97759 : vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3945 : : int *ret_min_profitable_niters,
3946 : : int *ret_min_profitable_estimate,
3947 : : unsigned *suggested_unroll_factor)
3948 : : {
3949 : 97759 : int min_profitable_iters;
3950 : 97759 : int min_profitable_estimate;
3951 : 97759 : int peel_iters_prologue;
3952 : 97759 : int peel_iters_epilogue;
3953 : 97759 : unsigned vec_inside_cost = 0;
3954 : 97759 : int vec_outside_cost = 0;
3955 : 97759 : unsigned vec_prologue_cost = 0;
3956 : 97759 : unsigned vec_epilogue_cost = 0;
3957 : 97759 : int scalar_single_iter_cost = 0;
3958 : 97759 : int scalar_outside_cost = 0;
3959 : 97759 : int assumed_vf = vect_vf_for_cost (loop_vinfo);
3960 : 97759 : int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3961 : 97759 : vector_costs *target_cost_data = loop_vinfo->vector_costs;
3962 : :
3963 : : /* Cost model disabled. */
3964 : 97759 : if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3965 : : {
3966 : 16163 : if (dump_enabled_p ())
3967 : 9916 : dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3968 : 16163 : *ret_min_profitable_niters = 0;
3969 : 16163 : *ret_min_profitable_estimate = 0;
3970 : 16163 : return;
3971 : : }
3972 : :
3973 : : /* Requires loop versioning tests to handle misalignment. */
3974 : 81596 : if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3975 : : {
3976 : : /* FIXME: Make cost depend on complexity of individual check. */
3977 : 27 : unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3978 : 27 : (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
3979 : 27 : if (dump_enabled_p ())
3980 : 1 : dump_printf (MSG_NOTE,
3981 : : "cost model: Adding cost of checks for loop "
3982 : : "versioning to treat misalignment.\n");
3983 : : }
3984 : :
3985 : : /* Requires loop versioning with alias checks. */
3986 : 81596 : if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3987 : : {
3988 : : /* FIXME: Make cost depend on complexity of individual check. */
3989 : 4059 : unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3990 : 4059 : (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
3991 : 4059 : len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3992 : 0 : if (len)
3993 : : /* Count LEN - 1 ANDs and LEN comparisons. */
3994 : 0 : (void) add_stmt_cost (target_cost_data, len * 2 - 1,
3995 : : scalar_stmt, vect_prologue);
3996 : 4059 : len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3997 : 1106 : if (len)
3998 : : {
3999 : : /* Count LEN - 1 ANDs and LEN comparisons. */
4000 : 1106 : unsigned int nstmts = len * 2 - 1;
4001 : : /* +1 for each bias that needs adding. */
4002 : 2212 : for (unsigned int i = 0; i < len; ++i)
4003 : 1106 : if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4004 : 123 : nstmts += 1;
4005 : 1106 : (void) add_stmt_cost (target_cost_data, nstmts,
4006 : : scalar_stmt, vect_prologue);
4007 : : }
4008 : 4059 : if (dump_enabled_p ())
4009 : 15 : dump_printf (MSG_NOTE,
4010 : : "cost model: Adding cost of checks for loop "
4011 : : "versioning aliasing.\n");
4012 : : }
4013 : :
4014 : : /* Requires loop versioning with niter checks. */
4015 : 81596 : if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4016 : : {
4017 : : /* FIXME: Make cost depend on complexity of individual check. */
4018 : 684 : (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4019 : : NULL, NULL, NULL_TREE, 0, vect_prologue);
4020 : 684 : if (dump_enabled_p ())
4021 : 1 : dump_printf (MSG_NOTE,
4022 : : "cost model: Adding cost of checks for loop "
4023 : : "versioning niters.\n");
4024 : : }
4025 : :
4026 : 81596 : if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4027 : 4754 : (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4028 : : vect_prologue);
4029 : :
4030 : : /* Count statements in scalar loop. Using this as scalar cost for a single
4031 : : iteration for now.
4032 : :
4033 : : TODO: Add outer loop support.
4034 : :
4035 : : TODO: Consider assigning different costs to different scalar
4036 : : statements. */
4037 : :
4038 : 81596 : scalar_single_iter_cost = (loop_vinfo->scalar_costs->total_cost ()
4039 : 81596 : * param_vect_scalar_cost_multiplier) / 100;
4040 : :
4041 : : /* Add additional cost for the peeled instructions in prologue and epilogue
4042 : : loop. (For fully-masked loops there will be no peeling.)
4043 : :
4044 : : FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4045 : : at compile-time - we assume it's vf/2 (the worst would be vf-1).
4046 : :
4047 : : TODO: Build an expression that represents peel_iters for prologue and
4048 : : epilogue to be used in a run-time test. */
4049 : :
4050 : 81596 : bool prologue_need_br_taken_cost = false;
4051 : 81596 : bool prologue_need_br_not_taken_cost = false;
4052 : :
4053 : : /* Calculate peel_iters_prologue. */
4054 : 81596 : if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4055 : : peel_iters_prologue = 0;
4056 : 81596 : else if (npeel < 0)
4057 : : {
4058 : 169 : peel_iters_prologue = assumed_vf / 2;
4059 : 169 : if (dump_enabled_p ())
4060 : 4 : dump_printf (MSG_NOTE, "cost model: "
4061 : : "prologue peel iters set to vf/2.\n");
4062 : :
4063 : : /* If peeled iterations are unknown, count a taken branch and a not taken
4064 : : branch per peeled loop. Even if scalar loop iterations are known,
4065 : : vector iterations are not known since peeled prologue iterations are
4066 : : not known. Hence guards remain the same. */
4067 : : prologue_need_br_taken_cost = true;
4068 : : prologue_need_br_not_taken_cost = true;
4069 : : }
4070 : : else
4071 : : {
4072 : 81427 : peel_iters_prologue = npeel;
4073 : 81427 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4074 : : /* If peeled iterations are known but number of scalar loop
4075 : : iterations are unknown, count a taken branch per peeled loop. */
4076 : 81596 : prologue_need_br_taken_cost = true;
4077 : : }
4078 : :
4079 : 81596 : bool epilogue_need_br_taken_cost = false;
4080 : 81596 : bool epilogue_need_br_not_taken_cost = false;
4081 : :
4082 : : /* Calculate peel_iters_epilogue. */
4083 : 81596 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4084 : : /* We need to peel exactly one iteration for gaps. */
4085 : 19 : peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4086 : 81577 : else if (npeel < 0)
4087 : : {
4088 : : /* If peeling for alignment is unknown, the loop bound of the main loop
4089 : : becomes unknown. */
4090 : 169 : peel_iters_epilogue = assumed_vf / 2;
4091 : 169 : if (dump_enabled_p ())
4092 : 4 : dump_printf (MSG_NOTE, "cost model: "
4093 : : "epilogue peel iters set to vf/2 because "
4094 : : "peeling for alignment is unknown.\n");
4095 : :
4096 : : /* See the same reason above in peel_iters_prologue calculation. */
4097 : : epilogue_need_br_taken_cost = true;
4098 : : epilogue_need_br_not_taken_cost = true;
4099 : : }
4100 : : else
4101 : : {
4102 : 81408 : peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4103 : 81408 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4104 : : 	/* If peeled iterations are known but the number of scalar loop
4105 : : 	   iterations is unknown, count a taken branch per peeled loop.  */
4106 : 81596 : epilogue_need_br_taken_cost = true;
4107 : : }
4108 : :
4109 : 81596 : stmt_info_for_cost *si;
4110 : 81596 : int j;
4111 : : /* Add costs associated with peel_iters_prologue. */
4112 : 81596 : if (peel_iters_prologue)
4113 : 809 : FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4114 : : {
4115 : 631 : (void) add_stmt_cost (target_cost_data,
4116 : 631 : si->count * peel_iters_prologue, si->kind,
4117 : : si->stmt_info, si->node, si->vectype,
4118 : : si->misalign, vect_prologue);
4119 : : }
4120 : :
4121 : : /* Add costs associated with peel_iters_epilogue. */
4122 : 81596 : if (peel_iters_epilogue)
4123 : 277770 : FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4124 : : {
4125 : 220653 : (void) add_stmt_cost (target_cost_data,
4126 : 220653 : si->count * peel_iters_epilogue, si->kind,
4127 : : si->stmt_info, si->node, si->vectype,
4128 : : si->misalign, vect_epilogue);
4129 : : }
4130 : :
4131 : : /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4132 : :
4133 : 81596 : if (prologue_need_br_taken_cost)
4134 : 170 : (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4135 : : vect_prologue);
4136 : :
4137 : 81596 : if (prologue_need_br_not_taken_cost)
4138 : 169 : (void) add_stmt_cost (target_cost_data, 1,
4139 : : cond_branch_not_taken, vect_prologue);
4140 : :
4141 : 81596 : if (epilogue_need_br_taken_cost)
4142 : 47638 : (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4143 : : vect_epilogue);
4144 : :
4145 : 81596 : if (epilogue_need_br_not_taken_cost)
4146 : 169 : (void) add_stmt_cost (target_cost_data, 1,
4147 : : cond_branch_not_taken, vect_epilogue);
4148 : :
4149 : : /* Take care of special costs for rgroup controls of partial vectors. */
4150 : 19 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4151 : 81615 : && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4152 : : == vect_partial_vectors_avx512))
4153 : : {
4154 : : /* Calculate how many masks we need to generate. */
4155 : 19 : unsigned int num_masks = 0;
4156 : 19 : bool need_saturation = false;
4157 : 78 : for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
4158 : 21 : if (rgm.type)
4159 : : {
4160 : 19 : unsigned nvectors = rgm.factor;
4161 : 19 : num_masks += nvectors;
4162 : 19 : if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
4163 : 19 : < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
4164 : 5 : need_saturation = true;
4165 : : }
4166 : :
4167 : : /* ??? The target isn't able to identify the costs below as
4168 : : 	 producing masks so it cannot penalize cases where we'd run
4169 : : 	 out of mask registers, for example.  */
4170 : :
4171 : : /* ??? We are also failing to account for smaller vector masks
4172 : : we generate by splitting larger masks in vect_get_loop_mask. */
4173 : :
4174 : : /* In the worst case, we need to generate each mask in the prologue
4175 : : and in the loop body. We need one splat per group and one
4176 : : compare per mask.
4177 : :
4178 : : Sometimes the prologue mask will fold to a constant,
4179 : : so the actual prologue cost might be smaller. However, it's
4180 : : simpler and safer to use the worst-case cost; if this ends up
4181 : : being the tie-breaker between vectorizing or not, then it's
4182 : : probably better not to vectorize. */
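 : :       /* As a hypothetical illustration: if rgc_vec holds two rgroups,
 : : 	 one needing 2 masks and the other 1, then num_masks is 3 and,
 : : 	 with one splat per rgroup, 3 + 2 = 5 vector_stmt costs are
 : : 	 charged to the prologue and again to the loop body below.  */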
4183 : 19 : (void) add_stmt_cost (target_cost_data,
4184 : : num_masks
4185 : 19 : + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4186 : : vector_stmt, NULL, NULL, NULL_TREE, 0,
4187 : : vect_prologue);
4188 : 38 : (void) add_stmt_cost (target_cost_data,
4189 : : num_masks
4190 : 38 : + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4191 : : vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);
4192 : :
4193 : : /* When we need saturation we need it both in the prologue and
4194 : : 	 in the loop body.  */
4195 : 19 : if (need_saturation)
4196 : : {
4197 : 5 : (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4198 : : NULL, NULL, NULL_TREE, 0, vect_prologue);
4199 : 5 : (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4200 : : NULL, NULL, NULL_TREE, 0, vect_body);
4201 : : }
4202 : : }
4203 : 0 : else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4204 : 81577 : && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4205 : : == vect_partial_vectors_while_ult))
4206 : : {
4207 : : /* Calculate how many masks we need to generate. */
4208 : : unsigned int num_masks = 0;
4209 : : rgroup_controls *rgm;
4210 : : unsigned int num_vectors_m1;
4211 : 0 : FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
4212 : : num_vectors_m1, rgm)
4213 : 0 : if (rgm->type)
4214 : 0 : num_masks += num_vectors_m1 + 1;
4215 : 0 : gcc_assert (num_masks > 0);
4216 : :
4217 : : /* In the worst case, we need to generate each mask in the prologue
4218 : : and in the loop body. One of the loop body mask instructions
4219 : : replaces the comparison in the scalar loop, and since we don't
4220 : : count the scalar comparison against the scalar body, we shouldn't
4221 : : count that vector instruction against the vector body either.
4222 : :
4223 : : Sometimes we can use unpacks instead of generating prologue
4224 : : masks and sometimes the prologue mask will fold to a constant,
4225 : : so the actual prologue cost might be smaller. However, it's
4226 : : simpler and safer to use the worst-case cost; if this ends up
4227 : : being the tie-breaker between vectorizing or not, then it's
4228 : : probably better not to vectorize. */
4229 : 0 : (void) add_stmt_cost (target_cost_data, num_masks,
4230 : : vector_stmt, NULL, NULL, NULL_TREE, 0,
4231 : : vect_prologue);
4232 : 0 : (void) add_stmt_cost (target_cost_data, num_masks - 1,
4233 : : vector_stmt, NULL, NULL, NULL_TREE, 0,
4234 : : vect_body);
4235 : : }
4236 : 81577 : else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4237 : : {
4238 : : /* Referring to the functions vect_set_loop_condition_partial_vectors
4239 : : and vect_set_loop_controls_directly, we need to generate each
4240 : : length in the prologue and in the loop body if required. Although
4241 : : there are some possible optimizations, we consider the worst case
4242 : : here. */
4243 : :
4244 : 0 : bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4245 : 0 : signed char partial_load_store_bias
4246 : : = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4247 : 0 : bool need_iterate_p
4248 : 0 : = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4249 : 0 : && !vect_known_niters_smaller_than_vf (loop_vinfo));
4250 : :
4251 : : /* Calculate how many statements to be added. */
4252 : 0 : unsigned int prologue_stmts = 0;
4253 : 0 : unsigned int body_stmts = 0;
4254 : :
4255 : 0 : rgroup_controls *rgc;
4256 : 0 : unsigned int num_vectors_m1;
4257 : 0 : FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4258 : 0 : if (rgc->type)
4259 : : {
4260 : : /* May need one SHIFT for nitems_total computation. */
4261 : 0 : unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4262 : 0 : if (nitems != 1 && !niters_known_p)
4263 : 0 : prologue_stmts += 1;
4264 : :
4265 : : /* May need one MAX and one MINUS for wrap around. */
4266 : 0 : if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4267 : 0 : prologue_stmts += 2;
4268 : :
4269 : : 	    /* Need one MAX and one MINUS for each batch limit except for
4270 : : 	       the first one.  */
4271 : 0 : prologue_stmts += num_vectors_m1 * 2;
4272 : :
4273 : 0 : unsigned int num_vectors = num_vectors_m1 + 1;
4274 : :
4275 : : 	    /* Need to set up lengths in the prologue; only one MIN is
4276 : : 	       required for each since the start index is zero.  */
4277 : 0 : prologue_stmts += num_vectors;
4278 : :
4279 : : /* If we have a non-zero partial load bias, we need one PLUS
4280 : : to adjust the load length. */
4281 : 0 : if (partial_load_store_bias != 0)
4282 : 0 : body_stmts += 1;
4283 : :
4284 : 0 : unsigned int length_update_cost = 0;
4285 : 0 : if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
4286 : : 	      /* For the decrement IV style, each rgroup needs only a single
4287 : : 		 SELECT_VL or MIN per iteration to compute the number of
4288 : : 		 elements to be processed in the current iteration.  */
4289 : : length_update_cost = 1;
4290 : : else
4291 : : 	      /* For the increment IV style, each rgroup may need two MINs and
4292 : : 		 one MINUS to update the lengths in the body for the next iteration.  */
4293 : 0 : length_update_cost = 3;
4294 : :
4295 : 0 : if (need_iterate_p)
4296 : 0 : body_stmts += length_update_cost * num_vectors;
4297 : : }
4298 : :
4299 : 0 : (void) add_stmt_cost (target_cost_data, prologue_stmts,
4300 : : scalar_stmt, vect_prologue);
4301 : 0 : (void) add_stmt_cost (target_cost_data, body_stmts,
4302 : : scalar_stmt, vect_body);
4303 : : }
4304 : :
4305 : : /* FORNOW: The scalar outside cost is incremented in one of the
4306 : : following ways:
4307 : :
4308 : : 1. The vectorizer checks for alignment and aliasing and generates
4309 : : a condition that allows dynamic vectorization. A cost model
4310 : :      check is ANDed with the versioning condition.  Hence the scalar code
4311 : : path now has the added cost of the versioning check.
4312 : :
4313 : : if (cost > th & versioning_check)
4314 : : jmp to vector code
4315 : :
4316 : : Hence run-time scalar is incremented by not-taken branch cost.
4317 : :
4318 : : 2. The vectorizer then checks if a prologue is required. If the
4319 : : cost model check was not done before during versioning, it has to
4320 : : be done before the prologue check.
4321 : :
4322 : : if (cost <= th)
4323 : : prologue = scalar_iters
4324 : : if (prologue == 0)
4325 : : jmp to vector code
4326 : : else
4327 : : execute prologue
4328 : : if (prologue == num_iters)
4329 : : go to exit
4330 : :
4331 : : Hence the run-time scalar cost is incremented by a taken branch,
4332 : : plus a not-taken branch, plus a taken branch cost.
4333 : :
4334 : : 3. The vectorizer then checks if an epilogue is required. If the
4335 : : cost model check was not done before during prologue check, it
4336 : : has to be done with the epilogue check.
4337 : :
4338 : : if (prologue == 0)
4339 : : jmp to vector code
4340 : : else
4341 : : execute prologue
4342 : : if (prologue == num_iters)
4343 : : go to exit
4344 : : vector code:
4345 : : if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4346 : : jmp to epilogue
4347 : :
4348 : : Hence the run-time scalar cost should be incremented by 2 taken
4349 : : branches.
4350 : :
4351 : :      TODO: The back end may reorder the BBs differently and reverse
4352 : : conditions/branch directions. Change the estimates below to
4353 : : something more reasonable. */
4354 : :
4355 : : /* If the number of iterations is known and we do not do versioning, we can
4356 : : decide whether to vectorize at compile time. Hence the scalar version
4357 : :      does not carry cost model guard costs.  */
4358 : 33428 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4359 : 115024 : || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4360 : : {
4361 : : /* Cost model check occurs at versioning. */
4362 : 48771 : if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4363 : 4754 : scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4364 : : else
4365 : : {
4366 : : /* Cost model check occurs at prologue generation. */
4367 : 44017 : if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4368 : 26 : scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4369 : 26 : + vect_get_stmt_cost (cond_branch_not_taken);
4370 : : /* Cost model check occurs at epilogue generation. */
4371 : : else
4372 : 43991 : scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4373 : : }
4374 : : }
4375 : :
4376 : : /* Complete the target-specific cost calculations. */
4377 : 81596 : loop_vinfo->vector_costs->finish_cost (loop_vinfo->scalar_costs);
4378 : 81596 : vec_prologue_cost = loop_vinfo->vector_costs->prologue_cost ();
4379 : 81596 : vec_inside_cost = loop_vinfo->vector_costs->body_cost ();
4380 : 81596 : vec_epilogue_cost = loop_vinfo->vector_costs->epilogue_cost ();
4381 : 81596 : if (suggested_unroll_factor)
4382 : 81416 : *suggested_unroll_factor
4383 : 81416 : = loop_vinfo->vector_costs->suggested_unroll_factor ();
4384 : :
4385 : 81416 : if (suggested_unroll_factor && *suggested_unroll_factor > 1
4386 : 236 : && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
4387 : 0 : && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
4388 : : *suggested_unroll_factor,
4389 : : LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
4390 : : {
4391 : 0 : if (dump_enabled_p ())
4392 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4393 : : "can't unroll as unrolled vectorization factor larger"
4394 : : " than maximum vectorization factor: "
4395 : : HOST_WIDE_INT_PRINT_UNSIGNED "\n",
4396 : : LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
4397 : 0 : *suggested_unroll_factor = 1;
4398 : : }
4399 : :
4400 : 81596 : vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4401 : :
4402 : 81596 : if (dump_enabled_p ())
4403 : : {
4404 : 614 : dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4405 : 614 : dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4406 : : vec_inside_cost);
4407 : 614 : dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4408 : : vec_prologue_cost);
4409 : 614 : dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4410 : : vec_epilogue_cost);
4411 : 614 : dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4412 : : scalar_single_iter_cost);
4413 : 614 : dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4414 : : scalar_outside_cost);
4415 : 614 : dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4416 : : vec_outside_cost);
4417 : 614 : dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4418 : : peel_iters_prologue);
4419 : 614 : dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4420 : : peel_iters_epilogue);
4421 : : }
4422 : :
4423 : : /* Calculate number of iterations required to make the vector version
4424 : : profitable, relative to the loop bodies only. The following condition
4425 : : must hold true:
4426 : : SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4427 : : where
4428 : : SIC = scalar iteration cost, VIC = vector iteration cost,
4429 : : VOC = vector outside cost, VF = vectorization factor,
4430 : : NPEEL = prologue iterations + epilogue iterations,
4431 : : SOC = scalar outside cost for run time cost model check. */
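 : :      A hypothetical worked example with assumed costs: SIC = 4, VIC = 6,
 : :      VF = 4, NPEEL = 3, VOC = 20, SOC = 0.  At niters = 8 the scalar side
 : :      costs 4 * 8 = 32 against 6 * ((8 - 3) / 4) + 20 = 26 for the vector
 : :      side, so the condition holds; at niters = 4 it is 16 against 20 and
 : :      the vector version does not pay off.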
4432 : :
4433 : 81596 : int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4434 : 81596 : - vec_inside_cost);
4435 : 81596 : if (saving_per_viter <= 0)
4436 : : {
4437 : 25385 : if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4438 : 0 : warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4439 : : "vectorization did not happen for a simd loop");
4440 : :
4441 : 25385 : if (dump_enabled_p ())
4442 : 22 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4443 : : "cost model: the vector iteration cost = %d "
4444 : : "divided by the scalar iteration cost = %d "
4445 : : "is greater or equal to the vectorization factor = %d"
4446 : : ".\n",
4447 : : vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4448 : 25385 : *ret_min_profitable_niters = -1;
4449 : 25385 : *ret_min_profitable_estimate = -1;
4450 : 25385 : return;
4451 : : }
4452 : :
4453 : : /* ??? The "if" arm is written to handle all cases; see below for what
4454 : : we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4455 : 56211 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4456 : : {
4457 : : /* Rewriting the condition above in terms of the number of
4458 : : vector iterations (vniters) rather than the number of
4459 : : scalar iterations (niters) gives:
4460 : :
4461 : : SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4462 : :
4463 : : <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4464 : :
4465 : : For integer N, X and Y when X > 0:
4466 : :
4467 : : N * X > Y <==> N >= (Y /[floor] X) + 1. */
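 : :       /* For instance (illustration only): with X = 5 and Y = 12 the
 : : 	 smallest N satisfying N * 5 > 12 is N = 3, matching
 : : 	 (12 /[floor] 5) + 1 = 2 + 1 = 3.  */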
4468 : 11 : int outside_overhead = (vec_outside_cost
4469 : 11 : - scalar_single_iter_cost * peel_iters_prologue
4470 : 11 : - scalar_single_iter_cost * peel_iters_epilogue
4471 : : - scalar_outside_cost);
4472 : : /* We're only interested in cases that require at least one
4473 : : vector iteration. */
4474 : 11 : int min_vec_niters = 1;
4475 : 11 : if (outside_overhead > 0)
4476 : 8 : min_vec_niters = outside_overhead / saving_per_viter + 1;
4477 : :
4478 : 11 : if (dump_enabled_p ())
4479 : 3 : dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
4480 : : min_vec_niters);
4481 : :
4482 : 11 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4483 : : {
4484 : : /* Now that we know the minimum number of vector iterations,
4485 : : find the minimum niters for which the scalar cost is larger:
4486 : :
4487 : : SIC * niters > VIC * vniters + VOC - SOC
4488 : :
4489 : : We know that the minimum niters is no more than
4490 : : vniters * VF + NPEEL, but it might be (and often is) less
4491 : : than that if a partial vector iteration is cheaper than the
4492 : : equivalent scalar code. */
4493 : 11 : int threshold = (vec_inside_cost * min_vec_niters
4494 : 11 : + vec_outside_cost
4495 : 11 : - scalar_outside_cost);
4496 : 11 : if (threshold <= 0)
4497 : : min_profitable_iters = 1;
4498 : : else
4499 : 11 : min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4500 : : }
4501 : : else
4502 : : /* Convert the number of vector iterations into a number of
4503 : : scalar iterations. */
4504 : 0 : min_profitable_iters = (min_vec_niters * assumed_vf
4505 : 0 : + peel_iters_prologue
4506 : : + peel_iters_epilogue);
4507 : : }
4508 : : else
4509 : : {
4510 : 56200 : min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4511 : 56200 : * assumed_vf
4512 : 56200 : - vec_inside_cost * peel_iters_prologue
4513 : 56200 : - vec_inside_cost * peel_iters_epilogue);
4514 : 56200 : if (min_profitable_iters <= 0)
4515 : : min_profitable_iters = 0;
4516 : : else
4517 : : {
4518 : 47284 : min_profitable_iters /= saving_per_viter;
4519 : :
4520 : 47284 : if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4521 : 47284 : <= (((int) vec_inside_cost * min_profitable_iters)
4522 : 47284 : + (((int) vec_outside_cost - scalar_outside_cost)
4523 : : * assumed_vf)))
4524 : 47284 : min_profitable_iters++;
4525 : : }
4526 : : }
4527 : :
4528 : 56211 : if (dump_enabled_p ())
4529 : 592 : dump_printf (MSG_NOTE,
4530 : : " Calculated minimum iters for profitability: %d\n",
4531 : : min_profitable_iters);
4532 : :
4533 : 56211 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4534 : 56200 : && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4535 : : /* We want the vectorized loop to execute at least once. */
4536 : : min_profitable_iters = assumed_vf + peel_iters_prologue;
4537 : 10105 : else if (min_profitable_iters < peel_iters_prologue)
4538 : : /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4539 : : vectorized loop executes at least once. */
4540 : : min_profitable_iters = peel_iters_prologue;
4541 : :
4542 : 56211 : if (dump_enabled_p ())
4543 : 592 : dump_printf_loc (MSG_NOTE, vect_location,
4544 : : " Runtime profitability threshold = %d\n",
4545 : : min_profitable_iters);
4546 : :
4547 : 56211 : *ret_min_profitable_niters = min_profitable_iters;
4548 : :
4549 : : /* Calculate number of iterations required to make the vector version
4550 : : profitable, relative to the loop bodies only.
4551 : :
4552 : :      The non-vectorized variant costs SIC * niters and it must win over the
4553 : :      vector variant at the expected loop trip count.  The following condition must hold true:
4554 : : SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
4555 : :
4556 : 56211 : if (vec_outside_cost <= 0)
4557 : : min_profitable_estimate = 0;
4558 : : /* ??? This "else if" arm is written to handle all cases; see below for
4559 : : what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4560 : 50894 : else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4561 : : {
4562 : : /* This is a repeat of the code above, but with + SOC rather
4563 : : than - SOC. */
4564 : 11 : int outside_overhead = (vec_outside_cost
4565 : 11 : - scalar_single_iter_cost * peel_iters_prologue
4566 : 11 : - scalar_single_iter_cost * peel_iters_epilogue
4567 : : + scalar_outside_cost);
4568 : 11 : int min_vec_niters = 1;
4569 : 11 : if (outside_overhead > 0)
4570 : 11 : min_vec_niters = outside_overhead / saving_per_viter + 1;
4571 : :
4572 : 11 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4573 : : {
4574 : 11 : int threshold = (vec_inside_cost * min_vec_niters
4575 : 11 : + vec_outside_cost
4576 : 11 : + scalar_outside_cost);
4577 : 11 : min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4578 : : }
4579 : : else
4580 : : min_profitable_estimate = (min_vec_niters * assumed_vf
4581 : : + peel_iters_prologue
4582 : : + peel_iters_epilogue);
4583 : : }
4584 : : else
4585 : : {
4586 : 50883 : min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4587 : 50883 : * assumed_vf
4588 : 50883 : - vec_inside_cost * peel_iters_prologue
4589 : 50883 : - vec_inside_cost * peel_iters_epilogue)
4590 : 50883 : / ((scalar_single_iter_cost * assumed_vf)
4591 : : - vec_inside_cost);
4592 : : }
4593 : 56211 : min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4594 : 56211 : if (dump_enabled_p ())
4595 : 592 : dump_printf_loc (MSG_NOTE, vect_location,
4596 : : " Static estimate profitability threshold = %d\n",
4597 : : min_profitable_estimate);
4598 : :
4599 : 56211 : *ret_min_profitable_estimate = min_profitable_estimate;
4600 : : }
4601 : :
4602 : : /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4603 : : vector elements (not bits) for a vector with NELT elements. */
4604 : : static void
4605 : 2049 : calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4606 : : vec_perm_builder *sel)
4607 : : {
4608 : : /* The encoding is a single stepped pattern. Any wrap-around is handled
4609 : : by vec_perm_indices. */
4610 : 2049 : sel->new_vector (nelt, 1, 3);
4611 : 8196 : for (unsigned int i = 0; i < 3; i++)
4612 : 6147 : sel->quick_push (i + offset);
4613 : 2049 : }
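 : : /* As a hypothetical illustration: for OFFSET = 2 and NELT = 8 the three
 : :    encoded elements are {2, 3, 4}, which vec_perm_indices extends to the
 : :    series {2, 3, ..., 9}; indices 8 and 9 select elements from the second
 : :    input of the two-input permute set up by the caller.  */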
4614 : :
4615 : : /* Checks whether the target supports whole-vector shifts for vectors of mode
4616 : : MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4617 : : it supports vec_perm_const with masks for all necessary shift amounts. */
4618 : : static bool
4619 : 7158 : have_whole_vector_shift (machine_mode mode)
4620 : : {
4621 : 7158 : if (can_implement_p (vec_shr_optab, mode))
4622 : : return true;
4623 : :
4624 : : /* Variable-length vectors should be handled via the optab. */
4625 : 55 : unsigned int nelt;
4626 : 110 : if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4627 : : return false;
4628 : :
4629 : 55 : vec_perm_builder sel;
4630 : 55 : vec_perm_indices indices;
4631 : 285 : for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4632 : : {
4633 : 230 : calc_vec_perm_mask_for_shift (i, nelt, &sel);
4634 : 230 : indices.new_vector (sel, 2, nelt);
4635 : 230 : if (!can_vec_perm_const_p (mode, mode, indices, false))
4636 : : return false;
4637 : : }
4638 : : return true;
4639 : 55 : }
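 : : /* For example (illustration only): with a constant NELT of 8 the loop
 : :    above checks shift amounts of 4, 2 and 1 elements, the only amounts
 : :    needed when the reduction epilogue halves the vector at each step.  */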
4640 : :
4641 : : /* Return true if (a) the representative stmt of SLP_NODE is a DOT_PROD_EXPR
4642 : :    reduction whose multiplication operands have differing signs and (b) we
4643 : :    intend to emulate the operation using a series of signed DOT_PROD_EXPRs.
4644 : : See vect_emulate_mixed_dot_prod for the actual sequence used. */
4645 : :
4646 : : static bool
4647 : 1961 : vect_is_emulated_mixed_dot_prod (slp_tree slp_node)
4648 : : {
4649 : 1961 : stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (slp_node);
4650 : 1961 : gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
4651 : 1674 : if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
4652 : : return false;
4653 : :
4654 : 557 : tree rhs1 = gimple_assign_rhs1 (assign);
4655 : 557 : tree rhs2 = gimple_assign_rhs2 (assign);
4656 : 557 : if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
4657 : : return false;
4658 : :
4659 : 399 : return !directly_supported_p (DOT_PROD_EXPR,
4660 : : SLP_TREE_VECTYPE (slp_node),
4661 : 133 : SLP_TREE_VECTYPE
4662 : : (SLP_TREE_CHILDREN (slp_node)[0]),
4663 : 133 : optab_vector_mixed_sign);
4664 : : }
4665 : :
4666 : : /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4667 : : functions. Design better to avoid maintenance issues. */
4668 : :
4669 : : /* Function vect_model_reduction_cost.
4670 : :
4671 : : Models cost for a reduction operation, including the vector ops
4672 : : generated within the strip-mine loop in some cases, the initial
4673 : : definition before the loop, and the epilogue code that must be generated. */
4674 : :
4675 : : static void
4676 : 43234 : vect_model_reduction_cost (loop_vec_info loop_vinfo,
4677 : : slp_tree node, internal_fn reduc_fn,
4678 : : vect_reduction_type reduction_type,
4679 : : int ncopies, stmt_vector_for_cost *cost_vec)
4680 : : {
4681 : 43234 : int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
4682 : 43234 : tree vectype;
4683 : 43234 : machine_mode mode;
4684 : 43234 : class loop *loop = NULL;
4685 : :
4686 : 43234 : if (loop_vinfo)
4687 : 43234 : loop = LOOP_VINFO_LOOP (loop_vinfo);
4688 : :
4689 : : /* Condition reductions generate two reductions in the loop. */
4690 : 43234 : if (reduction_type == COND_REDUCTION)
4691 : 215 : ncopies *= 2;
4692 : :
4693 : 43234 : vectype = SLP_TREE_VECTYPE (node);
4694 : 43234 : mode = TYPE_MODE (vectype);
4695 : 43234 : stmt_vec_info orig_stmt_info
4696 : 43234 : = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
4697 : :
4698 : 43234 : gimple_match_op op;
4699 : 43234 : if (!gimple_extract_op (orig_stmt_info->stmt, &op))
4700 : 0 : gcc_unreachable ();
4701 : :
4702 : 43234 : if (reduction_type == EXTRACT_LAST_REDUCTION)
4703 : : /* No extra instructions are needed in the prologue. The loop body
4704 : : operations are costed in vectorizable_condition. */
4705 : : inside_cost = 0;
4706 : 43234 : else if (reduction_type == FOLD_LEFT_REDUCTION)
4707 : : {
4708 : : /* No extra instructions needed in the prologue. */
4709 : 4167 : prologue_cost = 0;
4710 : :
4711 : 4167 : if (reduc_fn != IFN_LAST)
4712 : : /* Count one reduction-like operation per vector. */
4713 : 0 : inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4714 : : node, 0, vect_body);
4715 : : else
4716 : : {
4717 : : /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4718 : 4167 : unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4719 : 4167 : inside_cost = record_stmt_cost (cost_vec, nelements,
4720 : : vec_to_scalar, node, 0,
4721 : : vect_body);
4722 : 4167 : inside_cost += record_stmt_cost (cost_vec, nelements,
4723 : : scalar_stmt, node, 0,
4724 : : vect_body);
4725 : : }
4726 : : }
4727 : : else
4728 : : {
4729 : : /* Add in the cost of the initial definitions. */
4730 : 39067 : int prologue_stmts;
4731 : 39067 : if (reduction_type == COND_REDUCTION)
4732 : : /* For cond reductions we have four vectors: initial index, step,
4733 : : initial result of the data reduction, initial value of the index
4734 : : reduction. */
4735 : : prologue_stmts = 4;
4736 : : else
4737 : : /* We need the initial reduction value. */
4738 : 38852 : prologue_stmts = 1;
4739 : 39067 : prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4740 : : scalar_to_vec, node, 0,
4741 : : vect_prologue);
4742 : : }
4743 : :
4744 : : /* Determine cost of epilogue code.
4745 : :
4746 : : We have a reduction operator that will reduce the vector in one statement.
4747 : : Also requires scalar extract. */
4748 : :
4749 : 43234 : if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4750 : : {
4751 : 43099 : if (reduc_fn != IFN_LAST)
4752 : : {
4753 : 31838 : if (reduction_type == COND_REDUCTION)
4754 : : {
4755 : : 	  /* An EQ stmt and a COND_EXPR stmt.  */
4756 : 7 : epilogue_cost += record_stmt_cost (cost_vec, 2,
4757 : : vector_stmt, node, 0,
4758 : : vect_epilogue);
4759 : : /* Reduction of the max index and a reduction of the found
4760 : : values. */
4761 : 7 : epilogue_cost += record_stmt_cost (cost_vec, 2,
4762 : : vec_to_scalar, node, 0,
4763 : : vect_epilogue);
4764 : : /* A broadcast of the max value. */
4765 : 7 : epilogue_cost += record_stmt_cost (cost_vec, 1,
4766 : : scalar_to_vec, node, 0,
4767 : : vect_epilogue);
4768 : : }
4769 : : else
4770 : : {
4771 : 31831 : epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4772 : : node, 0, vect_epilogue);
4773 : 31831 : epilogue_cost += record_stmt_cost (cost_vec, 1,
4774 : : vec_to_scalar, node, 0,
4775 : : vect_epilogue);
4776 : : }
4777 : : }
4778 : 11261 : else if (reduction_type == COND_REDUCTION)
4779 : : {
4780 : 208 : unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4781 : : /* Extraction of scalar elements. */
4782 : 416 : epilogue_cost += record_stmt_cost (cost_vec,
4783 : 208 : 2 * estimated_nunits,
4784 : : vec_to_scalar, node, 0,
4785 : : vect_epilogue);
4786 : : /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4787 : 208 : epilogue_cost += record_stmt_cost (cost_vec,
4788 : 208 : 2 * estimated_nunits - 3,
4789 : : scalar_stmt, node, 0,
4790 : : vect_epilogue);
4791 : : }
4792 : 11053 : else if (reduction_type == EXTRACT_LAST_REDUCTION
4793 : 11053 : || reduction_type == FOLD_LEFT_REDUCTION)
4794 : : 	/* No extra instructions are needed in the epilogue.  */
4795 : : ;
4796 : : else
4797 : : {
4798 : 6886 : int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4799 : 6886 : tree bitsize = TYPE_SIZE (op.type);
4800 : 6886 : int element_bitsize = tree_to_uhwi (bitsize);
4801 : 6886 : int nelements = vec_size_in_bits / element_bitsize;
4802 : :
4803 : 6886 : if (op.code == COND_EXPR)
4804 : 28 : op.code = MAX_EXPR;
4805 : :
4806 : : /* We have a whole vector shift available. */
4807 : 841 : if (VECTOR_MODE_P (mode)
4808 : 6886 : && directly_supported_p (op.code, vectype)
4809 : 12309 : && have_whole_vector_shift (mode))
4810 : : {
4811 : : /* Final reduction via vector shifts and the reduction operator.
4812 : : Also requires scalar extract. */
4813 : 16269 : epilogue_cost += record_stmt_cost (cost_vec,
4814 : 10846 : exact_log2 (nelements) * 2,
4815 : : vector_stmt, node, 0,
4816 : : vect_epilogue);
4817 : 5423 : epilogue_cost += record_stmt_cost (cost_vec, 1,
4818 : : vec_to_scalar, node, 0,
4819 : : vect_epilogue);
4820 : : }
4821 : : else
4822 : : /* Use extracts and reduction op for final reduction. For N
4823 : : elements, we have N extracts and N-1 reduction ops. */
4824 : 1463 : epilogue_cost += record_stmt_cost (cost_vec,
4825 : 1463 : nelements + nelements - 1,
4826 : : vector_stmt, node, 0,
4827 : : vect_epilogue);
4828 : : }
4829 : : }
4830 : :
4831 : 43234 : if (dump_enabled_p ())
4832 : 2571 : dump_printf (MSG_NOTE,
4833 : : "vect_model_reduction_cost: inside_cost = %d, "
4834 : : "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4835 : : prologue_cost, epilogue_cost);
4836 : 43234 : }
4837 : :
4838 : : /* SEQ is a sequence of instructions that initialize the reduction
4839 : : described by REDUC_INFO. Emit them in the appropriate place. */
4840 : :
4841 : : static void
4842 : 419 : vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
4843 : : vect_reduc_info reduc_info, gimple *seq)
4844 : : {
4845 : 419 : if (VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info))
4846 : : {
4847 : : /* When reusing an accumulator from the main loop, we only need
4848 : : initialization instructions if the main loop can be skipped.
4849 : : In that case, emit the initialization instructions at the end
4850 : : of the guard block that does the skip. */
4851 : 23 : edge skip_edge = loop_vinfo->skip_main_loop_edge;
4852 : 23 : gcc_assert (skip_edge);
4853 : 23 : gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
4854 : 23 : gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
4855 : : }
4856 : : else
4857 : : {
4858 : : /* The normal case: emit the initialization instructions on the
4859 : : preheader edge. */
4860 : 396 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4861 : 396 : gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
4862 : : }
4863 : 419 : }
4864 : :
4865 : : /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
4866 : : which performs a reduction involving GROUP_SIZE scalar statements.
4867 : : NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
4868 : : is nonnull, introducing extra elements of that value will not change the
4869 : : result. */
4870 : :
4871 : : static void
4872 : 21071 : get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
4873 : : vect_reduc_info reduc_info,
4874 : : tree vector_type,
4875 : : vec<tree> *vec_oprnds,
4876 : : unsigned int number_of_vectors,
4877 : : unsigned int group_size, tree neutral_op)
4878 : : {
4879 : 21071 : vec<tree> &initial_values = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info);
4880 : 21071 : unsigned HOST_WIDE_INT nunits;
4881 : 21071 : unsigned j, number_of_places_left_in_vector;
4882 : 21071 : unsigned int i;
4883 : :
4884 : 42142 : gcc_assert (group_size == initial_values.length () || neutral_op);
4885 : :
4886 : : /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4887 : : created vectors. It is greater than 1 if unrolling is performed.
4888 : :
4889 : : For example, we have two scalar operands, s1 and s2 (e.g., group of
4890 : : strided accesses of size two), while NUNITS is four (i.e., four scalars
4891 : : of this type can be packed in a vector). The output vector will contain
4892 : : two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4893 : : will be 2).
4894 : :
4895 : : If GROUP_SIZE > NUNITS, the scalars will be split into several
4896 : : vectors containing the operands.
4897 : :
4898 : : For example, NUNITS is four as before, and the group size is 8
4899 : : (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4900 : : {s5, s6, s7, s8}. */
4901 : :
4902 : 21071 : if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4903 : : nunits = group_size;
4904 : :
4905 : 21071 : number_of_places_left_in_vector = nunits;
4906 : 21071 : bool constant_p = true;
4907 : 21071 : tree_vector_builder elts (vector_type, nunits, 1);
4908 : 21071 : elts.quick_grow (nunits);
4909 : 21071 : gimple_seq ctor_seq = NULL;
4910 : 21071 : if (neutral_op
4911 : 42057 : && !useless_type_conversion_p (TREE_TYPE (vector_type),
4912 : 20986 : TREE_TYPE (neutral_op)))
4913 : 1 : neutral_op = gimple_convert (&ctor_seq,
4914 : 1 : TREE_TYPE (vector_type),
4915 : : neutral_op);
4916 : 211019 : for (j = 0; j < nunits * number_of_vectors; ++j)
4917 : : {
4918 : 189948 : tree op;
4919 : 189948 : i = j % group_size;
4920 : :
4921 : :       /* Get the def before the loop.  In a reduction chain we have only
4922 : : 	 one initial value; otherwise we have as many as there are PHIs in the group.  */
4923 : 189948 : if (i >= initial_values.length () || (j > i && neutral_op))
4924 : : op = neutral_op;
4925 : : else
4926 : : {
4927 : 43466 : if (!useless_type_conversion_p (TREE_TYPE (vector_type),
4928 : 21733 : TREE_TYPE (initial_values[i])))
4929 : 46 : initial_values[i] = gimple_convert (&ctor_seq,
4930 : 23 : TREE_TYPE (vector_type),
4931 : 23 : initial_values[i]);
4932 : 21733 : op = initial_values[i];
4933 : : }
4934 : :
4935 : : /* Create 'vect_ = {op0,op1,...,opn}'. */
4936 : 189948 : number_of_places_left_in_vector--;
4937 : 189948 : elts[nunits - number_of_places_left_in_vector - 1] = op;
4938 : 189948 : if (!CONSTANT_CLASS_P (op))
4939 : 2257 : constant_p = false;
4940 : :
4941 : 189948 : if (number_of_places_left_in_vector == 0)
4942 : : {
4943 : 24332 : tree init;
4944 : 48664 : if (constant_p && !neutral_op
4945 : 48613 : ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4946 : 24332 : : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4947 : : /* Build the vector directly from ELTS. */
4948 : 24332 : init = gimple_build_vector (&ctor_seq, &elts);
4949 : 0 : else if (neutral_op)
4950 : : {
4951 : : /* Build a vector of the neutral value and shift the
4952 : : other elements into place. */
4953 : 0 : init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4954 : : neutral_op);
4955 : 0 : int k = nunits;
4956 : 0 : while (k > 0 && operand_equal_p (elts[k - 1], neutral_op))
4957 : : k -= 1;
4958 : 0 : while (k > 0)
4959 : : {
4960 : 0 : k -= 1;
4961 : 0 : init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4962 : 0 : vector_type, init, elts[k]);
4963 : : }
4964 : : }
4965 : : else
4966 : : {
4967 : : /* First time round, duplicate ELTS to fill the
4968 : : required number of vectors. */
4969 : 0 : duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
4970 : : elts, number_of_vectors, *vec_oprnds);
4971 : 0 : break;
4972 : : }
4973 : 24332 : vec_oprnds->quick_push (init);
4974 : :
4975 : 24332 : number_of_places_left_in_vector = nunits;
4976 : 24332 : elts.new_vector (vector_type, nunits, 1);
4977 : 24332 : elts.quick_grow (nunits);
4978 : 24332 : constant_p = true;
4979 : : }
4980 : : }
4981 : 21071 : if (ctor_seq != NULL)
4982 : 419 : vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
4983 : 21071 : }
4984 : :
4985 : : vect_reduc_info
4986 : 128855 : info_for_reduction (loop_vec_info loop_vinfo, slp_tree node)
4987 : : {
4988 : 128855 : if (node->cycle_info.id == -1)
4989 : : return NULL;
4990 : 127099 : return loop_vinfo->reduc_infos[node->cycle_info.id];
4991 : : }
4992 : :
4993 : : /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
4994 : : REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
4995 : : return false. */
4996 : :
4997 : : static bool
4998 : 18850 : vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
4999 : : vect_reduc_info reduc_info, tree vectype)
5000 : : {
5001 : 18850 : loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5002 : 18850 : if (!main_loop_vinfo)
5003 : : return false;
5004 : :
5005 : 4050 : if (VECT_REDUC_INFO_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5006 : : return false;
5007 : :
5008 : 4033 : unsigned int num_phis = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info).length ();
5009 : 4033 : auto_vec<tree, 16> main_loop_results (num_phis);
5010 : 4033 : auto_vec<tree, 16> initial_values (num_phis);
5011 : 4033 : if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5012 : : {
5013 : : /* The epilogue loop can be entered either from the main loop or
5014 : : from an earlier guard block. */
5015 : 3846 : edge skip_edge = loop_vinfo->skip_main_loop_edge;
5016 : 15404 : for (tree incoming_value : VECT_REDUC_INFO_INITIAL_VALUES (reduc_info))
5017 : : {
5018 : : /* Look for:
5019 : :
5020 : : INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5021 : : INITIAL_VALUE(guard block)>. */
5022 : 3866 : gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5023 : :
5024 : 3866 : gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5025 : 3866 : gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5026 : :
5027 : 3866 : tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5028 : 3866 : tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5029 : :
5030 : 3866 : main_loop_results.quick_push (from_main_loop);
5031 : 3866 : initial_values.quick_push (from_skip);
5032 : : }
5033 : : }
5034 : : else
5035 : : /* The main loop dominates the epilogue loop. */
5036 : 187 : main_loop_results.splice (VECT_REDUC_INFO_INITIAL_VALUES (reduc_info));
5037 : :
5038 : : /* See if the main loop has the kind of accumulator we need. */
5039 : 4033 : vect_reusable_accumulator *accumulator
5040 : 4033 : = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5041 : 4033 : if (!accumulator
5042 : 8052 : || num_phis != VECT_REDUC_INFO_SCALAR_RESULTS (accumulator->reduc_info).length ()
5043 : 12085 : || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5044 : : VECT_REDUC_INFO_SCALAR_RESULTS (accumulator->reduc_info).begin ()))
5045 : : return false;
5046 : :
5047 : : /* Handle the case where we can reduce wider vectors to narrower ones. */
5048 : 4022 : tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5049 : 4022 : unsigned HOST_WIDE_INT m;
5050 : 4022 : if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5051 : 4022 : TYPE_VECTOR_SUBPARTS (vectype), &m))
5052 : 0 : return false;
5053 : : /* Check the intermediate vector types and operations are available. */
5054 : 4022 : tree prev_vectype = old_vectype;
5055 : 4022 : poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5056 : 11546 : while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5057 : : {
5058 : 4022 : intermediate_nunits = exact_div (intermediate_nunits, 2);
5059 : 4022 : tree intermediate_vectype = get_related_vectype_for_scalar_type
5060 : 4022 : (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5061 : 4022 : if (!intermediate_vectype
5062 : 4022 : || !directly_supported_p (VECT_REDUC_INFO_CODE (reduc_info),
5063 : : intermediate_vectype)
5064 : 7524 : || !can_vec_extract (TYPE_MODE (prev_vectype),
5065 : 3502 : TYPE_MODE (intermediate_vectype)))
5066 : : return false;
5067 : : prev_vectype = intermediate_vectype;
5068 : : }
5069 : :
5070 : : /* Non-SLP reductions might apply an adjustment after the reduction
5071 : : operation, in order to simplify the initialization of the accumulator.
5072 : : If the epilogue loop carries on from where the main loop left off,
5073 : : it should apply the same adjustment to the final reduction result.
5074 : :
5075 : : If the epilogue loop can also be entered directly (rather than via
5076 : : the main loop), we need to be able to handle that case in the same way,
5077 : : with the same adjustment. (In principle we could add a PHI node
5078 : : to select the correct adjustment, but in practice that shouldn't be
5079 : : necessary.) */
5080 : 3502 : tree main_adjustment
5081 : 3502 : = VECT_REDUC_INFO_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5082 : 3502 : if (loop_vinfo->main_loop_edge && main_adjustment)
5083 : : {
5084 : 2912 : gcc_assert (num_phis == 1);
5085 : 2912 : tree initial_value = initial_values[0];
5086 : : /* Check that we can use INITIAL_VALUE as the adjustment and
5087 : : initialize the accumulator with a neutral value instead. */
5088 : 2912 : if (!operand_equal_p (initial_value, main_adjustment))
5089 : 106 : return false;
5090 : 2806 : code_helper code = VECT_REDUC_INFO_CODE (reduc_info);
5091 : 2806 : initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5092 : : code, initial_value);
5093 : : }
5094 : 3396 : VECT_REDUC_INFO_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5095 : 3396 : VECT_REDUC_INFO_INITIAL_VALUES (reduc_info).truncate (0);
5096 : 3396 : VECT_REDUC_INFO_INITIAL_VALUES (reduc_info).splice (initial_values);
5097 : 3396 : VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info) = accumulator;
5098 : 3396 : return true;
5099 : 4033 : }
5100 : :
5101 : : /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5102 : : CODE emitting stmts before GSI. Returns a vector def of VECTYPE. */
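 : : /* A hypothetical illustration: reducing a V8SI VEC_DEF to a V4SI VECTYPE
 : :    with PLUS_EXPR extracts the low and high V4SI halves (either via
 : :    vec_extract or via an integer-mode VIEW_CONVERT_EXPR punning) and adds
 : :    them; a V16SI input would need two such halving steps.  */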
5103 : :
5104 : : static tree
5105 : 5129 : vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5106 : : gimple_seq *seq)
5107 : : {
5108 : 5129 : unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5109 : 5129 : unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5110 : 5129 : tree stype = TREE_TYPE (vectype);
5111 : 5129 : tree new_temp = vec_def;
5112 : 8564 : while (nunits > nunits1)
5113 : : {
5114 : 3435 : nunits /= 2;
5115 : 3435 : tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5116 : 3435 : stype, nunits);
5117 : 3435 : unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5118 : :
5119 : : /* The target has to make sure we support lowpart/highpart
5120 : : extraction, either via direct vector extract or through
5121 : : 	 integer mode punning.  */
5122 : 3435 : tree dst1, dst2;
5123 : 3435 : gimple *epilog_stmt;
5124 : 3435 : if (convert_optab_handler (vec_extract_optab,
5125 : 3435 : TYPE_MODE (TREE_TYPE (new_temp)),
5126 : 3435 : TYPE_MODE (vectype1))
5127 : : != CODE_FOR_nothing)
5128 : : {
5129 : : /* Extract sub-vectors directly once vec_extract becomes
5130 : : a conversion optab. */
5131 : 2104 : dst1 = make_ssa_name (vectype1);
5132 : 2104 : epilog_stmt
5133 : 4208 : = gimple_build_assign (dst1, BIT_FIELD_REF,
5134 : : build3 (BIT_FIELD_REF, vectype1,
5135 : 2104 : new_temp, TYPE_SIZE (vectype1),
5136 : : bitsize_int (0)));
5137 : 2104 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5138 : 2104 : dst2 = make_ssa_name (vectype1);
5139 : 2104 : epilog_stmt
5140 : 2104 : = gimple_build_assign (dst2, BIT_FIELD_REF,
5141 : : build3 (BIT_FIELD_REF, vectype1,
5142 : 2104 : new_temp, TYPE_SIZE (vectype1),
5143 : 2104 : bitsize_int (bitsize)));
5144 : 2104 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5145 : : }
5146 : : else
5147 : : {
5148 : : /* Extract via punning to appropriately sized integer mode
5149 : : vector. */
5150 : 1331 : tree eltype = build_nonstandard_integer_type (bitsize, 1);
5151 : 1331 : tree etype = build_vector_type (eltype, 2);
5152 : 2662 : gcc_assert (convert_optab_handler (vec_extract_optab,
5153 : : TYPE_MODE (etype),
5154 : : TYPE_MODE (eltype))
5155 : : != CODE_FOR_nothing);
5156 : 1331 : tree tem = make_ssa_name (etype);
5157 : 1331 : epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5158 : : build1 (VIEW_CONVERT_EXPR,
5159 : : etype, new_temp));
5160 : 1331 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5161 : 1331 : new_temp = tem;
5162 : 1331 : tem = make_ssa_name (eltype);
5163 : 1331 : epilog_stmt
5164 : 2662 : = gimple_build_assign (tem, BIT_FIELD_REF,
5165 : : build3 (BIT_FIELD_REF, eltype,
5166 : 1331 : new_temp, TYPE_SIZE (eltype),
5167 : : bitsize_int (0)));
5168 : 1331 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5169 : 1331 : dst1 = make_ssa_name (vectype1);
5170 : 1331 : epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5171 : : build1 (VIEW_CONVERT_EXPR,
5172 : : vectype1, tem));
5173 : 1331 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5174 : 1331 : tem = make_ssa_name (eltype);
5175 : 1331 : epilog_stmt
5176 : 1331 : = gimple_build_assign (tem, BIT_FIELD_REF,
5177 : : build3 (BIT_FIELD_REF, eltype,
5178 : 1331 : new_temp, TYPE_SIZE (eltype),
5179 : 1331 : bitsize_int (bitsize)));
5180 : 1331 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5181 : 1331 : dst2 = make_ssa_name (vectype1);
5182 : 1331 : epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5183 : : build1 (VIEW_CONVERT_EXPR,
5184 : : vectype1, tem));
5185 : 1331 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5186 : : }
5187 : :
5188 : 3435 : new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5189 : : }
5190 : :
5191 : 5129 : return new_temp;
5192 : : }
5193 : :
5194 : : /* Function vect_create_epilog_for_reduction
5195 : :
5196 : : Create code at the loop-epilog to finalize the result of a reduction
5197 : : computation.
5198 : :
5199 : : STMT_INFO is the scalar reduction stmt that is being vectorized.
5200 : : SLP_NODE is an SLP node containing a group of reduction statements. The
5201 : : first one in this group is STMT_INFO.
5202 : : SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5203 : : REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5204 : : (counting from 0)
5205 : : LOOP_EXIT is the edge to update in the merge block. In the case of a single
5206 : : exit this edge is always the main loop exit.
5207 : :
5208 : : This function:
5209 : : 1. Completes the reduction def-use cycles.
5210 : : 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5211 : : by calling the function specified by REDUC_FN if available, or by
5212 : : other means (whole-vector shifts or a scalar loop).
5213 : : The function also creates a new phi node at the loop exit to preserve
5214 : : loop-closed form, as illustrated below.
5215 : :
5216 : : The flow at the entry to this function:
5217 : :
5218 : : loop:
5219 : : vec_def = phi <vec_init, null> # REDUCTION_PHI
5220 : : VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5221 : : s_loop = scalar_stmt # (scalar) STMT_INFO
5222 : : loop_exit:
5223 : : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5224 : : use <s_out0>
5225 : : use <s_out0>
5226 : :
5227 : : The above is transformed by this function into:
5228 : :
5229 : : loop:
5230 : : vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5231 : : VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5232 : : s_loop = scalar_stmt # (scalar) STMT_INFO
5233 : : loop_exit:
5234 : : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5235 : : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5236 : : v_out2 = reduce <v_out1>
5237 : : s_out3 = extract_field <v_out2, 0>
5238 : : s_out4 = adjust_result <s_out3>
5239 : : use <s_out4>
5240 : : use <s_out4>
5241 : : */
5242 : :
5243 : : static void
5244 : 21394 : vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5245 : : stmt_vec_info stmt_info,
5246 : : slp_tree slp_node,
5247 : : slp_instance slp_node_instance,
5248 : : edge loop_exit)
5249 : : {
5250 : 21394 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
5251 : 21394 : code_helper code = VECT_REDUC_INFO_CODE (reduc_info);
5252 : 21394 : internal_fn reduc_fn = VECT_REDUC_INFO_FN (reduc_info);
5253 : 21394 : tree vectype;
5254 : 21394 : machine_mode mode;
5255 : 21394 : basic_block exit_bb;
5256 : 21394 : gimple *new_phi = NULL, *phi = NULL;
5257 : 21394 : gimple_stmt_iterator exit_gsi;
5258 : 21394 : tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5259 : 21394 : gimple *epilog_stmt = NULL;
5260 : 21394 : gimple *exit_phi;
5261 : 21394 : tree def;
5262 : 21394 : tree orig_name, scalar_result;
5263 : 21394 : imm_use_iterator imm_iter;
5264 : 21394 : use_operand_p use_p;
5265 : 21394 : gimple *use_stmt;
5266 : 21394 : auto_vec<tree> reduc_inputs;
5267 : 21394 : int j, i;
5268 : 21394 : vec<tree> &scalar_results = VECT_REDUC_INFO_SCALAR_RESULTS (reduc_info);
5269 : 21394 : unsigned int k;
5270 : : /* SLP reduction without reduction chain, e.g.,
5271 : : # a1 = phi <a2, a0>
5272 : : # b1 = phi <b2, b0>
5273 : : a2 = operation (a1)
5274 : : b2 = operation (b1) */
5275 : 21394 : const bool slp_reduc = !reduc_info->is_reduc_chain;
5276 : 21394 : tree induction_index = NULL_TREE;
5277 : :
5278 : 21394 : unsigned int group_size = SLP_TREE_LANES (slp_node);
5279 : :
5280 : 21394 : bool double_reduc = false;
5281 : 21394 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5282 : 21394 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5283 : : {
5284 : 0 : double_reduc = true;
5285 : 0 : gcc_assert (slp_reduc);
5286 : : }
5287 : :
5288 : 21394 : vectype = VECT_REDUC_INFO_VECTYPE (reduc_info);
5289 : 21394 : gcc_assert (vectype);
5290 : 21394 : mode = TYPE_MODE (vectype);
5291 : :
5292 : 21394 : tree induc_val = NULL_TREE;
5293 : 21394 : tree adjustment_def = NULL;
5294 : : /* Optimize: for induction condition reduction, if we can't use zero
5295 : : for induc_val, use initial_def. */
5296 : 21394 : if (VECT_REDUC_INFO_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5297 : 66 : induc_val = VECT_REDUC_INFO_INDUC_COND_INITIAL_VAL (reduc_info);
5298 : 21328 : else if (double_reduc)
5299 : : ;
5300 : : else
5301 : 21328 : adjustment_def = VECT_REDUC_INFO_EPILOGUE_ADJUSTMENT (reduc_info);
5302 : :
5303 : 21394 : stmt_vec_info single_live_out_stmt[] = { stmt_info };
5304 : 21394 : array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5305 : 21394 : if (slp_reduc)
5306 : : /* All statements produce live-out values. */
5307 : 38640 : live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5308 : :
5309 : 21394 : unsigned vec_num
5310 : 21394 : = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length ();
5311 : :
5312 : : /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5313 : : which is updated with the current index of the loop for every match of
5314 : : the original loop's cond_expr (VEC_STMT). This results in a vector
5315 : : containing the last time the condition passed for that vector lane.
5316 : : The first match will be a 1 to allow 0 to be used for non-matching
5317 : : indexes. If there are no matches at all then the vector will be all
5318 : : zeroes.
5319 : :
5320 : : PR92772: This algorithm is broken for architectures that support
5321 : : masked vectors, but do not provide fold_extract_last. */
5322 : 21394 : if (VECT_REDUC_INFO_TYPE (reduc_info) == COND_REDUCTION)
5323 : : {
5324 : 71 : gcc_assert (!double_reduc);
5325 : 71 : auto_vec<std::pair<tree, bool>, 2> ccompares;
5326 : 71 : slp_tree cond_node = slp_node_instance->root;
5327 : 159 : while (cond_node != slp_node_instance->reduc_phis)
5328 : : {
5329 : 88 : stmt_vec_info cond_info = SLP_TREE_REPRESENTATIVE (cond_node);
5330 : 88 : if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5331 : : {
5332 : 80 : gimple *vec_stmt
5333 : 80 : = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (cond_node)[0]);
5334 : 80 : gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5335 : 80 : ccompares.safe_push
5336 : 80 : (std::make_pair (gimple_assign_rhs1 (vec_stmt),
5337 : 80 : SLP_TREE_REDUC_IDX (cond_node) == 2));
5338 : : }
5339 : 88 : int slp_reduc_idx = SLP_TREE_REDUC_IDX (cond_node);
5340 : 88 : cond_node = SLP_TREE_CHILDREN (cond_node)[slp_reduc_idx];
5341 : : }
5342 : 71 : gcc_assert (ccompares.length () != 0);
5343 : :
5344 : 71 : tree indx_before_incr, indx_after_incr;
5345 : 71 : poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5346 : 71 : int scalar_precision
5347 : 71 : = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5348 : 71 : tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5349 : 71 : tree cr_index_vector_type = get_related_vectype_for_scalar_type
5350 : 71 : (TYPE_MODE (vectype), cr_index_scalar_type,
5351 : : TYPE_VECTOR_SUBPARTS (vectype));
5352 : :
5353 : : /* First we create a simple vector induction variable which starts
5354 : : with the values {1,2,3,...} (SERIES_VECT) and increments by the
5355 : : vector size (STEP). */
5356 : :
5357 : : /* Create a {1,2,3,...} vector. */
5358 : 71 : tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5359 : :
5360 : : /* Create a vector of the step value. */
5361 : 71 : tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5362 : 71 : tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5363 : :
5364 : : /* Create an induction variable. */
5365 : 71 : gimple_stmt_iterator incr_gsi;
5366 : 71 : bool insert_after;
5367 : 71 : vect_iv_increment_position (LOOP_VINFO_IV_EXIT (loop_vinfo),
5368 : : &incr_gsi, &insert_after);
5369 : 71 : create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
5370 : : insert_after, &indx_before_incr, &indx_after_incr);
5371 : :
5372 : : /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5373 : : filled with zeros (VEC_ZERO). */
5374 : :
5375 : : /* Create a vector of 0s. */
5376 : 71 : tree zero = build_zero_cst (cr_index_scalar_type);
5377 : 71 : tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5378 : :
5379 : : /* Create a vector phi node. */
5380 : 71 : tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5381 : 71 : new_phi = create_phi_node (new_phi_tree, loop->header);
5382 : 71 : add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5383 : : loop_preheader_edge (loop), UNKNOWN_LOCATION);
5384 : :
5385 : :       /* Now take the condition from the loop's original cond_exprs
5386 : : 	 and produce a new cond_expr (INDEX_COND_EXPR) which for
5387 : : 	 every match uses values from the induction variable
5388 : : 	 (INDEX_BEFORE_INCR) and otherwise uses values from the phi node
5389 : : 	 (NEW_PHI_TREE).
5390 : : Finally, we update the phi (NEW_PHI_TREE) to take the value of
5391 : : the new cond_expr (INDEX_COND_EXPR). */
5392 : 71 : gimple_seq stmts = NULL;
5393 : 222 : for (int i = ccompares.length () - 1; i != -1; --i)
5394 : : {
5395 : 80 : tree ccompare = ccompares[i].first;
5396 : 80 : if (ccompares[i].second)
5397 : 69 : new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5398 : : cr_index_vector_type,
5399 : : ccompare,
5400 : : indx_before_incr, new_phi_tree);
5401 : : else
5402 : 11 : new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5403 : : cr_index_vector_type,
5404 : : ccompare,
5405 : : new_phi_tree, indx_before_incr);
5406 : : }
5407 : 71 : gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5408 : :
5409 : : /* Update the phi with the vec cond. */
5410 : 71 : induction_index = new_phi_tree;
5411 : 71 : add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5412 : : loop_latch_edge (loop), UNKNOWN_LOCATION);
5413 : 71 : }
5414 : :
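      /* A minimal scalar sketch of the index tracking built above, assuming a
	 single COND_EXPR of the form a[i] < val (illustration only, not
	 generated code):

	    unsigned last_match = 0;
	    for (unsigned i = 0; i < n; i++)
	      if (a[i] < val)
		last_match = i + 1;

	 The index series starts at {1,2,3,...} so that a value of zero in
	 INDUCTION_INDEX means "no iteration matched".  */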
5415 : : /* 2. Create epilog code.
5416 : : The reduction epilog code operates across the elements of the vector
5417 : : of partial results computed by the vectorized loop.
5418 : : The reduction epilog code consists of:
5419 : :
5420 : : step 1: compute the scalar result in a vector (v_out2)
5421 : : step 2: extract the scalar result (s_out3) from the vector (v_out2)
5422 : : step 3: adjust the scalar result (s_out3) if needed.
5423 : :
5424 : : Step 1 can be accomplished using one the following three schemes:
5425 : : (scheme 1) using reduc_fn, if available.
5426 : : (scheme 2) using whole-vector shifts, if available.
5427 : : (scheme 3) using a scalar loop. In this case steps 1+2 above are
5428 : : combined.
5429 : :
5430 : : The overall epilog code looks like this:
5431 : :
5432 : : s_out0 = phi <s_loop> # original EXIT_PHI
5433 : : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5434 : : v_out2 = reduce <v_out1> # step 1
5435 : : s_out3 = extract_field <v_out2, 0> # step 2
5436 : : s_out4 = adjust_result <s_out3> # step 3
5437 : :
5438 : : (step 3 is optional, and steps 1 and 2 may be combined).
5439 : : Lastly, the uses of s_out0 are replaced by s_out4. */
5440 : :
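  /* A minimal scalar sketch of the epilog, assuming a plain integer PLUS
     reduction with NELTS partial sums in V_OUT1 and an optional ADJUSTMENT
     (illustration only; steps 1 and 2 are combined as in scheme 3):

	int s_out3 = 0;
	for (int i = 0; i < nelts; i++)
	  s_out3 += v_out1[i];			// steps 1+2: reduce and extract
	int s_out4 = s_out3 + adjustment;	// step 3: adjust the result
  */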
5441 : :
5442 : : /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5443 : : v_out1 = phi <VECT_DEF>
5444 : : Store them in NEW_PHIS. */
5445 : : /* We need to reduce values in all exits. */
5446 : 21394 : exit_bb = loop_exit->dest;
5447 : 21394 : exit_gsi = gsi_after_labels (exit_bb);
5448 : 21394 : reduc_inputs.create (vec_num);
5449 : 46059 : for (unsigned i = 0; i < vec_num; i++)
5450 : : {
5451 : 24665 : gimple_seq stmts = NULL;
5452 : 24665 : def = vect_get_slp_vect_def (slp_node, i);
5453 : 24665 : tree new_def = copy_ssa_name (def);
5454 : 24665 : phi = create_phi_node (new_def, exit_bb);
5455 : 24665 : if (LOOP_VINFO_IV_EXIT (loop_vinfo) == loop_exit)
5456 : 24638 : SET_PHI_ARG_DEF (phi, loop_exit->dest_idx, def);
5457 : : else
5458 : : {
5459 : 57 : for (unsigned k = 0; k < gimple_phi_num_args (phi); k++)
5460 : 30 : SET_PHI_ARG_DEF (phi, k, def);
5461 : : }
5462 : 24665 : new_def = gimple_convert (&stmts, vectype, new_def);
5463 : 24665 : reduc_inputs.quick_push (new_def);
5464 : 24665 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5465 : : }
5466 : :
5467 : : /* 2.2 Get the original scalar reduction variable as defined in the loop.
5468 : : In case STMT is a "pattern-stmt" (i.e. - it represents a reduction
5469 : : pattern), the scalar-def is taken from the original stmt that the
5470 : : pattern-stmt (STMT) replaces. */
5471 : :
5472 : 22027 : tree scalar_dest = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
5473 : 21394 : tree scalar_type = TREE_TYPE (scalar_dest);
5474 : 21394 : scalar_results.truncate (0);
5475 : 21394 : scalar_results.reserve_exact (group_size);
5476 : 21394 : new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5477 : :
5478 : : /* True if we should implement SLP_REDUC using native reduction operations
5479 : : instead of scalar operations. */
5480 : 21394 : const bool direct_slp_reduc
5481 : 21394 : = (reduc_fn != IFN_LAST
5482 : 21394 : && slp_reduc
5483 : 21394 : && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5484 : :
5485 : : /* In case of reduction chain, e.g.,
5486 : : # a1 = phi <a3, a0>
5487 : : a2 = operation (a1)
5488 : : a3 = operation (a2),
5489 : :
5490 : : we may end up with more than one vector result. Here we reduce them
5491 : : to one vector.
5492 : :
5493 : : The same is true for a SLP reduction, e.g.,
5494 : : # a1 = phi <a2, a0>
5495 : : # b1 = phi <b2, b0>
5496 : : a2 = operation (a1)
 5497 : :      b2 = operation (b1),
5498 : :
5499 : : where we can end up with more than one vector as well. We can
5500 : : easily accumulate vectors when the number of vector elements is
5501 : : a multiple of the SLP group size.
5502 : :
5503 : : The same is true if we couldn't use a single defuse cycle. */
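  /* A minimal sketch of the accumulation below, assuming a PLUS reduction
     with K partial result vectors of NUNITS lanes each (illustration only):

	for (unsigned k = 1; k < K; k++)
	  for (unsigned lane = 0; lane < nunits; lane++)
	    vec[0][lane] += vec[k][lane];	// element-wise combine

     afterwards vec[0] is the single remaining partial-result vector.  */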
5504 : 21394 : if (!slp_reduc
5505 : : || direct_slp_reduc
5506 : 21394 : || (slp_reduc
5507 : 19320 : && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size)))
5508 : : {
5509 : 21367 : gimple_seq stmts = NULL;
5510 : 21367 : tree single_input = reduc_inputs[0];
5511 : 24586 : for (k = 1; k < reduc_inputs.length (); k++)
5512 : 6438 : single_input = gimple_build (&stmts, code, vectype,
5513 : 3219 : single_input, reduc_inputs[k]);
5514 : 21367 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5515 : :
5516 : 21367 : reduc_inputs.truncate (0);
5517 : 21367 : reduc_inputs.safe_push (single_input);
5518 : : }
5519 : :
5520 : 21394 : tree orig_reduc_input = reduc_inputs[0];
5521 : :
5522 : : /* If this loop is an epilogue loop that can be skipped after the
5523 : : main loop, we can only share a reduction operation between the
5524 : : main loop and the epilogue if we put it at the target of the
5525 : : skip edge.
5526 : :
5527 : : We can still reuse accumulators if this check fails. Doing so has
5528 : : the minor(?) benefit of making the epilogue loop's scalar result
5529 : : independent of the main loop's scalar result. */
5530 : 21394 : bool unify_with_main_loop_p = false;
5531 : 21394 : if (VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info)
5532 : 3396 : && loop_vinfo->skip_this_loop_edge
5533 : 3196 : && single_succ_p (exit_bb)
5534 : 21411 : && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
5535 : : {
5536 : 17 : unify_with_main_loop_p = true;
5537 : :
5538 : 17 : basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
5539 : 17 : reduc_inputs[0] = make_ssa_name (vectype);
5540 : 17 : gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
5541 : 17 : add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
5542 : : UNKNOWN_LOCATION);
5543 : 17 : add_phi_arg (new_phi,
5544 : 17 : VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info)->reduc_input,
5545 : : loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
5546 : 17 : exit_gsi = gsi_after_labels (reduc_block);
5547 : : }
5548 : :
5549 : : /* Shouldn't be used beyond this point. */
5550 : 21394 : exit_bb = nullptr;
5551 : :
5552 : 21394 : if (VECT_REDUC_INFO_TYPE (reduc_info) == COND_REDUCTION
5553 : 71 : && reduc_fn != IFN_LAST)
5554 : : {
5555 : : /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
5556 : : various data values where the condition matched and another vector
5557 : : (INDUCTION_INDEX) containing all the indexes of those matches. We
5558 : : need to extract the last matching index (which will be the index with
5559 : : highest value) and use this to index into the data vector.
5560 : : For the case where there were no matches, the data vector will contain
5561 : : all default values and the index vector will be all zeros. */
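      /* A minimal scalar sketch of the extraction below, assuming NELTS lanes
	 of DATA and INDEX, where an index of zero means "no match"
	 (illustration only):

	    unsigned max_idx = 0;
	    for (unsigned i = 0; i < nelts; i++)
	      if (index[i] > max_idx)
		max_idx = index[i];	// IFN_REDUC_MAX over the indexes
	    result = default_value;
	    for (unsigned i = 0; i < nelts; i++)
	      if (index[i] == max_idx)
		result = data[i];	// lane of the last (highest) match
      */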
5562 : :
5563 : : /* Get various versions of the type of the vector of indexes. */
5564 : 4 : tree index_vec_type = TREE_TYPE (induction_index);
5565 : 4 : gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5566 : 4 : tree index_scalar_type = TREE_TYPE (index_vec_type);
5567 : 4 : tree index_vec_cmp_type = truth_type_for (index_vec_type);
5568 : :
5569 : : /* Get an unsigned integer version of the type of the data vector. */
5570 : 4 : int scalar_precision
5571 : 4 : = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5572 : 4 : tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5573 : 4 : tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
5574 : : vectype);
5575 : :
5576 : : /* First we need to create a vector (ZERO_VEC) of zeros and another
5577 : : vector (MAX_INDEX_VEC) filled with the last matching index, which we
5578 : : can create using a MAX reduction and then expanding.
5579 : : In the case where the loop never made any matches, the max index will
5580 : : be zero. */
5581 : :
5582 : : /* Vector of {0, 0, 0,...}. */
5583 : 4 : tree zero_vec = build_zero_cst (vectype);
5584 : :
5585 : : /* Find maximum value from the vector of found indexes. */
5586 : 4 : tree max_index = make_ssa_name (index_scalar_type);
5587 : 4 : gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5588 : : 1, induction_index);
5589 : 4 : gimple_call_set_lhs (max_index_stmt, max_index);
5590 : 4 : gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5591 : :
5592 : : /* Vector of {max_index, max_index, max_index,...}. */
5593 : 4 : tree max_index_vec = make_ssa_name (index_vec_type);
5594 : 4 : tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5595 : : max_index);
5596 : 4 : gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5597 : : max_index_vec_rhs);
5598 : 4 : gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5599 : :
5600 : : /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5601 : : with the vector (INDUCTION_INDEX) of found indexes, choosing values
5602 : : from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
5603 : : otherwise. Only one value should match, resulting in a vector
5604 : : (VEC_COND) with one data value and the rest zeros.
5605 : : In the case where the loop never made any matches, every index will
5606 : : match, resulting in a vector with all data values (which will all be
5607 : : the default value). */
5608 : :
5609 : : /* Compare the max index vector to the vector of found indexes to find
5610 : : the position of the max value. */
5611 : 4 : tree vec_compare = make_ssa_name (index_vec_cmp_type);
5612 : 4 : gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5613 : : induction_index,
5614 : : max_index_vec);
5615 : 4 : gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5616 : :
5617 : : /* Use the compare to choose either values from the data vector or
5618 : : zero. */
5619 : 4 : tree vec_cond = make_ssa_name (vectype);
5620 : 4 : gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5621 : : vec_compare,
5622 : 4 : reduc_inputs[0],
5623 : : zero_vec);
5624 : 4 : gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5625 : :
5626 : : /* Finally we need to extract the data value from the vector (VEC_COND)
 5627 : :          into a scalar (MATCHED_DATA_REDUC).  Logically we want to do an OR
5628 : : reduction, but because this doesn't exist, we can use a MAX reduction
5629 : : instead. The data value might be signed or a float so we need to cast
5630 : : it first.
5631 : : In the case where the loop never made any matches, the data values are
5632 : : all identical, and so will reduce down correctly. */
5633 : :
5634 : : /* Make the matched data values unsigned. */
5635 : 4 : tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5636 : 4 : tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5637 : : vec_cond);
5638 : 4 : gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5639 : : VIEW_CONVERT_EXPR,
5640 : : vec_cond_cast_rhs);
5641 : 4 : gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5642 : :
5643 : : /* Reduce down to a scalar value. */
5644 : 4 : tree data_reduc = make_ssa_name (scalar_type_unsigned);
5645 : 4 : gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5646 : : 1, vec_cond_cast);
5647 : 4 : gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5648 : 4 : gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5649 : :
5650 : : /* Convert the reduced value back to the result type and set as the
5651 : : result. */
5652 : 4 : gimple_seq stmts = NULL;
5653 : 4 : new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5654 : : data_reduc);
5655 : 4 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5656 : 4 : scalar_results.safe_push (new_temp);
5657 : 4 : }
5658 : 21390 : else if (VECT_REDUC_INFO_TYPE (reduc_info) == COND_REDUCTION
5659 : 67 : && reduc_fn == IFN_LAST)
5660 : : {
5661 : : /* Condition reduction without supported IFN_REDUC_MAX. Generate
5662 : : idx = 0;
5663 : : idx_val = induction_index[0];
5664 : : val = data_reduc[0];
5665 : : for (idx = 0, val = init, i = 0; i < nelts; ++i)
5666 : : if (induction_index[i] > idx_val)
5667 : : val = data_reduc[i], idx_val = induction_index[i];
5668 : : return val; */
5669 : :
5670 : 67 : tree data_eltype = TREE_TYPE (vectype);
5671 : 67 : tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5672 : 67 : unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5673 : 67 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5674 : : /* Enforced by vectorizable_reduction, which ensures we have target
5675 : : support before allowing a conditional reduction on variable-length
5676 : : vectors. */
5677 : 67 : unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5678 : 67 : tree idx_val = NULL_TREE, val = NULL_TREE;
5679 : 447 : for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5680 : : {
5681 : 380 : tree old_idx_val = idx_val;
5682 : 380 : tree old_val = val;
5683 : 380 : idx_val = make_ssa_name (idx_eltype);
5684 : 380 : epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5685 : : build3 (BIT_FIELD_REF, idx_eltype,
5686 : : induction_index,
5687 : 380 : bitsize_int (el_size),
5688 : 380 : bitsize_int (off)));
5689 : 380 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5690 : 380 : val = make_ssa_name (data_eltype);
5691 : 760 : epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5692 : : build3 (BIT_FIELD_REF,
5693 : : data_eltype,
5694 : 380 : reduc_inputs[0],
5695 : 380 : bitsize_int (el_size),
5696 : 380 : bitsize_int (off)));
5697 : 380 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5698 : 380 : if (off != 0)
5699 : : {
5700 : 313 : tree new_idx_val = idx_val;
5701 : 313 : if (off != v_size - el_size)
5702 : : {
5703 : 246 : new_idx_val = make_ssa_name (idx_eltype);
5704 : 246 : epilog_stmt = gimple_build_assign (new_idx_val,
5705 : : MAX_EXPR, idx_val,
5706 : : old_idx_val);
5707 : 246 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5708 : : }
5709 : 313 : tree cond = make_ssa_name (boolean_type_node);
5710 : 313 : epilog_stmt = gimple_build_assign (cond, GT_EXPR,
5711 : : idx_val, old_idx_val);
5712 : 313 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5713 : 313 : tree new_val = make_ssa_name (data_eltype);
5714 : 313 : epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
5715 : : cond, val, old_val);
5716 : 313 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5717 : 313 : idx_val = new_idx_val;
5718 : 313 : val = new_val;
5719 : : }
5720 : : }
5721 : : /* Convert the reduced value back to the result type and set as the
5722 : : result. */
5723 : 67 : gimple_seq stmts = NULL;
5724 : 67 : val = gimple_convert (&stmts, scalar_type, val);
5725 : 67 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5726 : 67 : scalar_results.safe_push (val);
5727 : 67 : }
5728 : :
5729 : : /* 2.3 Create the reduction code, using one of the three schemes described
5730 : : above. In SLP we simply need to extract all the elements from the
5731 : : vector (without reducing them), so we use scalar shifts. */
5732 : 21323 : else if (reduc_fn != IFN_LAST && (!slp_reduc || group_size == 1))
5733 : : {
5734 : 19588 : tree tmp;
5735 : 19588 : tree vec_elem_type;
5736 : :
5737 : : /* Case 1: Create:
5738 : : v_out2 = reduc_expr <v_out1> */
5739 : :
5740 : 19588 : if (dump_enabled_p ())
5741 : 1337 : dump_printf_loc (MSG_NOTE, vect_location,
5742 : : "Reduce using direct vector reduction.\n");
5743 : :
5744 : 19588 : gimple_seq stmts = NULL;
5745 : 19588 : vec_elem_type = TREE_TYPE (vectype);
5746 : 19588 : new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5747 : 19588 : vec_elem_type, reduc_inputs[0]);
5748 : 19588 : new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5749 : 19588 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5750 : :
5751 : 19588 : if ((VECT_REDUC_INFO_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5752 : 66 : && induc_val)
5753 : : {
 5754 : :           /* Earlier we set the initial value to be a vector of induc_val
5755 : : values. Check the result and if it is induc_val then replace
5756 : : with the original initial value, unless induc_val is
5757 : : the same as initial_def already. */
5758 : 63 : tree zcompare = make_ssa_name (boolean_type_node);
5759 : 63 : epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
5760 : : new_temp, induc_val);
5761 : 63 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5762 : 63 : tree initial_def = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info)[0];
5763 : 63 : tmp = make_ssa_name (new_scalar_dest);
5764 : 63 : epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5765 : : initial_def, new_temp);
5766 : 63 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5767 : 63 : new_temp = tmp;
5768 : : }
5769 : :
5770 : 19588 : scalar_results.safe_push (new_temp);
5771 : 19588 : }
5772 : 1581 : else if (direct_slp_reduc)
5773 : : {
5774 : : /* Here we create one vector for each of the GROUP_SIZE results,
5775 : : with the elements for other SLP statements replaced with the
5776 : : neutral value. We can then do a normal reduction on each vector. */
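      /* A minimal sketch of this branch, assuming GROUP_SIZE interleaved
	 results where lane J of REDUC_INPUTS[0] belongs to result
	 J % GROUP_SIZE (illustration only; OP and NEUTRAL stand for the
	 reduction operation and its neutral value):

	    for (unsigned i = 0; i < group_size; i++)
	      {
		acc = neutral;
		for (unsigned j = 0; j < nunits; j++)
		  if ((j & (group_size - 1)) == i)  // sel[j] = (index[j] == i)
		    acc = op (acc, input[j]);
		scalar_results[i] = acc;
	      }
      */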
5777 : :
5778 : : /* Enforced by vectorizable_reduction. */
5779 : : gcc_assert (reduc_inputs.length () == 1);
5780 : : gcc_assert (pow2p_hwi (group_size));
5781 : :
5782 : : gimple_seq seq = NULL;
5783 : :
5784 : : /* Build a vector {0, 1, 2, ...}, with the same number of elements
5785 : : and the same element size as VECTYPE. */
5786 : : tree index = build_index_vector (vectype, 0, 1);
5787 : : tree index_type = TREE_TYPE (index);
5788 : : tree index_elt_type = TREE_TYPE (index_type);
5789 : : tree mask_type = truth_type_for (index_type);
5790 : :
5791 : : /* Create a vector that, for each element, identifies which of
5792 : : the results should use it. */
5793 : : tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5794 : : index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5795 : : build_vector_from_val (index_type, index_mask));
5796 : :
5797 : : /* Get a neutral vector value. This is simply a splat of the neutral
5798 : : scalar value if we have one, otherwise the initial scalar value
5799 : : is itself a neutral value. */
5800 : : tree vector_identity = NULL_TREE;
5801 : : tree neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
5802 : : NULL_TREE, false);
5803 : : if (neutral_op)
5804 : : vector_identity = gimple_build_vector_from_val (&seq, vectype,
5805 : : neutral_op);
5806 : : for (unsigned int i = 0; i < group_size; ++i)
5807 : : {
 5808 : :           /* If there's no universal neutral value, we can use the
5809 : : initial scalar value from the original PHI. This is used
5810 : : for MIN and MAX reduction, for example. */
5811 : : if (!neutral_op)
5812 : : {
5813 : : tree scalar_value
5814 : : = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info)[i];
5815 : : scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5816 : : scalar_value);
5817 : : vector_identity = gimple_build_vector_from_val (&seq, vectype,
5818 : : scalar_value);
5819 : : }
5820 : :
5821 : : /* Calculate the equivalent of:
5822 : :
5823 : : sel[j] = (index[j] == i);
5824 : :
5825 : : which selects the elements of REDUC_INPUTS[0] that should
5826 : : be included in the result. */
5827 : : tree compare_val = build_int_cst (index_elt_type, i);
5828 : : compare_val = build_vector_from_val (index_type, compare_val);
5829 : : tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5830 : : index, compare_val);
5831 : :
5832 : : /* Calculate the equivalent of:
5833 : :
5834 : : vec = seq ? reduc_inputs[0] : vector_identity;
5835 : :
5836 : : VEC is now suitable for a full vector reduction. */
5837 : : tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5838 : : sel, reduc_inputs[0], vector_identity);
5839 : :
5840 : : /* Do the reduction and convert it to the appropriate type. */
5841 : : tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5842 : : TREE_TYPE (vectype), vec);
5843 : : scalar = gimple_convert (&seq, scalar_type, scalar);
5844 : : scalar_results.safe_push (scalar);
5845 : : }
5846 : : gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5847 : : }
5848 : : else
5849 : : {
5850 : 1581 : bool reduce_with_shift;
5851 : 1581 : tree vec_temp;
5852 : :
5853 : 1581 : gcc_assert (slp_reduc || reduc_inputs.length () == 1);
5854 : :
5855 : : /* See if the target wants to do the final (shift) reduction
5856 : : in a vector mode of smaller size and first reduce upper/lower
5857 : : halves against each other. */
5858 : 1735 : enum machine_mode mode1 = mode;
5859 : 1735 : tree stype = TREE_TYPE (vectype);
5860 : 1735 : unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5861 : 1735 : unsigned nunits1 = nunits;
5862 : 1735 : if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5863 : 1735 : && reduc_inputs.length () == 1)
5864 : : {
5865 : 41 : nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5866 : : /* For SLP reductions we have to make sure lanes match up, but
 5867 : :              since we're doing an individual-element final reduction, reducing
 5868 : :              the vector width here is even more important.
 5869 : :              ??? We could also separate lanes with permutes; for the common
 5870 : :              case of a power-of-two group size, odd/even extracts would work.  */
5871 : 41 : if (slp_reduc && nunits != nunits1)
5872 : : {
5873 : 41 : nunits1 = least_common_multiple (nunits1, group_size);
5874 : 82 : gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5875 : : }
5876 : : }
5877 : 1735 : if (!slp_reduc
5878 : 1735 : && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5879 : 0 : nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5880 : :
5881 : 1735 : tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5882 : 1735 : stype, nunits1);
5883 : 1735 : reduce_with_shift = have_whole_vector_shift (mode1);
5884 : 715 : if (!VECTOR_MODE_P (mode1)
5885 : 2450 : || !directly_supported_p (code, vectype1))
5886 : : reduce_with_shift = false;
5887 : :
5888 : : /* First reduce the vector to the desired vector size we should
5889 : : do shift reduction on by combining upper and lower halves. */
5890 : 1735 : gimple_seq stmts = NULL;
5891 : 1735 : new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
5892 : : code, &stmts);
5893 : 1735 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5894 : 1735 : reduc_inputs[0] = new_temp;
5895 : :
5896 : 1735 : if (reduce_with_shift && (!slp_reduc || group_size == 1))
5897 : : {
5898 : 1540 : tree bitsize = TYPE_SIZE (TREE_TYPE (vectype1));
5899 : 1540 : int element_bitsize = tree_to_uhwi (bitsize);
5900 : : /* Enforced by vectorizable_reduction, which disallows SLP reductions
5901 : : for variable-length vectors and also requires direct target support
5902 : : for loop reductions. */
5903 : 1540 : int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5904 : 1540 : int nelements = vec_size_in_bits / element_bitsize;
5905 : 1540 : vec_perm_builder sel;
5906 : 1540 : vec_perm_indices indices;
5907 : :
5908 : 1540 : int elt_offset;
5909 : :
5910 : 1540 : tree zero_vec = build_zero_cst (vectype1);
5911 : : /* Case 2: Create:
5912 : : for (offset = nelements/2; offset >= 1; offset/=2)
5913 : : {
5914 : : Create: va' = vec_shift <va, offset>
5915 : : Create: va = vop <va, va'>
5916 : : } */
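	  /* A minimal scalar sketch of the shift reduction, assuming a PLUS
	     reduction over NELEMENTS lanes (a power of two), illustration
	     only:

		for (int off = nelements / 2; off >= 1; off /= 2)
		  for (int j = 0; j < off; j++)
		    va[j] += va[j + off];  // fold upper half into lower half
		// the scalar result is now in lane 0 of va
	  */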
5917 : :
5918 : 1540 : if (dump_enabled_p ())
5919 : 320 : dump_printf_loc (MSG_NOTE, vect_location,
5920 : : "Reduce using vector shifts\n");
5921 : :
5922 : 1540 : gimple_seq stmts = NULL;
5923 : 1540 : new_temp = gimple_convert (&stmts, vectype1, new_temp);
5924 : 1540 : for (elt_offset = nelements / 2;
5925 : 3359 : elt_offset >= 1;
5926 : 1819 : elt_offset /= 2)
5927 : : {
5928 : 1819 : calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5929 : 1819 : indices.new_vector (sel, 2, nelements);
5930 : 1819 : tree mask = vect_gen_perm_mask_any (vectype1, indices);
5931 : 1819 : new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
5932 : : new_temp, zero_vec, mask);
5933 : 1819 : new_temp = gimple_build (&stmts, code,
5934 : : vectype1, new_name, new_temp);
5935 : : }
5936 : :
5937 : : /* 2.4 Extract the final scalar result. Create:
5938 : : s_out3 = extract_field <v_out2, bitpos> */
5939 : :
5940 : 1540 : if (dump_enabled_p ())
5941 : 320 : dump_printf_loc (MSG_NOTE, vect_location,
5942 : : "extract scalar result\n");
5943 : :
5944 : 1540 : new_temp = gimple_build (&stmts, BIT_FIELD_REF, TREE_TYPE (vectype1),
5945 : 1540 : new_temp, bitsize, bitsize_zero_node);
5946 : 1540 : new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR,
5947 : : scalar_type, new_temp);
5948 : 1540 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5949 : 1540 : scalar_results.safe_push (new_temp);
5950 : 1540 : }
5951 : : else
5952 : : {
5953 : : /* Case 3: Create:
5954 : : s = extract_field <v_out2, 0>
5955 : : for (offset = element_size;
5956 : : offset < vector_size;
5957 : : offset += element_size;)
5958 : : {
5959 : : Create: s' = extract_field <v_out2, offset>
 5960 : :              Create:  s = op <s, s'>  // For non-SLP cases
5961 : : } */
5962 : :
5963 : 195 : if (dump_enabled_p ())
5964 : 117 : dump_printf_loc (MSG_NOTE, vect_location,
5965 : : "Reduce using scalar code.\n");
5966 : :
5967 : 195 : tree compute_type = TREE_TYPE (vectype1);
5968 : 195 : tree bitsize = TYPE_SIZE (compute_type);
5969 : 195 : int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5970 : 195 : int element_bitsize = tree_to_uhwi (bitsize);
5971 : 195 : gimple_seq stmts = NULL;
5972 : 442 : FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
5973 : : {
5974 : 247 : int bit_offset;
5975 : 494 : new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
5976 : 247 : vec_temp, bitsize, bitsize_zero_node);
5977 : :
5978 : : /* In SLP we don't need to apply reduction operation, so we just
5979 : : collect s' values in SCALAR_RESULTS. */
5980 : 247 : if (slp_reduc)
5981 : 237 : scalar_results.safe_push (new_temp);
5982 : :
5983 : 521 : for (bit_offset = element_bitsize;
5984 : 768 : bit_offset < vec_size_in_bits;
5985 : 521 : bit_offset += element_bitsize)
5986 : : {
5987 : 521 : tree bitpos = bitsize_int (bit_offset);
5988 : 521 : new_name = gimple_build (&stmts, BIT_FIELD_REF,
5989 : : compute_type, vec_temp,
5990 : : bitsize, bitpos);
5991 : 521 : if (slp_reduc)
5992 : : {
5993 : : /* In SLP we don't need to apply reduction operation, so
5994 : : we just collect s' values in SCALAR_RESULTS. */
5995 : 511 : new_temp = new_name;
5996 : 511 : scalar_results.safe_push (new_name);
5997 : : }
5998 : : else
5999 : 10 : new_temp = gimple_build (&stmts, code, compute_type,
6000 : : new_name, new_temp);
6001 : : }
6002 : : }
6003 : :
6004 : : /* The only case where we need to reduce scalar results in a SLP
 6005 : :              reduction is unrolling.  If the size of SCALAR_RESULTS is
6006 : : greater than GROUP_SIZE, we reduce them combining elements modulo
6007 : : GROUP_SIZE. */
6008 : 195 : if (slp_reduc)
6009 : : {
6010 : 185 : tree res, first_res, new_res;
6011 : :
6012 : : /* Reduce multiple scalar results in case of SLP unrolling. */
6013 : 438 : for (j = group_size; scalar_results.iterate (j, &res);
6014 : : j++)
6015 : : {
6016 : 253 : first_res = scalar_results[j % group_size];
6017 : 253 : new_res = gimple_build (&stmts, code, compute_type,
6018 : : first_res, res);
6019 : 253 : scalar_results[j % group_size] = new_res;
6020 : : }
6021 : 185 : scalar_results.truncate (group_size);
6022 : 865 : for (k = 0; k < group_size; k++)
6023 : 990 : scalar_results[k] = gimple_convert (&stmts, scalar_type,
6024 : 495 : scalar_results[k]);
6025 : : }
6026 : : else
6027 : : {
6028 : : /* Reduction chain - we have one scalar to keep in
6029 : : SCALAR_RESULTS. */
6030 : 10 : new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6031 : 10 : scalar_results.safe_push (new_temp);
6032 : : }
6033 : :
6034 : 195 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6035 : : }
6036 : :
6037 : 1735 : if ((VECT_REDUC_INFO_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6038 : 0 : && induc_val)
6039 : : {
 6040 : :           /* Earlier we set the initial value to be a vector of induc_val
6041 : : values. Check the result and if it is induc_val then replace
6042 : : with the original initial value, unless induc_val is
6043 : : the same as initial_def already. */
6044 : 0 : tree zcompare = make_ssa_name (boolean_type_node);
6045 : 0 : epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6046 : 0 : scalar_results[0], induc_val);
6047 : 0 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6048 : 0 : tree initial_def = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info)[0];
6049 : 0 : tree tmp = make_ssa_name (new_scalar_dest);
6050 : 0 : epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6051 : 0 : initial_def, scalar_results[0]);
6052 : 0 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6053 : 0 : scalar_results[0] = tmp;
6054 : : }
6055 : : }
6056 : :
6057 : : /* 2.5 Adjust the final result by the initial value of the reduction
6058 : : variable. (When such adjustment is not needed, then
6059 : : 'adjustment_def' is zero). For example, if code is PLUS we create:
6060 : : new_temp = loop_exit_def + adjustment_def */
6061 : :
6062 : 21394 : if (adjustment_def)
6063 : : {
6064 : 15679 : gcc_assert (!slp_reduc || group_size == 1);
6065 : 15679 : gimple_seq stmts = NULL;
6066 : 15679 : if (double_reduc)
6067 : : {
6068 : 0 : gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6069 : 0 : adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6070 : 0 : new_temp = gimple_build (&stmts, code, vectype,
6071 : 0 : reduc_inputs[0], adjustment_def);
6072 : : }
6073 : : else
6074 : : {
6075 : 15679 : new_temp = scalar_results[0];
6076 : 15679 : gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6077 : 15679 : adjustment_def = gimple_convert (&stmts, TREE_TYPE (vectype),
6078 : : adjustment_def);
6079 : 15679 : new_temp = gimple_convert (&stmts, TREE_TYPE (vectype), new_temp);
6080 : 15679 : new_temp = gimple_build (&stmts, code, TREE_TYPE (vectype),
6081 : : new_temp, adjustment_def);
6082 : 15679 : new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6083 : : }
6084 : :
6085 : 15679 : epilog_stmt = gimple_seq_last_stmt (stmts);
6086 : 15679 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6087 : 15679 : scalar_results[0] = new_temp;
6088 : : }
6089 : :
6090 : : /* Record this operation if it could be reused by the epilogue loop. */
6091 : 21394 : if (VECT_REDUC_INFO_TYPE (reduc_info) == TREE_CODE_REDUCTION
6092 : 21394 : && reduc_inputs.length () == 1)
6093 : 21221 : loop_vinfo->reusable_accumulators.put (scalar_results[0],
6094 : : { orig_reduc_input, reduc_info });
6095 : :
6096 : : /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6097 : : phis with new adjusted scalar results, i.e., replace use <s_out0>
6098 : : with use <s_out4>.
6099 : :
6100 : : Transform:
6101 : : loop_exit:
6102 : : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6103 : : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6104 : : v_out2 = reduce <v_out1>
6105 : : s_out3 = extract_field <v_out2, 0>
6106 : : s_out4 = adjust_result <s_out3>
6107 : : use <s_out0>
6108 : : use <s_out0>
6109 : :
6110 : : into:
6111 : :
6112 : : loop_exit:
6113 : : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6114 : : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6115 : : v_out2 = reduce <v_out1>
6116 : : s_out3 = extract_field <v_out2, 0>
6117 : : s_out4 = adjust_result <s_out3>
6118 : : use <s_out4>
6119 : : use <s_out4> */
6120 : :
6121 : 42788 : gcc_assert (live_out_stmts.size () == scalar_results.length ());
6122 : 21394 : auto_vec<gimple *> phis;
6123 : 43098 : for (k = 0; k < live_out_stmts.size (); k++)
6124 : : {
6125 : 21704 : stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6126 : 21704 : tree scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6127 : :
6128 : : /* Find the loop-closed-use at the loop exit of the original scalar
6129 : : result. (The reduction result is expected to have two immediate uses,
6130 : : one at the latch block, and one at the loop exit). Note with
6131 : : early break we can have two exit blocks, so pick the correct PHI. */
6132 : 89316 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6133 : 67612 : if (!is_gimple_debug (USE_STMT (use_p))
6134 : 67612 : && !flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6135 : : {
6136 : 21699 : gcc_assert (is_a <gphi *> (USE_STMT (use_p)));
6137 : 21699 : if (gimple_bb (USE_STMT (use_p)) == loop_exit->dest)
6138 : 21691 : phis.safe_push (USE_STMT (use_p));
6139 : : }
6140 : :
6141 : 43395 : FOR_EACH_VEC_ELT (phis, i, exit_phi)
6142 : : {
6143 : : /* Replace the uses: */
6144 : 21691 : orig_name = PHI_RESULT (exit_phi);
6145 : :
6146 : : /* Look for a single use at the target of the skip edge. */
6147 : 21691 : if (unify_with_main_loop_p)
6148 : : {
6149 : 33 : use_operand_p use_p;
6150 : 33 : gimple *user;
6151 : 33 : if (!single_imm_use (orig_name, &use_p, &user))
6152 : 0 : gcc_unreachable ();
6153 : 33 : orig_name = gimple_get_lhs (user);
6154 : : }
6155 : :
6156 : 21691 : scalar_result = scalar_results[k];
6157 : 59105 : FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6158 : : {
6159 : 112286 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6160 : 37436 : SET_USE (use_p, scalar_result);
6161 : 37414 : update_stmt (use_stmt);
6162 : 21691 : }
6163 : : }
6164 : :
6165 : 21704 : phis.truncate (0);
6166 : : }
6167 : 21394 : }
6168 : :
6169 : : /* Return a vector of type VECTYPE that is equal to the vector select
6170 : : operation "MASK ? VEC : IDENTITY". Insert the select statements
6171 : : before GSI. */
6172 : :
6173 : : static tree
6174 : 0 : merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6175 : : tree vec, tree identity)
6176 : : {
6177 : 0 : tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6178 : 0 : gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6179 : : mask, vec, identity);
6180 : 0 : gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6181 : 0 : return cond;
6182 : : }
6183 : :
6184 : : /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6185 : : order, starting with LHS. Insert the extraction statements before GSI and
6186 : : associate the new scalar SSA names with variable SCALAR_DEST.
 6187 : :    If MASK is nonzero, mask the input and then operate on it unconditionally.
6188 : : Return the SSA name for the result. */
6189 : :
6190 : : static tree
6191 : 1025 : vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6192 : : tree_code code, tree lhs, tree vector_rhs,
6193 : : tree mask)
6194 : : {
6195 : 1025 : tree vectype = TREE_TYPE (vector_rhs);
6196 : 1025 : tree scalar_type = TREE_TYPE (vectype);
6197 : 1025 : tree bitsize = TYPE_SIZE (scalar_type);
6198 : 1025 : unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6199 : 1025 : unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6200 : :
6201 : : /* Re-create a VEC_COND_EXPR to mask the input here in order to be able
6202 : : to perform an unconditional element-wise reduction of it. */
6203 : 1025 : if (mask)
6204 : : {
6205 : 11 : tree masked_vector_rhs = make_temp_ssa_name (vectype, NULL,
6206 : : "masked_vector_rhs");
6207 : 11 : tree neutral_op = neutral_op_for_reduction (scalar_type, code, NULL_TREE,
6208 : : false);
6209 : 11 : tree vector_identity = build_vector_from_val (vectype, neutral_op);
6210 : 11 : gassign *select = gimple_build_assign (masked_vector_rhs, VEC_COND_EXPR,
6211 : : mask, vector_rhs, vector_identity);
6212 : 11 : gsi_insert_before (gsi, select, GSI_SAME_STMT);
6213 : 11 : vector_rhs = masked_vector_rhs;
6214 : : }
6215 : :
6216 : 1025 : for (unsigned HOST_WIDE_INT bit_offset = 0;
6217 : 4417 : bit_offset < vec_size_in_bits;
6218 : 3392 : bit_offset += element_bitsize)
6219 : : {
6220 : 3392 : tree bitpos = bitsize_int (bit_offset);
6221 : 3392 : tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6222 : : bitsize, bitpos);
6223 : :
6224 : 3392 : gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6225 : 3392 : rhs = make_ssa_name (scalar_dest, stmt);
6226 : 3392 : gimple_assign_set_lhs (stmt, rhs);
6227 : 3392 : gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6228 : : /* Fold the vector extract, combining it with a previous reversal
6229 : : like seen in PR90579. */
6230 : 3392 : auto gsi2 = gsi_for_stmt (stmt);
6231 : 3392 : if (fold_stmt (&gsi2, follow_all_ssa_edges))
6232 : 356 : update_stmt (gsi_stmt (gsi2));
6233 : :
6234 : 3392 : stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6235 : 3392 : tree new_name = make_ssa_name (scalar_dest, stmt);
6236 : 3392 : gimple_assign_set_lhs (stmt, new_name);
6237 : 3392 : gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6238 : 3392 : lhs = new_name;
6239 : : }
6240 : 1025 : return lhs;
6241 : : }
6242 : :
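/* A minimal scalar sketch of what vect_expand_fold_left produces, assuming a
   PLUS reduction over an N-element vector RHS (illustration only):

     static double
     fold_left_sketch (double lhs, const double *rhs, int n)
     {
       for (int i = 0; i < n; i++)
	 lhs = lhs + rhs[i];	// extract element i, then one scalar op
       return lhs;
     }

   keeping the strict left-to-right evaluation order that in-order
   floating-point reductions require.  */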
6243 : : /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6244 : : type of the vector input. */
6245 : :
6246 : : static internal_fn
6247 : 848 : get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6248 : : {
6249 : 848 : internal_fn mask_reduc_fn;
6250 : 848 : internal_fn mask_len_reduc_fn;
6251 : :
6252 : 848 : switch (reduc_fn)
6253 : : {
6254 : 0 : case IFN_FOLD_LEFT_PLUS:
6255 : 0 : mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6256 : 0 : mask_len_reduc_fn = IFN_MASK_LEN_FOLD_LEFT_PLUS;
6257 : 0 : break;
6258 : :
6259 : : default:
6260 : : return IFN_LAST;
6261 : : }
6262 : :
6263 : 0 : if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6264 : : OPTIMIZE_FOR_SPEED))
6265 : : return mask_reduc_fn;
6266 : 0 : if (direct_internal_fn_supported_p (mask_len_reduc_fn, vectype_in,
6267 : : OPTIMIZE_FOR_SPEED))
6268 : : return mask_len_reduc_fn;
6269 : : return IFN_LAST;
6270 : : }
6271 : :
6272 : : /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
6273 : : statement that sets the live-out value. REDUC_DEF_STMT is the phi
6274 : : statement. CODE is the operation performed by STMT_INFO and OPS are
6275 : : its scalar operands. REDUC_INDEX is the index of the operand in
6276 : : OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6277 : : implements in-order reduction, or IFN_LAST if we should open-code it.
6278 : : VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6279 : : that should be used to control the operation in a fully-masked loop. */
6280 : :
6281 : : static bool
6282 : 840 : vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6283 : : stmt_vec_info stmt_info,
6284 : : gimple_stmt_iterator *gsi,
6285 : : slp_tree slp_node,
6286 : : code_helper code, internal_fn reduc_fn,
6287 : : int num_ops, tree vectype_in,
6288 : : int reduc_index, vec_loop_masks *masks,
6289 : : vec_loop_lens *lens)
6290 : : {
6291 : 840 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6292 : 840 : tree vectype_out = SLP_TREE_VECTYPE (slp_node);
6293 : 840 : internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6294 : :
6295 : 840 : gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6296 : :
6297 : 840 : bool is_cond_op = false;
6298 : 840 : if (!code.is_tree_code ())
6299 : : {
6300 : 9 : code = conditional_internal_fn_code (internal_fn (code));
6301 : 9 : gcc_assert (code != ERROR_MARK);
6302 : : is_cond_op = true;
6303 : : }
6304 : :
6305 : 840 : gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
6306 : :
6307 : 840 : gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6308 : : TYPE_VECTOR_SUBPARTS (vectype_in)));
6309 : :
6310 : : /* ??? We should, when transforming the cycle PHI, record the existing
6311 : : scalar def as vector def so looking up the vector def works. This
6312 : : would also allow generalizing this for reduction paths of length > 1
6313 : : and/or SLP reductions. */
6314 : 840 : slp_tree reduc_node = SLP_TREE_CHILDREN (slp_node)[reduc_index];
6315 : 840 : tree reduc_var = vect_get_slp_scalar_def (reduc_node, 0);
6316 : :
6317 : : /* The operands either come from a binary operation or an IFN_COND operation.
6318 : : The former is a gimple assign with binary rhs and the latter is a
6319 : : gimple call with four arguments. */
6320 : 840 : gcc_assert (num_ops == 2 || num_ops == 4);
6321 : :
6322 : 840 : int group_size = 1;
6323 : 840 : stmt_vec_info scalar_dest_def_info;
6324 : 840 : auto_vec<tree> vec_oprnds0, vec_opmask;
6325 : 840 : vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[(is_cond_op ? 2 : 0)
6326 : 840 : + (1 - reduc_index)],
6327 : : &vec_oprnds0);
6328 : 840 : group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6329 : 840 : scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6330 : : /* For an IFN_COND_OP we also need the vector mask operand. */
6331 : 840 : if (is_cond_op)
6332 : 9 : vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[0], &vec_opmask);
6333 : :
6334 : 840 : gimple *sdef = vect_orig_stmt (scalar_dest_def_info)->stmt;
6335 : 840 : tree scalar_dest = gimple_get_lhs (sdef);
6336 : 840 : tree scalar_type = TREE_TYPE (scalar_dest);
6337 : :
6338 : 840 : int vec_num = vec_oprnds0.length ();
6339 : 840 : tree vec_elem_type = TREE_TYPE (vectype_out);
6340 : 840 : gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6341 : :
6342 : 840 : tree vector_identity = NULL_TREE;
6343 : 840 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6344 : : {
6345 : 0 : vector_identity = build_zero_cst (vectype_out);
6346 : 0 : if (!HONOR_SIGNED_ZEROS (vectype_out))
6347 : : ;
6348 : : else
6349 : : {
6350 : 0 : gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));
6351 : 0 : vector_identity = const_unop (NEGATE_EXPR, vectype_out,
6352 : : vector_identity);
6353 : : }
6354 : : }
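      /* Illustrative note: under the default rounding mode -0.0 is the true
	 additive identity, since x + (-0.0) == x for every x, including
	 x == -0.0, whereas -0.0 + (+0.0) == +0.0 would flip the sign of a
	 -0.0 partial sum; hence the plain zero vector is only used when
	 signed zeros need not be honored.  */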
6355 : :
6356 : 840 : tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6357 : 840 : int i;
6358 : 840 : tree def0;
6359 : 1865 : FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6360 : : {
6361 : 1025 : gimple *new_stmt;
6362 : 1025 : tree mask = NULL_TREE;
6363 : 1025 : tree len = NULL_TREE;
6364 : 1025 : tree bias = NULL_TREE;
6365 : 1025 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6366 : : {
6367 : 0 : tree loop_mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
6368 : : vec_num, vectype_in, i);
6369 : 0 : if (is_cond_op)
6370 : 0 : mask = prepare_vec_mask (loop_vinfo, TREE_TYPE (loop_mask),
6371 : 0 : loop_mask, vec_opmask[i], gsi);
6372 : : else
6373 : : mask = loop_mask;
6374 : : }
6375 : 1025 : else if (is_cond_op)
6376 : 11 : mask = vec_opmask[i];
6377 : 1025 : if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
6378 : : {
6379 : 0 : len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
6380 : : i, 1);
6381 : 0 : signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
6382 : 0 : bias = build_int_cst (intQI_type_node, biasval);
6383 : 0 : if (!is_cond_op)
6384 : 0 : mask = build_minus_one_cst (truth_type_for (vectype_in));
6385 : : }
6386 : :
6387 : : /* Handle MINUS by adding the negative. */
6388 : 1025 : if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6389 : : {
6390 : 0 : tree negated = make_ssa_name (vectype_out);
6391 : 0 : new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6392 : 0 : gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6393 : 0 : def0 = negated;
6394 : : }
6395 : :
6396 : 0 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
6397 : 1025 : && mask && mask_reduc_fn == IFN_LAST)
6398 : 0 : def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6399 : : vector_identity);
6400 : :
6401 : : /* On the first iteration the input is simply the scalar phi
6402 : : result, and for subsequent iterations it is the output of
6403 : : the preceding operation. */
6404 : 1025 : if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6405 : : {
6406 : 0 : if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
6407 : 0 : new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
6408 : : def0, mask, len, bias);
6409 : 0 : else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
6410 : 0 : new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6411 : : def0, mask);
6412 : : else
6413 : 0 : new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6414 : : def0);
6415 : : /* For chained SLP reductions the output of the previous reduction
6416 : : operation serves as the input of the next. For the final statement
6417 : : the output cannot be a temporary - we reuse the original
6418 : : scalar destination of the last statement. */
6419 : 0 : if (i != vec_num - 1)
6420 : : {
6421 : 0 : gimple_set_lhs (new_stmt, scalar_dest_var);
6422 : 0 : reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6423 : 0 : gimple_set_lhs (new_stmt, reduc_var);
6424 : : }
6425 : : }
6426 : : else
6427 : : {
6428 : 1025 : reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
6429 : : tree_code (code), reduc_var, def0,
6430 : : mask);
6431 : 1025 : new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6432 : : /* Remove the statement, so that we can use the same code paths
6433 : : as for statements that we've just created. */
6434 : 1025 : gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6435 : 1025 : gsi_remove (&tmp_gsi, true);
6436 : : }
6437 : :
6438 : 1025 : if (i == vec_num - 1)
6439 : : {
6440 : 840 : gimple_set_lhs (new_stmt, scalar_dest);
6441 : 840 : vect_finish_replace_stmt (loop_vinfo,
6442 : : scalar_dest_def_info,
6443 : : new_stmt);
6444 : : }
6445 : : else
6446 : 185 : vect_finish_stmt_generation (loop_vinfo,
6447 : : scalar_dest_def_info,
6448 : : new_stmt, gsi);
6449 : :
6450 : 1025 : slp_node->push_vec_def (new_stmt);
6451 : : }
6452 : :
6453 : 840 : return true;
6454 : 840 : }
6455 : :
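/* An illustrative example (not from the source) of why the strict in-order
   evaluation above matters for floating point, assuming IEEE single
   precision with default rounding:

     float a[3] = { 1.0e8f, 3.0f, 3.0f };
     in order:      (a[0] + a[1]) + a[2]   == 1.0e8f
     reassociated:   a[0] + (a[1] + a[2])  == 1.00000008e8f

   so a fold-left reduction cannot simply be replaced by the reassociating
   tree reductions used elsewhere in this file.  */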
6456 : : /* Function is_nonwrapping_integer_induction.
6457 : :
 6458 : :    Check if STMT_VINFO (which is part of loop LOOP) both increments and
6459 : : does not cause overflow. */
6460 : :
6461 : : static bool
6462 : 377 : is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6463 : : {
6464 : 377 : gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6465 : 377 : tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6466 : 377 : tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6467 : 377 : tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6468 : 377 : widest_int ni, max_loop_value, lhs_max;
6469 : 377 : wi::overflow_type overflow = wi::OVF_NONE;
6470 : :
6471 : : /* Make sure the loop is integer based. */
6472 : 377 : if (TREE_CODE (base) != INTEGER_CST
6473 : 112 : || TREE_CODE (step) != INTEGER_CST)
6474 : : return false;
6475 : :
6476 : : /* Check that the max size of the loop will not wrap. */
6477 : :
6478 : 112 : if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6479 : : return true;
6480 : :
6481 : 8 : if (! max_stmt_executions (loop, &ni))
6482 : : return false;
6483 : :
6484 : 8 : max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6485 : 8 : &overflow);
6486 : 8 : if (overflow)
6487 : : return false;
6488 : :
6489 : 8 : max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6490 : 16 : TYPE_SIGN (lhs_type), &overflow);
6491 : 8 : if (overflow)
6492 : : return false;
6493 : :
6494 : 8 : return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6495 : 8 : <= TYPE_PRECISION (lhs_type));
6496 : 377 : }
6497 : :
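/* An illustrative example for the check above, assuming a wrapping unsigned
   char induction with BASE 0 and STEP 4:

     max_loop_value = 0 + 4 * max_stmt_executions

   With an execution bound of 63 this is 252, which still fits in 8 bits and
   the induction is accepted; with a bound of 64 it is 256, which needs 9
   bits and the induction is rejected.  */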
6498 : : /* Check if masking can be supported by inserting a conditional expression.
6499 : : CODE is the code for the operation. COND_FN is the conditional internal
6500 : : function, if it exists. VECTYPE_IN is the type of the vector input. */
6501 : : static bool
6502 : 2368 : use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
6503 : : tree vectype_in)
6504 : : {
6505 : 2368 : if (cond_fn != IFN_LAST
6506 : 2368 : && direct_internal_fn_supported_p (cond_fn, vectype_in,
6507 : : OPTIMIZE_FOR_SPEED))
6508 : : return false;
6509 : :
6510 : 2309 : if (code.is_tree_code ())
6511 : 2013 : switch (tree_code (code))
6512 : : {
6513 : : case DOT_PROD_EXPR:
6514 : : case SAD_EXPR:
6515 : : return true;
6516 : :
6517 : : default:
6518 : : break;
6519 : : }
6520 : : return false;
6521 : : }
6522 : :
6523 : : /* Insert a conditional expression to enable masked vectorization. CODE is the
6524 : : code for the operation. VOP is the array of operands. MASK is the loop
6525 : : mask. GSI is a statement iterator used to place the new conditional
6526 : : expression. */
6527 : : static void
6528 : 4 : build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
6529 : : gimple_stmt_iterator *gsi)
6530 : : {
6531 : 4 : switch (tree_code (code))
6532 : : {
6533 : 4 : case DOT_PROD_EXPR:
6534 : 4 : {
6535 : 4 : tree vectype = TREE_TYPE (vop[1]);
6536 : 4 : tree zero = build_zero_cst (vectype);
6537 : 4 : tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6538 : 4 : gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6539 : : mask, vop[1], zero);
6540 : 4 : gsi_insert_before (gsi, select, GSI_SAME_STMT);
6541 : 4 : vop[1] = masked_op1;
6542 : 4 : break;
6543 : : }
6544 : :
6545 : 0 : case SAD_EXPR:
6546 : 0 : {
6547 : 0 : tree vectype = TREE_TYPE (vop[1]);
6548 : 0 : tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6549 : 0 : gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6550 : : mask, vop[1], vop[0]);
6551 : 0 : gsi_insert_before (gsi, select, GSI_SAME_STMT);
6552 : 0 : vop[1] = masked_op1;
6553 : 0 : break;
6554 : : }
6555 : :
6556 : 0 : default:
6557 : 0 : gcc_unreachable ();
6558 : : }
6559 : 4 : }
6560 : :
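/* A minimal sketch of the effect for DOT_PROD_EXPR (illustration only):

     acc += a[i] * (mask[i] ? b[i] : 0);   // inactive lanes add nothing

   and for SAD_EXPR, selecting VOP[0] for inactive lanes gives
   |a[i] - a[i]| == 0, so they likewise leave the accumulator unchanged.  */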
 6561 : : /* Given an operation with CODE in a loop reduction path whose reduction PHI
 6562 : :    is specified by REDUC_INFO, the operation has scalar result type TYPE, and
 6563 : :    its input vectype is represented by VECTYPE_IN.  The vectype of the
 6564 : :    vectorized result may differ from VECTYPE_IN, either in base type or in
 6565 : :    number of lanes, as is the case for lane-reducing operations.  This function
 6566 : :    checks whether, and how, partial vectorization can be performed on the
 6567 : :    operation in the context of LOOP_VINFO.  */
6568 : :
6569 : : static void
6570 : 8 : vect_reduction_update_partial_vector_usage (loop_vec_info loop_vinfo,
6571 : : vect_reduc_info reduc_info,
6572 : : slp_tree slp_node,
6573 : : code_helper code, tree type,
6574 : : tree vectype_in)
6575 : : {
6576 : 8 : enum vect_reduction_type reduc_type = VECT_REDUC_INFO_TYPE (reduc_info);
6577 : 8 : internal_fn reduc_fn = VECT_REDUC_INFO_FN (reduc_info);
6578 : 8 : internal_fn cond_fn = get_conditional_internal_fn (code, type);
6579 : :
6580 : 8 : if (reduc_type != FOLD_LEFT_REDUCTION
6581 : 8 : && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
6582 : 12 : && (cond_fn == IFN_LAST
6583 : 4 : || !direct_internal_fn_supported_p (cond_fn, vectype_in,
6584 : : OPTIMIZE_FOR_SPEED)))
6585 : : {
6586 : 0 : if (dump_enabled_p ())
6587 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6588 : : "can't operate on partial vectors because"
6589 : : " no conditional operation is available.\n");
6590 : 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
6591 : : }
6592 : 8 : else if (reduc_type == FOLD_LEFT_REDUCTION
6593 : 8 : && reduc_fn == IFN_LAST
6594 : 8 : && !expand_vec_cond_expr_p (vectype_in, truth_type_for (vectype_in)))
6595 : : {
6596 : 0 : if (dump_enabled_p ())
6597 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6598 : : "can't operate on partial vectors because"
6599 : : " no conditional operation is available.\n");
6600 : 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
6601 : : }
6602 : 8 : else if (reduc_type == FOLD_LEFT_REDUCTION
6603 : 0 : && internal_fn_mask_index (reduc_fn) == -1
6604 : 0 : && FLOAT_TYPE_P (vectype_in)
6605 : 8 : && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
6606 : : {
6607 : 0 : if (dump_enabled_p ())
6608 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6609 : : "can't operate on partial vectors because"
6610 : : " signed zeros cannot be preserved.\n");
6611 : 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
6612 : : }
6613 : : else
6614 : : {
6615 : 8 : internal_fn mask_reduc_fn
6616 : 8 : = get_masked_reduction_fn (reduc_fn, vectype_in);
6617 : 8 : vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6618 : 8 : vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
6619 : 8 : unsigned nvectors = vect_get_num_copies (loop_vinfo, slp_node);
6620 : :
6621 : 8 : if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
6622 : 0 : vect_record_loop_len (loop_vinfo, lens, nvectors, vectype_in, 1);
6623 : : else
6624 : 8 : vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype_in, NULL);
6625 : : }
6626 : 8 : }
6627 : :
6628 : : /* Check if STMT_INFO is a lane-reducing operation that can be vectorized in
 6629 : :    the context of LOOP_VINFO.  The vector cost is recorded in COST_VEC, and
 6630 : :    the analysis is for SLP if SLP_NODE is not NULL.
6631 : :
 6632 : :    For a lane-reducing operation, the loop reduction path that it lies in
 6633 : :    may contain normal operations, or other lane-reducing operations with a
 6634 : :    different input type size, for example:
6635 : :
6636 : : int sum = 0;
6637 : : for (i)
6638 : : {
6639 : : ...
6640 : : sum += d0[i] * d1[i]; // dot-prod <vector(16) char>
6641 : : sum += w[i]; // widen-sum <vector(16) char>
6642 : : sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
6643 : : sum += n[i]; // normal <vector(4) int>
6644 : : ...
6645 : : }
6646 : :
6647 : : Vectorization factor is essentially determined by operation whose input
6648 : : vectype has the most lanes ("vector(16) char" in the example), while we
6649 : : need to choose input vectype with the least lanes ("vector(4) int" in the
6650 : : example) to determine effective number of vector reduction PHIs. */
6651 : :
6652 : : bool
6653 : 301943 : vectorizable_lane_reducing (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
6654 : : slp_tree slp_node, stmt_vector_for_cost *cost_vec)
6655 : : {
6656 : 301943 : gimple *stmt = stmt_info->stmt;
6657 : :
6658 : 301943 : if (!lane_reducing_stmt_p (stmt))
6659 : : return false;
6660 : :
6661 : 441 : tree type = TREE_TYPE (gimple_assign_lhs (stmt));
6662 : :
6663 : 441 : if (!INTEGRAL_TYPE_P (type))
6664 : : return false;
6665 : :
6666 : : /* Do not try to vectorize bit-precision reductions. */
6667 : 441 : if (!type_has_mode_precision_p (type))
6668 : : return false;
6669 : :
6670 : 441 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
6671 : :
6672 : : /* TODO: Support lane-reducing operation that does not directly participate
6673 : : in loop reduction. */
6674 : 441 : if (!reduc_info)
6675 : : return false;
6676 : :
6677 : : /* Lane-reducing pattern inside any inner loop of LOOP_VINFO is not
 6678 : :      recognized.  */
6679 : 441 : gcc_assert (!nested_in_vect_loop_p (LOOP_VINFO_LOOP (loop_vinfo), stmt_info));
6680 : 441 : gcc_assert (VECT_REDUC_INFO_TYPE (reduc_info) == TREE_CODE_REDUCTION);
6681 : :
6682 : 1764 : for (int i = 0; i < (int) gimple_num_ops (stmt) - 1; i++)
6683 : : {
6684 : 1323 : slp_tree slp_op;
6685 : 1323 : tree op;
6686 : 1323 : tree vectype;
6687 : 1323 : enum vect_def_type dt;
6688 : :
6689 : 1323 : if (!vect_is_simple_use (loop_vinfo, slp_node, i, &op,
6690 : : &slp_op, &dt, &vectype))
6691 : : {
6692 : 0 : if (dump_enabled_p ())
6693 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6694 : : "use not simple.\n");
6695 : 0 : return false;
6696 : : }
6697 : :
6698 : 1323 : if (!vectype)
6699 : : {
6700 : 6 : vectype = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op),
6701 : : slp_op);
6702 : 6 : if (!vectype)
6703 : : return false;
6704 : : }
6705 : :
6706 : 1323 : if (!vect_maybe_update_slp_op_vectype (slp_op, vectype))
6707 : : {
6708 : 0 : if (dump_enabled_p ())
6709 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6710 : : "incompatible vector types for invariants\n");
6711 : 0 : return false;
6712 : : }
6713 : :
6714 : 1323 : if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6715 : 441 : continue;
6716 : :
6717 : : /* There should be at most one cycle def in the stmt. */
6718 : 882 : if (VECTORIZABLE_CYCLE_DEF (dt))
6719 : : return false;
6720 : : }
6721 : :
6722 : 441 : slp_tree node_in = SLP_TREE_CHILDREN (slp_node)[0];
6723 : 441 : tree vectype_in = SLP_TREE_VECTYPE (node_in);
6724 : 441 : gcc_assert (vectype_in);
6725 : :
6726 : : /* Compute number of effective vector statements for costing. */
6727 : 441 : unsigned int ncopies_for_cost = vect_get_num_copies (loop_vinfo, node_in);
6728 : 441 : gcc_assert (ncopies_for_cost >= 1);
6729 : :
6730 : 441 : if (vect_is_emulated_mixed_dot_prod (slp_node))
6731 : : {
6732 : : /* We need two extra invariants: one that contains the minimum signed
6733 : : value and one that contains half of its negation. */
6734 : 8 : int prologue_stmts = 2;
6735 : 8 : unsigned cost = record_stmt_cost (cost_vec, prologue_stmts,
6736 : : scalar_to_vec, slp_node, 0,
6737 : : vect_prologue);
6738 : 8 : if (dump_enabled_p ())
6739 : 0 : dump_printf (MSG_NOTE, "vectorizable_lane_reducing: "
6740 : : "extra prologue_cost = %d .\n", cost);
6741 : :
6742 : : /* Three dot-products and a subtraction. */
6743 : 8 : ncopies_for_cost *= 4;
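     : :       /* This factor of four mirrors the sequence emitted by
     : :          vect_emulate_mixed_dot_prod further below: one vector subtraction
     : :          and three DOT_PROD_EXPRs per original dot-product.  */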
6744 : : }
6745 : :
6746 : 441 : record_stmt_cost (cost_vec, (int) ncopies_for_cost, vector_stmt, slp_node,
6747 : : 0, vect_body);
6748 : :
6749 : 441 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
6750 : : {
6751 : 4 : enum tree_code code = gimple_assign_rhs_code (stmt);
6752 : 4 : vect_reduction_update_partial_vector_usage (loop_vinfo, reduc_info,
6753 : 4 : node_in, code, type,
6754 : : vectype_in);
6755 : : }
6756 : :
6757 : : /* Transform via vect_transform_reduction. */
6758 : 441 : SLP_TREE_TYPE (slp_node) = reduc_vec_info_type;
6759 : 441 : return true;
6760 : : }
6761 : :
6762 : : /* Function vectorizable_reduction.
6763 : :
6764 : : Check if STMT_INFO performs a reduction operation that can be vectorized.
6765 : : If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6766 : : stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6767 : : Return true if STMT_INFO is vectorizable in this way.
6768 : :
6769 : : This function also handles reduction idioms (patterns) that have been
6770 : : recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6771 : : may be of this form:
6772 : : X = pattern_expr (arg0, arg1, ..., X)
6773 : : and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6774 : : sequence that had been detected and replaced by the pattern-stmt
6775 : : (STMT_INFO).
6776 : :
6777 : : This function also handles reduction of condition expressions, for example:
6778 : : for (int i = 0; i < N; i++)
6779 : : if (a[i] < value)
6780 : : last = a[i];
6781 : : This is handled by vectorising the loop and creating an additional vector
6782 : : containing the loop indexes for which "a[i] < value" was true. In the
6783 : : function epilogue this is reduced to a single max value and then used to
6784 : : index into the vector of results.
6785 : :
6786 : : In some cases of reduction patterns, the type of the reduction variable X is
6787 : : different than the type of the other arguments of STMT_INFO.
6788 : : In such cases, the vectype that is used when transforming STMT_INFO into
6789 : : a vector stmt is different than the vectype that is used to determine the
6790 : : vectorization factor, because it consists of a different number of elements
6791 : : than the actual number of elements that are being operated upon in parallel.
6792 : :
6793 : : For example, consider an accumulation of shorts into an int accumulator.
6794 : : On some targets it's possible to vectorize this pattern operating on 8
6795 : : shorts at a time (hence, the vectype for purposes of determining the
6796 : : vectorization factor should be V8HI); on the other hand, the vectype that
6797 : : is used to create the vector form is actually V4SI (the type of the result).
6798 : :
6799 : : Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6800 : : indicates what is the actual level of parallelism (V8HI in the example), so
6801 : : that the right vectorization factor would be derived. This vectype
6802 : : corresponds to the type of arguments to the reduction stmt, and should *NOT*
6803 : : be used to create the vectorized stmt. The right vectype for the vectorized
6804 : : stmt is obtained from the type of the result X:
6805 : : get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6806 : :
6807 : : This means that, contrary to "regular" reductions (or "regular" stmts in
6808 : : general), the following equation:
6809 : : STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6810 : : does *NOT* necessarily hold for reduction patterns. */
6811 : :
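     : :
     : :    /* A compact restatement in code form (the modes are illustrative,
     : :       assuming 128-bit vectors, not a claim about any particular target):
     : :
     : :         short s; int X;
     : :         X = widen_sum <s, X>;    // pattern stmt
     : :
     : :         STMT_VINFO_VECTYPE                                 == V8HI
     : :         get_vectype_for_scalar_type (vinfo, TREE_TYPE (X)) == V4SI
     : :
     : :       so the equation above indeed fails to hold for this pattern.  */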
6812 : : bool
6813 : 301502 : vectorizable_reduction (loop_vec_info loop_vinfo,
6814 : : stmt_vec_info stmt_info, slp_tree slp_node,
6815 : : slp_instance slp_node_instance,
6816 : : stmt_vector_for_cost *cost_vec)
6817 : : {
6818 : 301502 : tree vectype_in = NULL_TREE;
6819 : 301502 : enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6820 : 301502 : stmt_vec_info cond_stmt_vinfo = NULL;
6821 : 301502 : int i;
6822 : 301502 : int ncopies;
6823 : 301502 : bool single_defuse_cycle = false;
6824 : 301502 : tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6825 : 301502 : tree cond_reduc_val = NULL_TREE;
6826 : :
6827 : : /* Make sure it was already recognized as a reduction computation. */
6828 : 301502 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6829 : : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6830 : 301502 : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6831 : : return false;
6832 : :
6833 : : /* The reduction meta. */
6834 : 54504 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
6835 : :
6836 : 54504 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6837 : : {
6838 : 1304 : gcc_assert (is_a <gphi *> (stmt_info->stmt));
6839 : : /* We eventually need to set a vector type on invariant arguments. */
6840 : : unsigned j;
6841 : : slp_tree child;
6842 : 3912 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
6843 : 2608 : if (!vect_maybe_update_slp_op_vectype (child,
6844 : : SLP_TREE_VECTYPE (slp_node)))
6845 : : {
6846 : 0 : if (dump_enabled_p ())
6847 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6848 : : "incompatible vector types for "
6849 : : "invariants\n");
6850 : 0 : return false;
6851 : : }
6852 : : /* Analysis for double-reduction is done on the outer
6853 : : loop PHI; nested cycles have no further restrictions. */
6854 : 1304 : SLP_TREE_TYPE (slp_node) = cycle_phi_info_type;
6855 : 1304 : return true;
6856 : : }
6857 : :
6858 : 53200 : if (!is_a <gphi *> (stmt_info->stmt))
6859 : : {
6860 : 6878 : gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def);
6861 : 6878 : SLP_TREE_TYPE (slp_node) = reduc_vec_info_type;
6862 : 6878 : return true;
6863 : : }
6864 : :
6865 : 46322 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6866 : 46322 : stmt_vec_info phi_info = stmt_info;
6867 : 46322 : bool double_reduc = false;
6868 : 46322 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
6869 : : {
6870 : : /* We arrive here for both the inner loop LC PHI and the
6871 : : outer loop PHI. The latter is what we want to analyze the
6872 : : reduction with. The LC PHI is handled by vectorizable_lc_phi. */
6873 : 279 : if (gimple_bb (stmt_info->stmt) != loop->header)
6874 : 0 : return false;
6875 : :
6876 : : /* Set loop and phi_info to the inner loop. */
6877 : 279 : use_operand_p use_p;
6878 : 279 : gimple *use_stmt;
6879 : 279 : bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6880 : : &use_p, &use_stmt);
6881 : 279 : gcc_assert (res);
6882 : 279 : phi_info = loop_vinfo->lookup_stmt (use_stmt);
6883 : 279 : loop = loop->inner;
6884 : 279 : double_reduc = true;
6885 : : }
6886 : :
6887 : 46322 : const bool reduc_chain = reduc_info->is_reduc_chain;
6888 : 46322 : slp_node_instance->reduc_phis = slp_node;
6889 : : /* ??? We're leaving slp_node to point to the PHIs; we only
6890 : : need it to get at the number of vector stmts, which wasn't
6891 : : yet initialized for the instance root. */
6892 : :
6893 : : /* PHIs should not participate in patterns. */
6894 : 46322 : gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6895 : 46322 : gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6896 : :
6897 : : /* Verify that following REDUC_IDX from the latch def leads us back to the PHI
6898 : : and compute the reduction chain length. Discover the real
6899 : : reduction operation stmt on the way (slp_for_stmt_info). */
6900 : 46322 : unsigned reduc_chain_length = 0;
6901 : 46322 : stmt_info = NULL;
6902 : 46322 : slp_tree slp_for_stmt_info = NULL;
6903 : 46322 : slp_tree vdef_slp = slp_node_instance->root;
6904 : 100196 : while (vdef_slp != slp_node)
6905 : : {
6906 : 54955 : int reduc_idx = SLP_TREE_REDUC_IDX (vdef_slp);
6907 : 54955 : if (reduc_idx == -1)
6908 : : {
6909 : 398 : if (dump_enabled_p ())
6910 : 1 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6911 : : "reduction chain broken by patterns.\n");
6912 : 1081 : return false;
6913 : : }
6914 : 54557 : stmt_vec_info vdef = SLP_TREE_REPRESENTATIVE (vdef_slp);
6915 : 54557 : if (is_a <gphi *> (vdef->stmt))
6916 : : {
6917 : 550 : vdef_slp = SLP_TREE_CHILDREN (vdef_slp)[reduc_idx];
6918 : : /* Do not count PHIs towards the chain length. */
6919 : 550 : continue;
6920 : : }
6921 : 54007 : gimple_match_op op;
6922 : 54007 : if (!gimple_extract_op (vdef->stmt, &op))
6923 : : {
6924 : 0 : if (dump_enabled_p ())
6925 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6926 : : "reduction chain includes unsupported"
6927 : : " statement type.\n");
6928 : 0 : return false;
6929 : : }
6930 : 54007 : if (CONVERT_EXPR_CODE_P (op.code))
6931 : : {
6932 : 3923 : if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
6933 : : {
6934 : 683 : if (dump_enabled_p ())
6935 : 52 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6936 : : "conversion in the reduction chain.\n");
6937 : 683 : return false;
6938 : : }
6939 : 3240 : vdef_slp = SLP_TREE_CHILDREN (vdef_slp)[0];
6940 : : }
6941 : : else
6942 : : {
6943 : : /* First non-conversion stmt. */
6944 : 50084 : if (!slp_for_stmt_info)
6945 : 45924 : slp_for_stmt_info = vdef_slp;
6946 : :
6947 : 50084 : if (lane_reducing_op_p (op.code))
6948 : : {
6949 : : /* The last operand of lane-reducing operation is for
6950 : : reduction. */
6951 : 441 : gcc_assert (reduc_idx > 0 && reduc_idx == (int) op.num_ops - 1);
6952 : :
6953 : 441 : slp_tree op_node = SLP_TREE_CHILDREN (vdef_slp)[0];
6954 : 441 : tree vectype_op = SLP_TREE_VECTYPE (op_node);
6955 : 441 : tree type_op = TREE_TYPE (op.ops[0]);
6956 : 441 : if (!vectype_op)
6957 : : {
6958 : 9 : vectype_op = get_vectype_for_scalar_type (loop_vinfo,
6959 : : type_op);
6960 : 9 : if (!vectype_op
6961 : 9 : || !vect_maybe_update_slp_op_vectype (op_node,
6962 : : vectype_op))
6963 : 0 : return false;
6964 : : }
6965 : :
6966 : : /* To accommodate lane-reducing operations of mixed input
6967 : : vectypes, choose input vectype with the least lanes for the
6968 : : reduction PHI statement, which would result in the most
6969 : : ncopies for vectorized reduction results. */
6970 : 441 : if (!vectype_in
6971 : 441 : || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6972 : 46 : < GET_MODE_SIZE (SCALAR_TYPE_MODE (type_op))))
6973 : 418 : vectype_in = vectype_op;
6974 : : }
6975 : 49643 : else if (!vectype_in)
6976 : 45506 : vectype_in = SLP_TREE_VECTYPE (slp_node);
6977 : 50084 : vdef_slp = SLP_TREE_CHILDREN (vdef_slp)[reduc_idx];
6978 : : }
6979 : 53324 : reduc_chain_length++;
6980 : : }
6981 : 45241 : stmt_info = SLP_TREE_REPRESENTATIVE (slp_for_stmt_info);
6982 : :
6983 : : /* PHIs should not participate in patterns. */
6984 : 45241 : gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6985 : :
6986 : : /* 1. Is vectorizable reduction? */
6987 : : /* Not supportable if the reduction variable is used in the loop, unless
6988 : : it's a reduction chain. */
6989 : 45241 : if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6990 : 0 : && !reduc_chain)
6991 : : return false;
6992 : :
6993 : : /* Reductions that are not used even in an enclosing outer-loop
6994 : : are expected to be "live" (used out of the loop). */
6995 : 45241 : if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6996 : 0 : && !STMT_VINFO_LIVE_P (stmt_info))
6997 : : return false;
6998 : :
6999 : : /* 2. Has this been recognized as a reduction pattern?
7000 : :
7001 : : Check if STMT represents a pattern that has been recognized
7002 : : in earlier analysis stages. For stmts that represent a pattern,
7003 : : the STMT_VINFO_RELATED_STMT field records the last stmt in
7004 : : the original sequence that constitutes the pattern. */
7005 : :
7006 : 45241 : stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7007 : 45241 : if (orig_stmt_info)
7008 : : {
7009 : 3268 : gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
7010 : 3268 : gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
7011 : : }
7012 : :
7013 : : /* 3. Check the operands of the operation. The first operands are defined
7014 : : inside the loop body. The last operand is the reduction variable,
7015 : : which is defined by the loop-header-phi. */
7016 : :
7017 : 45241 : tree vectype_out = SLP_TREE_VECTYPE (slp_for_stmt_info);
7018 : 45241 : VECT_REDUC_INFO_VECTYPE (reduc_info) = vectype_out;
7019 : :
7020 : : /* We do not handle mask reductions correctly in the epilogue. */
7021 : 45241 : if (VECTOR_BOOLEAN_TYPE_P (vectype_out))
7022 : : {
7023 : 524 : if (dump_enabled_p ())
7024 : 24 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7025 : : "mask reduction not supported.\n");
7026 : 524 : return false;
7027 : : }
7028 : :
7029 : 44717 : gimple_match_op op;
7030 : 44717 : if (!gimple_extract_op (stmt_info->stmt, &op))
7031 : 0 : gcc_unreachable ();
7032 : 44717 : bool lane_reducing = lane_reducing_op_p (op.code);
7033 : :
7034 : 44717 : if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7035 : 13467 : && !SCALAR_FLOAT_TYPE_P (op.type))
7036 : : return false;
7037 : :
7038 : : /* Do not try to vectorize bit-precision reductions. */
7039 : 44717 : if (!type_has_mode_precision_p (op.type)
7040 : 381 : && op.code != BIT_AND_EXPR
7041 : 373 : && op.code != BIT_IOR_EXPR
7042 : 44775 : && op.code != BIT_XOR_EXPR)
7043 : : return false;
7044 : :
7045 : : /* Lane-reducing ops can also never be used in an SLP reduction group
7046 : : since we'll mix lanes belonging to different reductions. But it's
7047 : : OK to use them in a reduction chain or when the reduction group
7048 : : has just one element. */
7049 : 44659 : if (lane_reducing
7050 : 44659 : && !reduc_chain
7051 : 391 : && SLP_TREE_LANES (slp_node) > 1)
7052 : : {
7053 : 0 : if (dump_enabled_p ())
7054 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7055 : : "lane-reducing reduction in reduction group.\n");
7056 : 0 : return false;
7057 : : }
7058 : :
7059 : : /* All uses but the last are expected to be defined in the loop.
7060 : : The last use is the reduction variable. In case of nested cycle this
7061 : : assumption is not true: we use reduc_index to record the index of the
7062 : : reduction variable. */
7063 : 44659 : slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7064 : 44659 : tree *vectype_op = XALLOCAVEC (tree, op.num_ops);
7065 : : /* We need to skip an extra operand for COND_EXPRs with embedded
7066 : : comparison. */
7067 : 44659 : unsigned opno_adjust = 0;
7068 : 44659 : if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
7069 : 44659 : opno_adjust = 1;
7070 : 141558 : for (i = 0; i < (int) op.num_ops; i++)
7071 : : {
7072 : : /* The condition of COND_EXPR is checked in vectorizable_condition(). */
7073 : 96907 : if (i == 0 && op.code == COND_EXPR)
7074 : 48614 : continue;
7075 : :
7076 : 96160 : stmt_vec_info def_stmt_info;
7077 : 96160 : enum vect_def_type dt;
7078 : 96160 : if (!vect_is_simple_use (loop_vinfo, slp_for_stmt_info,
7079 : 96160 : i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
7080 : 96160 : &vectype_op[i], &def_stmt_info))
7081 : : {
7082 : 0 : if (dump_enabled_p ())
7083 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7084 : : "use not simple.\n");
7085 : 8 : return false;
7086 : : }
7087 : :
7088 : : /* Skip reduction operands, and for an IFN_COND_OP we might hit the
7089 : : reduction operand twice (once as definition, once as else). */
7090 : 96160 : if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
7091 : 47867 : continue;
7092 : :
7093 : : /* There should be only one cycle def in the stmt, the one
7094 : : leading to reduc_def. */
7095 : 48293 : if (VECTORIZABLE_CYCLE_DEF (dt))
7096 : : return false;
7097 : :
7098 : 48285 : if (!vectype_op[i])
7099 : 4114 : vectype_op[i]
7100 : 4114 : = get_vectype_for_scalar_type (loop_vinfo,
7101 : 4114 : TREE_TYPE (op.ops[i]), slp_op[i]);
7102 : :
7103 : : /* Record how the non-reduction-def value of COND_EXPR is defined.
7104 : : ??? For a chain of multiple CONDs we'd have to match them all up. */
7105 : 48285 : if (op.code == COND_EXPR && reduc_chain_length == 1)
7106 : : {
7107 : 720 : if (dt == vect_constant_def)
7108 : : {
7109 : 105 : cond_reduc_dt = dt;
7110 : 105 : cond_reduc_val = op.ops[i];
7111 : : }
7112 : 615 : else if (dt == vect_induction_def
7113 : 377 : && def_stmt_info
7114 : 992 : && is_nonwrapping_integer_induction (def_stmt_info, loop))
7115 : : {
7116 : 112 : cond_reduc_dt = dt;
7117 : 112 : cond_stmt_vinfo = def_stmt_info;
7118 : : }
7119 : : }
7120 : : }
7121 : :
7122 : 44651 : enum vect_reduction_type reduction_type = VECT_REDUC_INFO_TYPE (reduc_info);
7123 : : /* If we have a condition reduction, see if we can simplify it further. */
7124 : 44651 : if (reduction_type == COND_REDUCTION)
7125 : : {
7126 : 735 : if (SLP_TREE_LANES (slp_node) != 1)
7127 : : return false;
7128 : :
7129 : : /* When the condition itself uses the reduction value, fail. */
7130 : 729 : if (SLP_TREE_REDUC_IDX (slp_node) == 0)
7131 : : {
7132 : 0 : if (dump_enabled_p ())
7133 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7134 : : "condition depends on previous iteration\n");
7135 : 0 : return false;
7136 : : }
7137 : :
7138 : 729 : if (reduc_chain_length == 1
7139 : 729 : && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in,
7140 : : OPTIMIZE_FOR_SPEED)
7141 : 702 : || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
7142 : : vectype_in,
7143 : : OPTIMIZE_FOR_SPEED)))
7144 : : {
7145 : 0 : if (dump_enabled_p ())
7146 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7147 : : "optimizing condition reduction with"
7148 : : " FOLD_EXTRACT_LAST.\n");
7149 : 0 : VECT_REDUC_INFO_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7150 : : }
7151 : 729 : else if (cond_reduc_dt == vect_induction_def)
7152 : : {
7153 : 112 : tree base
7154 : : = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7155 : 112 : tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7156 : :
7157 : 112 : gcc_assert (TREE_CODE (base) == INTEGER_CST
7158 : : && TREE_CODE (step) == INTEGER_CST);
7159 : 112 : cond_reduc_val = NULL_TREE;
7160 : 112 : enum tree_code cond_reduc_op_code = ERROR_MARK;
7161 : 112 : tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7162 : 112 : if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7163 : : ;
7164 : : /* Find a suitable value: below base for MAX_EXPR, above base for
7165 : : MIN_EXPR; for now, punt if base is the minimum value of the type
7166 : : for MAX_EXPR or the maximum value of the type for MIN_EXPR. */
7167 : 100 : else if (tree_int_cst_sgn (step) == -1)
7168 : : {
7169 : 20 : cond_reduc_op_code = MIN_EXPR;
7170 : 20 : if (tree_int_cst_sgn (base) == -1)
7171 : 0 : cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7172 : 20 : else if (tree_int_cst_lt (base,
7173 : 20 : TYPE_MAX_VALUE (TREE_TYPE (base))))
7174 : 20 : cond_reduc_val
7175 : 20 : = int_const_binop (PLUS_EXPR, base, integer_one_node);
7176 : : }
7177 : : else
7178 : : {
7179 : 80 : cond_reduc_op_code = MAX_EXPR;
7180 : 80 : if (tree_int_cst_sgn (base) == 1)
7181 : 0 : cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7182 : 80 : else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7183 : : base))
7184 : 80 : cond_reduc_val
7185 : 80 : = int_const_binop (MINUS_EXPR, base, integer_one_node);
7186 : : }
7187 : 100 : if (cond_reduc_val)
7188 : : {
7189 : 100 : if (dump_enabled_p ())
7190 : 64 : dump_printf_loc (MSG_NOTE, vect_location,
7191 : : "condition expression based on "
7192 : : "integer induction.\n");
7193 : 100 : VECT_REDUC_INFO_CODE (reduc_info) = cond_reduc_op_code;
7194 : 100 : VECT_REDUC_INFO_INDUC_COND_INITIAL_VAL (reduc_info)
7195 : 100 : = cond_reduc_val;
7196 : 100 : VECT_REDUC_INFO_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7197 : : }
7198 : : }
7199 : 617 : else if (cond_reduc_dt == vect_constant_def)
7200 : : {
7201 : 99 : enum vect_def_type cond_initial_dt;
7202 : 99 : tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7203 : 99 : vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7204 : 99 : if (cond_initial_dt == vect_constant_def
7205 : 121 : && types_compatible_p (TREE_TYPE (cond_initial_val),
7206 : 22 : TREE_TYPE (cond_reduc_val)))
7207 : : {
7208 : 22 : tree e = fold_binary (LE_EXPR, boolean_type_node,
7209 : : cond_initial_val, cond_reduc_val);
7210 : 22 : if (e && (integer_onep (e) || integer_zerop (e)))
7211 : : {
7212 : 22 : if (dump_enabled_p ())
7213 : 16 : dump_printf_loc (MSG_NOTE, vect_location,
7214 : : "condition expression based on "
7215 : : "compile time constant.\n");
7216 : : /* Record reduction code at analysis stage. */
7217 : 22 : VECT_REDUC_INFO_CODE (reduc_info)
7218 : 22 : = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7219 : 22 : VECT_REDUC_INFO_TYPE (reduc_info) = CONST_COND_REDUCTION;
7220 : : }
7221 : : }
7222 : : }
7223 : : }
7224 : :
7225 : 44645 : if (STMT_VINFO_LIVE_P (phi_info))
7226 : : return false;
7227 : :
7228 : 44645 : ncopies = vect_get_num_copies (loop_vinfo, slp_node);
7229 : :
7230 : 44645 : gcc_assert (ncopies >= 1);
7231 : :
7232 : 44645 : poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7233 : :
7234 : : /* 4.2. Check support for the epilog operation.
7235 : :
7236 : : If STMT represents a reduction pattern, then the type of the
7237 : : reduction variable may be different than the type of the rest
7238 : : of the arguments. For example, consider the case of accumulation
7239 : : of shorts into an int accumulator; The original code:
7240 : : S1: int_a = (int) short_a;
7241 : : orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
7242 : :
7243 : : was replaced with:
7244 : : STMT: int_acc = widen_sum <short_a, int_acc>
7245 : :
7246 : : This means that:
7247 : : 1. The tree-code that is used to create the vector operation in the
7248 : : epilog code (that reduces the partial results) is not the
7249 : : tree-code of STMT, but is rather the tree-code of the original
7250 : : stmt from the pattern that STMT is replacing. I.e, in the example
7251 : : above we want to use 'widen_sum' in the loop, but 'plus' in the
7252 : : epilog.
7253 : : 2. The type (mode) we use to check available target support
7254 : : for the vector operation to be created in the *epilog*, is
7255 : : determined by the type of the reduction variable (in the example
7256 : : above we'd check this: optab_handler (plus_optab, vect_int_mode])).
7257 : : However the type (mode) we use to check available target support
7258 : : for the vector operation to be created *inside the loop*, is
7259 : : determined by the type of the other arguments to STMT (in the
7260 : : example we'd check this: optab_handler (widen_sum_optab,
7261 : : vect_short_mode)).
7262 : :
7263 : : This is contrary to "regular" reductions, in which the types of all
7264 : : the arguments are the same as the type of the reduction variable.
7265 : : For "regular" reductions we can therefore use the same vector type
7266 : : (and also the same tree-code) when generating the epilog code and
7267 : : when generating the code inside the loop. */
7268 : :
7269 : 44645 : code_helper orig_code = VECT_REDUC_INFO_CODE (reduc_info);
7270 : :
7271 : : /* If-conversion might have created a conditional operation like
7272 : : IFN_COND_ADD already. Use the internal code for the following checks. */
7273 : 44645 : if (orig_code.is_internal_fn ())
7274 : : {
7275 : 3264 : tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
7276 : 3264 : orig_code = new_code != ERROR_MARK ? new_code : orig_code;
7277 : : }
7278 : :
7279 : 44645 : VECT_REDUC_INFO_CODE (reduc_info) = orig_code;
7280 : :
7281 : 44645 : reduction_type = VECT_REDUC_INFO_TYPE (reduc_info);
7282 : 44645 : if (reduction_type == TREE_CODE_REDUCTION)
7283 : : {
7284 : : /* Check whether it's ok to change the order of the computation.
7285 : : Generally, when vectorizing a reduction we change the order of the
7286 : : computation. This may change the behavior of the program in some
7287 : : cases, so we need to check that this is ok. One exception is when
7288 : : vectorizing an outer-loop: the inner-loop is executed sequentially,
7289 : : and therefore vectorizing reductions in the inner-loop during
7290 : : outer-loop vectorization is safe. Likewise when we are vectorizing
7291 : : a series of reductions using SLP and the VF is one, the reductions
7292 : : are performed in scalar order. */
7293 : 43916 : if (!reduc_chain
7294 : 43916 : && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7295 : : ;
7296 : 43802 : else if (needs_fold_left_reduction_p (op.type, orig_code))
7297 : : {
7298 : : /* When vectorizing a reduction chain w/o SLP the reduction PHI
7299 : : is not directly used in stmt. */
7300 : 5018 : if (reduc_chain_length != 1)
7301 : : {
7302 : 67 : if (dump_enabled_p ())
7303 : 12 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7304 : : "in-order reduction chain without SLP.\n");
7305 : 67 : return false;
7306 : : }
7307 : : /* Code generation doesn't support function calls other
7308 : : than .COND_*. */
7309 : 4951 : if (!op.code.is_tree_code ()
7310 : 5053 : && !(op.code.is_internal_fn ()
7311 : 51 : && conditional_internal_fn_code (internal_fn (op.code))
7312 : : != ERROR_MARK))
7313 : : {
7314 : 10 : if (dump_enabled_p ())
7315 : 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7316 : : "in-order reduction chain operation not "
7317 : : "supported.\n");
7318 : 10 : return false;
7319 : : }
7320 : 4941 : VECT_REDUC_INFO_TYPE (reduc_info)
7321 : 4941 : = reduction_type = FOLD_LEFT_REDUCTION;
7322 : : }
7323 : 38784 : else if (!commutative_binary_op_p (orig_code, op.type)
7324 : 38784 : || !associative_binary_op_p (orig_code, op.type))
7325 : : {
7326 : 160 : if (dump_enabled_p ())
7327 : 20 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7328 : : "reduction: not commutative/associative\n");
7329 : 160 : return false;
7330 : : }
7331 : : }
7332 : :
7333 : 4941 : if ((reduction_type == COND_REDUCTION
7334 : : || reduction_type == INTEGER_INDUC_COND_REDUCTION
7335 : : || reduction_type == CONST_COND_REDUCTION
7336 : 39467 : || reduction_type == EXTRACT_LAST_REDUCTION)
7337 : : && 1
7338 : 729 : && ncopies > 1)
7339 : : {
7340 : 310 : if (dump_enabled_p ())
7341 : 64 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7342 : : "multiple types in condition reduction.\n");
7343 : 310 : return false;
7344 : : }
7345 : :
7346 : 44098 : internal_fn reduc_fn = IFN_LAST;
7347 : 44098 : if (reduction_type == TREE_CODE_REDUCTION
7348 : 44098 : || reduction_type == FOLD_LEFT_REDUCTION
7349 : : || reduction_type == INTEGER_INDUC_COND_REDUCTION
7350 : 419 : || reduction_type == CONST_COND_REDUCTION)
7351 : : {
7352 : 38852 : if (reduction_type == FOLD_LEFT_REDUCTION
7353 : 47944 : ? fold_left_reduction_fn (orig_code, &reduc_fn)
7354 : 38852 : : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7355 : : {
7356 : 43117 : if (reduc_fn != IFN_LAST
7357 : 43117 : && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7358 : : OPTIMIZE_FOR_SPEED))
7359 : : {
7360 : 9388 : if (dump_enabled_p ())
7361 : 788 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7362 : : "reduc op not supported by target.\n");
7363 : :
7364 : 9388 : reduc_fn = IFN_LAST;
7365 : : }
7366 : : }
7367 : : else
7368 : : {
7369 : 676 : if (dump_enabled_p ())
7370 : 48 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7371 : : "no reduc code for scalar code.\n");
7372 : :
7373 : 676 : return false;
7374 : : }
7375 : : }
7376 : 305 : else if (reduction_type == COND_REDUCTION)
7377 : : {
7378 : 305 : int scalar_precision
7379 : 305 : = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
7380 : 305 : cr_index_scalar_type = make_unsigned_type (scalar_precision);
7381 : 305 : cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7382 : : vectype_out);
7383 : :
7384 : 305 : if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7385 : : OPTIMIZE_FOR_SPEED))
7386 : 7 : reduc_fn = IFN_REDUC_MAX;
7387 : : }
7388 : 43422 : VECT_REDUC_INFO_FN (reduc_info) = reduc_fn;
7389 : :
7390 : 43422 : if (reduction_type != EXTRACT_LAST_REDUCTION
7391 : : && reduc_fn == IFN_LAST
7392 : : && !nunits_out.is_constant ())
7393 : : {
7394 : : if (dump_enabled_p ())
7395 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7396 : : "missing target support for reduction on"
7397 : : " variable-length vectors.\n");
7398 : : return false;
7399 : : }
7400 : :
7401 : : /* For SLP reductions, see if there is a neutral value we can use. */
7402 : 43422 : tree neutral_op = NULL_TREE;
7403 : 43422 : tree initial_value = NULL_TREE;
7404 : 43422 : if (reduc_chain)
7405 : 2288 : initial_value = vect_phi_initial_value (reduc_def_phi);
7406 : 43422 : neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7407 : : orig_code, initial_value);
7408 : :
7409 : 43422 : if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7410 : : {
7411 : : /* We can't support in-order reductions of code such as this:
7412 : :
7413 : : for (int i = 0; i < n1; ++i)
7414 : : for (int j = 0; j < n2; ++j)
7415 : : l += a[j];
7416 : :
7417 : : since GCC effectively transforms the loop when vectorizing:
7418 : :
7419 : : for (int i = 0; i < n1 / VF; ++i)
7420 : : for (int j = 0; j < n2; ++j)
7421 : : for (int k = 0; k < VF; ++k)
7422 : : l += a[j];
7423 : :
7424 : : which is a reassociation of the original operation. */
7425 : 56 : if (dump_enabled_p ())
7426 : 20 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7427 : : "in-order double reduction not supported.\n");
7428 : :
7429 : 56 : return false;
7430 : : }
7431 : :
7432 : 43366 : if (reduction_type == FOLD_LEFT_REDUCTION
7433 : 4209 : && SLP_TREE_LANES (slp_node) > 1
7434 : 104 : && !reduc_chain)
7435 : : {
7436 : : /* We cannot use in-order reductions in this case because there is
7437 : : an implicit reassociation of the operations involved. */
7438 : 42 : if (dump_enabled_p ())
7439 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7440 : : "in-order unchained SLP reductions not supported.\n");
7441 : 42 : return false;
7442 : : }
7443 : :
7444 : : /* For double reductions, and for SLP reductions with a neutral value,
7445 : : we construct a variable-length initial vector by loading a vector
7446 : : full of the neutral value and then shift-and-inserting the start
7447 : : values into the low-numbered elements. */
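     : :    /* A sketch of that construction (assuming a PLUS reduction, so the
     : :       neutral value is 0, and a single start value S): splat 0 into a
     : :       vector and IFN_VEC_SHL_INSERT S into it, giving { S, 0, 0, ... }
     : :       for whatever the runtime vector length turns out to be.  */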
7448 : 43324 : if ((double_reduc || neutral_op)
7449 : : && !nunits_out.is_constant ()
7450 : : && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7451 : : vectype_out, OPTIMIZE_FOR_SPEED))
7452 : : {
7453 : : if (dump_enabled_p ())
7454 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7455 : : "reduction on variable-length vectors requires"
7456 : : " target support for a vector-shift-and-insert"
7457 : : " operation.\n");
7458 : : return false;
7459 : : }
7460 : :
7461 : : /* Check extra constraints for variable-length unchained SLP reductions. */
7462 : 43324 : if (!reduc_chain
7463 : : && !nunits_out.is_constant ())
7464 : : {
7465 : : /* We checked above that we could build the initial vector when
7466 : : there's a neutral element value. Check here for the case in
7467 : : which each SLP statement has its own initial value and in which
7468 : : that value needs to be repeated for every instance of the
7469 : : statement within the initial vector. */
7470 : : unsigned int group_size = SLP_TREE_LANES (slp_node);
7471 : : if (!neutral_op
7472 : : && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
7473 : : TREE_TYPE (vectype_out)))
7474 : : {
7475 : : if (dump_enabled_p ())
7476 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7477 : : "unsupported form of SLP reduction for"
7478 : : " variable-length vectors: cannot build"
7479 : : " initial vector.\n");
7480 : : return false;
7481 : : }
7482 : : /* The epilogue code relies on the number of elements being a multiple
7483 : : of the group size. The duplicate-and-interleave approach to setting
7484 : : up the initial vector does too. */
7485 : : if (!multiple_p (nunits_out, group_size))
7486 : : {
7487 : : if (dump_enabled_p ())
7488 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7489 : : "unsupported form of SLP reduction for"
7490 : : " variable-length vectors: the vector size"
7491 : : " is not a multiple of the number of results.\n");
7492 : : return false;
7493 : : }
7494 : : }
7495 : :
7496 : 43324 : if (reduction_type == COND_REDUCTION)
7497 : : {
7498 : 305 : widest_int ni;
7499 : :
7500 : 305 : if (! max_loop_iterations (loop, &ni))
7501 : : {
7502 : 0 : if (dump_enabled_p ())
7503 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
7504 : : "loop count not known, cannot create cond "
7505 : : "reduction.\n");
7506 : 0 : return false;
7507 : : }
7508 : : /* Convert backedges to iterations. */
7509 : 305 : ni += 1;
7510 : :
7511 : : /* The additional index will be the same type as the condition. Check
7512 : : that the loop iteration count fits into this type less one (because
7513 : : we'll use up the zero slot for when there are no matches). */
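     : :    /* Worked example with illustrative numbers: if CR_INDEX_SCALAR_TYPE is
     : :       a 16-bit unsigned type, MAX_INDEX is 65535, so the check below
     : :       requires NI to be less than 65535, i.e. at most 65534 iterations,
     : :       keeping index zero free to mean "no match".  */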
7514 : 305 : tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7515 : 305 : if (wi::geu_p (ni, wi::to_widest (max_index)))
7516 : : {
7517 : 90 : if (dump_enabled_p ())
7518 : 54 : dump_printf_loc (MSG_NOTE, vect_location,
7519 : : "loop size is greater than data size.\n");
7520 : 90 : return false;
7521 : : }
7522 : 305 : }
7523 : :
7524 : : /* In case the vectorization factor (VF) is bigger than the number
7525 : : of elements that we can fit in a vectype (nunits), we have to generate
7526 : : more than one vector stmt, i.e. we need to "unroll" the
7527 : : vector stmt by a factor VF/nunits. For more details see documentation
7528 : : in vectorizable_operation. */
7529 : :
7530 : : /* If the reduction is used in an outer loop we need to generate
7531 : : VF intermediate results, like so (e.g. for ncopies=2):
7532 : : r0 = phi (init, r0)
7533 : : r1 = phi (init, r1)
7534 : : r0 = x0 + r0;
7535 : : r1 = x1 + r1;
7536 : : (i.e. we generate VF results in 2 registers).
7537 : : In this case we have a separate def-use cycle for each copy, and therefore
7538 : : for each copy we get the vector def for the reduction variable from the
7539 : : respective phi node created for this copy.
7540 : :
7541 : : Otherwise (the reduction is unused in the loop nest), we can combine
7542 : : together intermediate results, like so (e.g. for ncopies=2):
7543 : : r = phi (init, r)
7544 : : r = x0 + r;
7545 : : r = x1 + r;
7546 : : (i.e. we generate VF/2 results in a single register).
7547 : : In this case for each copy we get the vector def for the reduction variable
7548 : : from the vectorized reduction operation generated in the previous iteration.
7549 : :
7550 : : This only works when we see both the reduction PHI and its only consumer
7551 : : in vectorizable_reduction and there are no intermediate stmts
7552 : : participating. When unrolling we want each unrolled iteration to have its
7553 : : own reduction accumulator since one of the main goals of unrolling a
7554 : : reduction is to reduce the aggregate loop-carried latency. */
7555 : 43234 : if (ncopies > 1
7556 : 43234 : && !reduc_chain
7557 : 5019 : && SLP_TREE_LANES (slp_node) == 1
7558 : 4942 : && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7559 : 4925 : && reduc_chain_length == 1
7560 : 4617 : && loop_vinfo->suggested_unroll_factor == 1)
7561 : 43234 : single_defuse_cycle = true;
7562 : :
7563 : 43234 : if (single_defuse_cycle && !lane_reducing)
7564 : : {
7565 : 4063 : gcc_assert (op.code != COND_EXPR);
7566 : :
7567 : : /* 4. check support for the operation in the loop
7568 : :
7569 : : This isn't necessary for the lane reduction codes, since they
7570 : : can only be produced by pattern matching, and it's up to the
7571 : : pattern matcher to test for support. The main reason for
7572 : : specifically skipping this step is to avoid rechecking whether
7573 : : mixed-sign dot-products can be implemented using signed
7574 : : dot-products. */
7575 : 4063 : machine_mode vec_mode = TYPE_MODE (vectype_in);
7576 : 4063 : if (!directly_supported_p (op.code, vectype_in, optab_vector))
7577 : : {
7578 : 781 : if (dump_enabled_p ())
7579 : 10 : dump_printf (MSG_NOTE, "op not supported by target.\n");
7580 : 1562 : if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7581 : 781 : || !vect_can_vectorize_without_simd_p (op.code))
7582 : : single_defuse_cycle = false;
7583 : : else
7584 : 5 : if (dump_enabled_p ())
7585 : 0 : dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7586 : : }
7587 : :
7588 : 4063 : if (vect_emulated_vector_p (vectype_in)
7589 : 4063 : && !vect_can_vectorize_without_simd_p (op.code))
7590 : : {
7591 : 0 : if (dump_enabled_p ())
7592 : 0 : dump_printf (MSG_NOTE, "using word mode not possible.\n");
7593 : 0 : return false;
7594 : : }
7595 : : }
7596 : 43234 : if (dump_enabled_p () && single_defuse_cycle)
7597 : 638 : dump_printf_loc (MSG_NOTE, vect_location,
7598 : : "using single def-use cycle for reduction by reducing "
7599 : : "multiple vectors to one in the loop body\n");
7600 : 43234 : VECT_REDUC_INFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7601 : :
7602 : : /* For a lane-reducing operation, the processing below related to the
7603 : : single def-use cycle is done in its own vectorizable function. One
7604 : : more thing to note is that such an operation must not be involved in
7605 : : a fold-left reduction. */
7606 : 43234 : single_defuse_cycle &= !lane_reducing;
7607 : :
7608 : 43234 : if (single_defuse_cycle || reduction_type == FOLD_LEFT_REDUCTION)
7609 : 23452 : for (i = 0; i < (int) op.num_ops; i++)
7610 : 16050 : if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
7611 : : {
7612 : 0 : if (dump_enabled_p ())
7613 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7614 : : "incompatible vector types for invariants\n");
7615 : 0 : return false;
7616 : : }
7617 : :
7618 : 43234 : vect_model_reduction_cost (loop_vinfo, slp_for_stmt_info, reduc_fn,
7619 : : reduction_type, ncopies, cost_vec);
7620 : : /* Cost the reduction op inside the loop if transformed via
7621 : : vect_transform_reduction for non-lane-reducing operation. Otherwise
7622 : : this is costed by the separate vectorizable_* routines. */
7623 : 43234 : if (single_defuse_cycle)
7624 : 3287 : record_stmt_cost (cost_vec, ncopies, vector_stmt,
7625 : : slp_for_stmt_info, 0, vect_body);
7626 : :
7627 : 43234 : if (dump_enabled_p ()
7628 : 43234 : && reduction_type == FOLD_LEFT_REDUCTION)
7629 : 202 : dump_printf_loc (MSG_NOTE, vect_location,
7630 : : "using an in-order (fold-left) reduction.\n");
7631 : 43234 : SLP_TREE_TYPE (slp_node) = cycle_phi_info_type;
7632 : :
7633 : : /* All but single def-use cycle optimized and fold-left reductions go
7634 : : through their own vectorizable_* routines. */
7635 : 43234 : stmt_vec_info tem
7636 : 43234 : = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (slp_node_instance));
7637 : 43234 : if (!single_defuse_cycle && reduction_type != FOLD_LEFT_REDUCTION)
7638 : 35832 : STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7639 : : else
7640 : : {
7641 : 7402 : STMT_VINFO_DEF_TYPE (tem) = vect_reduction_def;
7642 : 7402 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7643 : 4 : vect_reduction_update_partial_vector_usage (loop_vinfo, reduc_info,
7644 : : slp_node, op.code, op.type,
7645 : : vectype_in);
7646 : : }
7647 : : return true;
7648 : : }
7649 : :
7650 : : /* STMT_INFO is a dot-product reduction whose multiplication operands
7651 : : have different signs. Emit a sequence to emulate the operation
7652 : : using a series of signed DOT_PROD_EXPRs and return the last
7653 : : statement generated. VEC_DEST is the result of the vector operation
7654 : : and VOP lists its inputs. */
7655 : :
7656 : : static gassign *
7657 : 2 : vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
7658 : : gimple_stmt_iterator *gsi, tree vec_dest,
7659 : : tree vop[3])
7660 : : {
7661 : 2 : tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
7662 : 2 : tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
7663 : 2 : tree narrow_elttype = TREE_TYPE (narrow_vectype);
7664 : 2 : gimple *new_stmt;
7665 : :
7666 : : /* Make VOP[0] the unsigned operand and VOP[1] the signed operand. */
7667 : 2 : if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
7668 : 0 : std::swap (vop[0], vop[1]);
7669 : :
7670 : : /* Convert all inputs to signed types. */
7671 : 8 : for (int i = 0; i < 3; ++i)
7672 : 6 : if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
7673 : : {
7674 : 2 : tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
7675 : 2 : new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
7676 : 2 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7677 : 2 : vop[i] = tmp;
7678 : : }
7679 : :
7680 : : /* In the comments below we assume 8-bit inputs for simplicity,
7681 : : but the approach works for any full integer type. */
7682 : :
7683 : : /* Create a vector of -128. */
7684 : 2 : tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
7685 : 2 : tree min_narrow = build_vector_from_val (narrow_vectype,
7686 : : min_narrow_elttype);
7687 : :
7688 : : /* Create a vector of 64. */
7689 : 2 : auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
7690 : 2 : tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
7691 : 2 : half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
7692 : :
7693 : : /* Emit: SUB_RES = VOP[0] - 128. */
7694 : 2 : tree sub_res = make_ssa_name (narrow_vectype);
7695 : 2 : new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
7696 : 2 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7697 : :
7698 : : /* Emit:
7699 : :
7700 : : STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
7701 : : STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
7702 : : STAGE3 = DOT_PROD_EXPR <SUB_RES, VOP[1], STAGE2>;
7703 : :
7704 : : on the basis that x * y == (x - 128) * y + 64 * y + 64 * y
7705 : : Doing the two 64 * y steps first allows more time to compute x. */
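     : :   /* The two 64 * y terms stand in for a single 128 * y term because, on
     : :      the 8-bit assumption above, 128 is not representable in the signed
     : :      range [-128, 127] while 64 is; likewise x - 128 maps the unsigned
     : :      range [0, 255] into that signed range.  */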
7706 : 2 : tree stage1 = make_ssa_name (wide_vectype);
7707 : 2 : new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
7708 : : vop[1], half_narrow, vop[2]);
7709 : 2 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7710 : :
7711 : 2 : tree stage2 = make_ssa_name (wide_vectype);
7712 : 2 : new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
7713 : : vop[1], half_narrow, stage1);
7714 : 2 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7715 : :
7716 : 2 : tree stage3 = make_ssa_name (wide_vectype);
7717 : 2 : new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
7718 : : sub_res, vop[1], stage2);
7719 : 2 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7720 : :
7721 : : /* Convert STAGE3 to the reduction type. */
7722 : 2 : return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
7723 : 2 : }
7724 : :
7725 : : /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7726 : : value. */
7727 : :
7728 : : bool
7729 : 2360 : vect_transform_reduction (loop_vec_info loop_vinfo,
7730 : : stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
7731 : : slp_tree slp_node)
7732 : : {
7733 : 2360 : tree vectype_out = SLP_TREE_VECTYPE (slp_node);
7734 : 2360 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7735 : 2360 : unsigned vec_num;
7736 : :
7737 : 2360 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
7738 : :
7739 : 2360 : if (nested_in_vect_loop_p (loop, stmt_info))
7740 : : {
7741 : 0 : loop = loop->inner;
7742 : 0 : gcc_assert (VECT_REDUC_INFO_DEF_TYPE (reduc_info)
7743 : : == vect_double_reduction_def);
7744 : : }
7745 : :
7746 : 2360 : gimple_match_op op;
7747 : 2360 : if (!gimple_extract_op (stmt_info->stmt, &op))
7748 : 0 : gcc_unreachable ();
7749 : :
7750 : : /* All uses but the last are expected to be defined in the loop.
7751 : : The last use is the reduction variable. In case of nested cycle this
7752 : : assumption is not true: we use reduc_index to record the index of the
7753 : : reduction variable. */
7754 : 2360 : int reduc_index = SLP_TREE_REDUC_IDX (slp_node);
7755 : 2360 : tree vectype_in = SLP_TREE_VECTYPE (slp_node);
7756 : 2360 : if (lane_reducing_op_p (op.code))
7757 : 244 : vectype_in = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (slp_node)[0]);
7758 : :
7759 : 2360 : vec_num = vect_get_num_copies (loop_vinfo, SLP_TREE_CHILDREN (slp_node)[0]);
7760 : :
7761 : 2360 : code_helper code = canonicalize_code (op.code, op.type);
7762 : 2360 : internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
7763 : :
7764 : 2360 : vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7765 : 2360 : vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
7766 : 2360 : bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7767 : :
7768 : : /* Transform. */
7769 : 2360 : tree new_temp = NULL_TREE;
7770 : 16520 : auto_vec<tree> vec_oprnds[3];
7771 : :
7772 : 2360 : if (dump_enabled_p ())
7773 : 700 : dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7774 : :
7775 : : /* A binary COND_OP reduction must have the same definition and else
7776 : : value. */
7777 : 2656 : bool cond_fn_p = code.is_internal_fn ()
7778 : 296 : && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
7779 : 296 : if (cond_fn_p)
7780 : : {
7781 : 296 : gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
7782 : : || code == IFN_COND_MUL || code == IFN_COND_AND
7783 : : || code == IFN_COND_IOR || code == IFN_COND_XOR
7784 : : || code == IFN_COND_MIN || code == IFN_COND_MAX);
7785 : 296 : gcc_assert (op.num_ops == 4
7786 : : && (op.ops[reduc_index]
7787 : : == op.ops[internal_fn_else_index ((internal_fn) code)]));
7788 : : }
7789 : :
7790 : 2360 : bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7791 : :
7792 : 2360 : vect_reduction_type reduction_type = VECT_REDUC_INFO_TYPE (reduc_info);
7793 : 2360 : if (reduction_type == FOLD_LEFT_REDUCTION)
7794 : : {
7795 : 840 : internal_fn reduc_fn = VECT_REDUC_INFO_FN (reduc_info);
7796 : 840 : gcc_assert (code.is_tree_code () || cond_fn_p);
7797 : 840 : return vectorize_fold_left_reduction
7798 : 840 : (loop_vinfo, stmt_info, gsi, slp_node,
7799 : 840 : code, reduc_fn, op.num_ops, vectype_in,
7800 : 840 : reduc_index, masks, lens);
7801 : : }
7802 : :
7803 : 1520 : bool single_defuse_cycle = VECT_REDUC_INFO_FORCE_SINGLE_CYCLE (reduc_info);
7804 : 1520 : bool lane_reducing = lane_reducing_op_p (code);
7805 : 1276 : gcc_assert (single_defuse_cycle || lane_reducing);
7806 : :
7807 : 1520 : if (lane_reducing)
7808 : : {
7809 : : /* The last operand of lane-reducing op is for reduction. */
7810 : 244 : gcc_assert (reduc_index == (int) op.num_ops - 1);
7811 : : }
7812 : :
7813 : : /* Create the destination vector */
7814 : 1520 : tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
7815 : 1520 : tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7816 : :
7817 : : /* Get NCOPIES vector definitions for all operands except the reduction
7818 : : definition. */
7819 : 1520 : if (!cond_fn_p)
7820 : : {
7821 : 1233 : gcc_assert (reduc_index >= 0 && reduc_index <= 2);
7822 : 2055 : vect_get_vec_defs (loop_vinfo, slp_node,
7823 : 1233 : single_defuse_cycle && reduc_index == 0
7824 : : ? NULL_TREE : op.ops[0], &vec_oprnds[0],
7825 : 1233 : single_defuse_cycle && reduc_index == 1
7826 : : ? NULL_TREE : op.ops[1], &vec_oprnds[1],
7827 : 1233 : op.num_ops == 3
7828 : 244 : && !(single_defuse_cycle && reduc_index == 2)
7829 : : ? op.ops[2] : NULL_TREE, &vec_oprnds[2]);
7830 : : }
7831 : : else
7832 : : {
7833 : : /* For a conditional operation pass the truth type as mask
7834 : : vectype. */
7835 : 287 : gcc_assert (single_defuse_cycle
7836 : : && (reduc_index == 1 || reduc_index == 2));
7837 : 287 : vect_get_vec_defs (loop_vinfo, slp_node, op.ops[0],
7838 : : &vec_oprnds[0],
7839 : : reduc_index == 1 ? NULL_TREE : op.ops[1],
7840 : : &vec_oprnds[1],
7841 : : reduc_index == 2 ? NULL_TREE : op.ops[2],
7842 : : &vec_oprnds[2]);
7843 : : }
7844 : :
7845 : : /* For single def-use cycles get one copy of the vectorized reduction
7846 : : definition. */
7847 : 1520 : if (single_defuse_cycle)
7848 : : {
7849 : 1440 : vect_get_vec_defs (loop_vinfo, slp_node,
7850 : : reduc_index == 0 ? op.ops[0] : NULL_TREE,
7851 : : &vec_oprnds[0],
7852 : : reduc_index == 1 ? op.ops[1] : NULL_TREE,
7853 : : &vec_oprnds[1],
7854 : : reduc_index == 2 ? op.ops[2] : NULL_TREE,
7855 : : &vec_oprnds[2]);
7856 : : }
7857 : 80 : else if (lane_reducing)
7858 : : {
7859 : : /* For a normal reduction, consistency between the vectorized def/use is
7860 : : naturally ensured when mapping from the scalar statement. But if a
7861 : : lane-reducing op is involved in the reduction, things become somewhat
7862 : : complicated in that the op's result and its accumulation operand are
7863 : : limited to fewer lanes than the other operands, which causes a
7864 : : def/use mismatch on adjacent statements around the op unless we make
7865 : : some specific adjustment. One approach is to refit the lane-reducing
7866 : : op by introducing new trivial pass-through copies to fix the possible
7867 : : def/use gap, so as to make it behave like a normal op. Vector
7868 : : reduction PHIs are always generated to the full extent, whether a
7869 : : lane-reducing op exists or not. If some copies or PHIs turn out to be
7870 : : superfluous, they are cleaned up by passes after vectorization. An
7871 : : example for single-lane SLP, with lane-reducing ops of mixed input
7872 : : vectypes in a reduction chain, is given below. The same handling is
7873 : : applicable to multiple-lane SLP as well.
7874 : :
7875 : : int sum = 1;
7876 : : for (i)
7877 : : {
7878 : : sum += d0[i] * d1[i]; // dot-prod <vector(16) char>
7879 : : sum += w[i]; // widen-sum <vector(16) char>
7880 : : sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
7881 : : sum += n[i]; // normal <vector(4) int>
7882 : : }
7883 : :
7884 : : The vector size is 128-bit and the vectorization factor is 16. Reduction
7885 : : statements would be transformed as:
7886 : :
7887 : : vector<4> int sum_v0 = { 0, 0, 0, 1 };
7888 : : vector<4> int sum_v1 = { 0, 0, 0, 0 };
7889 : : vector<4> int sum_v2 = { 0, 0, 0, 0 };
7890 : : vector<4> int sum_v3 = { 0, 0, 0, 0 };
7891 : :
7892 : : for (i / 16)
7893 : : {
7894 : : sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
7895 : : sum_v1 = sum_v1; // copy
7896 : : sum_v2 = sum_v2; // copy
7897 : : sum_v3 = sum_v3; // copy
7898 : :
7899 : : sum_v0 = sum_v0; // copy
7900 : : sum_v1 = WIDEN_SUM (w_v1[i: 0 ~ 15], sum_v1);
7901 : : sum_v2 = sum_v2; // copy
7902 : : sum_v3 = sum_v3; // copy
7903 : :
7904 : : sum_v0 = sum_v0; // copy
7905 : : sum_v1 = SAD (s0_v1[i: 0 ~ 7 ], s1_v1[i: 0 ~ 7 ], sum_v1);
7906 : : sum_v2 = SAD (s0_v2[i: 8 ~ 15], s1_v2[i: 8 ~ 15], sum_v2);
7907 : : sum_v3 = sum_v3; // copy
7908 : :
7909 : : sum_v0 += n_v0[i: 0 ~ 3 ];
7910 : : sum_v1 += n_v1[i: 4 ~ 7 ];
7911 : : sum_v2 += n_v2[i: 8 ~ 11];
7912 : : sum_v3 += n_v3[i: 12 ~ 15];
7913 : : }
7914 : :
7915 : : Moreover, for higher instruction parallelism in the final vectorized
7916 : : loop, the effective vector lane-reducing ops are distributed evenly
7917 : : among all def-use cycles. In the above example, DOT_PROD, WIDEN_SUM
7918 : : and the SADs are generated into separate cycles, so the instruction
7919 : : dependencies among them can be eliminated. */
7920 : 80 : unsigned effec_ncopies = vec_oprnds[0].length ();
7921 : 80 : unsigned total_ncopies = vec_oprnds[reduc_index].length ();
7922 : :
7923 : 80 : gcc_assert (effec_ncopies <= total_ncopies);
7924 : :
7925 : 80 : if (effec_ncopies < total_ncopies)
7926 : : {
7927 : 240 : for (unsigned i = 0; i < op.num_ops - 1; i++)
7928 : : {
7929 : 320 : gcc_assert (vec_oprnds[i].length () == effec_ncopies);
7930 : 160 : vec_oprnds[i].safe_grow_cleared (total_ncopies);
7931 : : }
7932 : : }
7933 : :
7934 : 80 : tree reduc_vectype_in = vectype_in;
7935 : 80 : gcc_assert (reduc_vectype_in);
7936 : :
7937 : 80 : unsigned effec_reduc_ncopies
7938 : 80 : = vect_get_num_copies (loop_vinfo, SLP_TREE_CHILDREN (slp_node)[0]);
7939 : :
7940 : 80 : gcc_assert (effec_ncopies <= effec_reduc_ncopies);
7941 : :
7942 : 80 : if (effec_ncopies < effec_reduc_ncopies)
7943 : : {
7944 : : /* Find suitable def-use cycles to generate vectorized statements
7945 : : into, and reorder operands based on the selection. */
7946 : 0 : unsigned curr_pos = VECT_REDUC_INFO_RESULT_POS (reduc_info);
7947 : 0 : unsigned next_pos = (curr_pos + effec_ncopies) % effec_reduc_ncopies;
7948 : :
7949 : 0 : gcc_assert (curr_pos < effec_reduc_ncopies);
7950 : 0 : VECT_REDUC_INFO_RESULT_POS (reduc_info) = next_pos;
7951 : :
7952 : 0 : if (curr_pos)
7953 : : {
7954 : 0 : unsigned count = effec_reduc_ncopies - effec_ncopies;
7955 : 0 : unsigned start = curr_pos - count;
7956 : :
7957 : 0 : if ((int) start < 0)
7958 : : {
7959 : 0 : count = curr_pos;
7960 : 0 : start = 0;
7961 : : }
7962 : :
7963 : 0 : for (unsigned i = 0; i < op.num_ops - 1; i++)
7964 : : {
7965 : 0 : for (unsigned j = effec_ncopies; j > start; j--)
7966 : : {
7967 : 0 : unsigned k = j - 1;
7968 : 0 : std::swap (vec_oprnds[i][k], vec_oprnds[i][k + count]);
7969 : 0 : gcc_assert (!vec_oprnds[i][k]);
7970 : : }
7971 : : }
7972 : : }
7973 : : }
7974 : : }
7975 : :
7976 : 1520 : bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (slp_node);
7977 : 2542 : unsigned num = vec_oprnds[reduc_index == 0 ? 1 : 0].length ();
7978 : 1520 : unsigned mask_index = 0;
7979 : :
7980 : 6709 : for (unsigned i = 0; i < num; ++i)
7981 : : {
7982 : 5189 : gimple *new_stmt;
7983 : 5189 : tree vop[3] = { vec_oprnds[0][i], vec_oprnds[1][i], NULL_TREE };
7984 : 5189 : if (!vop[0] || !vop[1])
7985 : : {
7986 : 422 : tree reduc_vop = vec_oprnds[reduc_index][i];
7987 : :
7988 : : /* If we could not generate an effective vector statement for the
7989 : : current portion of the reduction operand, insert a trivial copy to
7990 : : simply hand over the operand to other dependent statements. */
7991 : 422 : gcc_assert (reduc_vop);
7992 : :
7993 : 422 : if (TREE_CODE (reduc_vop) == SSA_NAME
7994 : 422 : && !SSA_NAME_IS_DEFAULT_DEF (reduc_vop))
7995 : 422 : new_stmt = SSA_NAME_DEF_STMT (reduc_vop);
7996 : : else
7997 : : {
7998 : 0 : new_temp = make_ssa_name (vec_dest);
7999 : 0 : new_stmt = gimple_build_assign (new_temp, reduc_vop);
8000 : 0 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt,
8001 : : gsi);
8002 : : }
8003 : : }
8004 : 4767 : else if (masked_loop_p && !mask_by_cond_expr)
8005 : : {
8006 : : /* No conditional ifns have been defined for lane-reducing op
8007 : : yet. */
8008 : 8 : gcc_assert (!lane_reducing);
8009 : :
8010 : : /* Make sure that the reduction accumulator is vop[0]. */
8011 : 8 : if (reduc_index == 1)
8012 : : {
8013 : 8 : gcc_assert (commutative_binary_op_p (code, op.type));
8014 : 8 : std::swap (vop[0], vop[1]);
8015 : : }
8016 : 8 : tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8017 : : vec_num, vectype_in,
8018 : : mask_index++);
8019 : 8 : gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
8020 : : vop[0], vop[1], vop[0]);
8021 : 8 : new_temp = make_ssa_name (vec_dest, call);
8022 : 8 : gimple_call_set_lhs (call, new_temp);
8023 : 8 : gimple_call_set_nothrow (call, true);
8024 : 8 : vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
8025 : 8 : new_stmt = call;
8026 : : }
8027 : : else
8028 : : {
8029 : 4759 : if (op.num_ops >= 3)
8030 : 1235 : vop[2] = vec_oprnds[2][i];
8031 : :
8032 : 4759 : if (masked_loop_p && mask_by_cond_expr)
8033 : : {
8034 : 4 : tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8035 : : vec_num, vectype_in,
8036 : : mask_index++);
8037 : 4 : build_vect_cond_expr (code, vop, mask, gsi);
8038 : : }
8039 : :
8040 : 4759 : if (emulated_mixed_dot_prod)
8041 : 2 : new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
8042 : : vec_dest, vop);
8043 : :
8044 : 5605 : else if (code.is_internal_fn () && !cond_fn_p)
8045 : 0 : new_stmt = gimple_build_call_internal (internal_fn (code),
8046 : : op.num_ops,
8047 : : vop[0], vop[1], vop[2]);
8048 : 5605 : else if (code.is_internal_fn () && cond_fn_p)
8049 : 848 : new_stmt = gimple_build_call_internal (internal_fn (code),
8050 : : op.num_ops,
8051 : : vop[0], vop[1], vop[2],
8052 : : vop[reduc_index]);
8053 : : else
8054 : 3909 : new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
8055 : : vop[0], vop[1], vop[2]);
8056 : 4759 : new_temp = make_ssa_name (vec_dest, new_stmt);
8057 : 4759 : gimple_set_lhs (new_stmt, new_temp);
8058 : 4759 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8059 : : }
8060 : :
8061 : 5189 : if (single_defuse_cycle && i < num - 1)
8062 : 3117 : vec_oprnds[reduc_index].safe_push (gimple_get_lhs (new_stmt));
8063 : : else
8064 : 2072 : slp_node->push_vec_def (new_stmt);
8065 : : }
8066 : :
8067 : : return true;
8068 : 9440 : }
8069 : :
8070 : : /* Transform phase of a cycle PHI. */
8071 : :
8072 : : bool
8073 : 22825 : vect_transform_cycle_phi (loop_vec_info loop_vinfo,
8074 : : stmt_vec_info stmt_info,
8075 : : slp_tree slp_node, slp_instance slp_node_instance)
8076 : : {
8077 : 22825 : tree vectype_out = SLP_TREE_VECTYPE (slp_node);
8078 : 22825 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8079 : 22825 : int i;
8080 : 22825 : bool nested_cycle = false;
8081 : 22825 : int vec_num;
8082 : :
8083 : 22934 : if (nested_in_vect_loop_p (loop, stmt_info))
8084 : : {
8085 : : loop = loop->inner;
8086 : : nested_cycle = true;
8087 : : }
8088 : :
8089 : 22825 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
8090 : 22825 : if (reduc_info
8091 : 22238 : && (VECT_REDUC_INFO_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
8092 : 22238 : || VECT_REDUC_INFO_TYPE (reduc_info) == FOLD_LEFT_REDUCTION))
8093 : : /* Leave the scalar phi in place. */
8094 : : return true;
8095 : :
8096 : 21985 : vec_num = vect_get_num_copies (loop_vinfo, slp_node);
8097 : :
8098 : : /* Check whether we should use a single PHI node and accumulate
8099 : : vectors to one before the backedge. */
8100 : 21985 : if (reduc_info && VECT_REDUC_INFO_FORCE_SINGLE_CYCLE (reduc_info))
8101 : 21985 : vec_num = 1;
8102 : :
8103 : : /* Create the destination vector. */
8104 : 21985 : gphi *phi = as_a <gphi *> (stmt_info->stmt);
8105 : 21985 : tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
8106 : : vectype_out);
8107 : :
8108 : : /* Get the loop-entry arguments. */
8109 : 21985 : tree vec_initial_def = NULL_TREE;
8110 : 21985 : auto_vec<tree> vec_initial_defs;
8111 : 21985 : vec_initial_defs.reserve (vec_num);
8112 : : /* Optimize: if for REDUC_MAX the initial_def is smaller than the base
8113 : : and we can't use zero for induc_val, use initial_def instead. Similarly
8114 : : for REDUC_MIN when initial_def is larger than the base. */
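 : : /* For example (illustrative values): for a MAX-based integer
 : : induction cond reduction with induc_val 1 and a constant
 : : initial_def of -5, -5 < 1 holds, so the vector below is seeded
 : : directly with -5 and the stored induc_val is cleared so that
 : : epilogue generation knows the initial value is already
 : : accounted for. */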
8115 : 21985 : if (reduc_info
8116 : 21398 : && VECT_REDUC_INFO_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
8117 : : {
8118 : 66 : gcc_assert (SLP_TREE_LANES (slp_node) == 1);
8119 : 66 : tree initial_def = vect_phi_initial_value (phi);
8120 : 66 : VECT_REDUC_INFO_INITIAL_VALUES (reduc_info).safe_push (initial_def);
8121 : 66 : tree induc_val = VECT_REDUC_INFO_INDUC_COND_INITIAL_VAL (reduc_info);
8122 : 66 : if (TREE_CODE (initial_def) == INTEGER_CST
8123 : 64 : && !integer_zerop (induc_val)
8124 : 130 : && ((VECT_REDUC_INFO_CODE (reduc_info) == MAX_EXPR
8125 : 44 : && tree_int_cst_lt (initial_def, induc_val))
8126 : 61 : || (VECT_REDUC_INFO_CODE (reduc_info) == MIN_EXPR
8127 : 20 : && tree_int_cst_lt (induc_val, initial_def))))
8128 : : {
8129 : 3 : induc_val = initial_def;
8130 : : /* Communicate that we used the initial_def to epilogue
8131 : : generation. */
8132 : 3 : VECT_REDUC_INFO_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
8133 : : }
8134 : 66 : vec_initial_defs.quick_push
8135 : 66 : (build_vector_from_val (vectype_out, induc_val));
8136 : 66 : }
8137 : 21919 : else if (nested_cycle)
8138 : : {
8139 : 661 : unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
8140 : 661 : vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
8141 : : &vec_initial_defs);
8142 : : }
8143 : : else
8144 : : {
8145 : 21258 : gcc_assert (slp_node == slp_node_instance->reduc_phis);
8146 : 21258 : vec<tree> &initial_values = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info);
8147 : 21258 : vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
8148 : :
8149 : 21258 : unsigned int num_phis = stmts.length ();
8150 : 21258 : if (reduc_info->is_reduc_chain)
8151 : 2050 : num_phis = 1;
8152 : 21258 : initial_values.reserve (num_phis);
8153 : 42821 : for (unsigned int i = 0; i < num_phis; ++i)
8154 : : {
8155 : 21563 : gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
8156 : 21563 : initial_values.quick_push (vect_phi_initial_value (this_phi));
8157 : : }
8158 : 21258 : if (vec_num == 1)
8159 : 18850 : vect_find_reusable_accumulator (loop_vinfo, reduc_info, vectype_out);
8160 : 21258 : if (!initial_values.is_empty ())
8161 : : {
8162 : 21071 : tree initial_value
8163 : 41967 : = (num_phis == 1 ? initial_values[0] : NULL_TREE);
8164 : 21071 : code_helper code = VECT_REDUC_INFO_CODE (reduc_info);
8165 : 21071 : tree neutral_op
8166 : 21071 : = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8167 : : code, initial_value);
8168 : : /* Try to simplify the vector initialization by applying an
8169 : : adjustment after the reduction has been performed. This
8170 : : can also break a critical path but on the other hand
8171 : : requires keeping the initial value live across the loop. */
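 : : /* A small example (illustrative): for a sum reduction written as
 : :      s = 10; for (...) s += a[i];
 : : the vector accumulator can be seeded with the neutral value
 : : { 0, ..., 0 } while the original initial value 10 is recorded as
 : : an epilogue adjustment and folded back into the scalar result
 : : after the final reduction. */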
8172 : 21071 : if (neutral_op
8173 : 20986 : && initial_values.length () == 1
8174 : 20825 : && !VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info)
8175 : 17630 : && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8176 : 38627 : && !operand_equal_p (neutral_op, initial_values[0]))
8177 : : {
8178 : 12733 : VECT_REDUC_INFO_EPILOGUE_ADJUSTMENT (reduc_info)
8179 : 12733 : = initial_values[0];
8180 : 12733 : initial_values[0] = neutral_op;
8181 : : }
8182 : 42142 : get_initial_defs_for_reduction (loop_vinfo, reduc_info, vectype_out,
8183 : : &vec_initial_defs, vec_num,
8184 : : stmts.length (), neutral_op);
8185 : : }
8186 : : }
8187 : :
8188 : 21985 : if (vec_initial_def)
8189 : : {
8190 : 0 : vec_initial_defs.create (1);
8191 : 0 : vec_initial_defs.quick_push (vec_initial_def);
8192 : : }
8193 : :
8194 : 21985 : if (reduc_info)
8195 : 21398 : if (auto *accumulator = VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info))
8196 : : {
8197 : 3396 : tree def = accumulator->reduc_input;
8198 : 3396 : if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8199 : : {
8200 : 3394 : unsigned int nreduc;
8201 : 6788 : bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
8202 : 3394 : (TREE_TYPE (def)),
8203 : 3394 : TYPE_VECTOR_SUBPARTS (vectype_out),
8204 : : &nreduc);
8205 : 0 : gcc_assert (res);
8206 : 3394 : gimple_seq stmts = NULL;
8207 : : /* Reduce the single vector to a smaller one. */
8208 : 3394 : if (nreduc != 1)
8209 : : {
8210 : : /* Perform the reduction in the appropriate type. */
8211 : 3394 : tree rvectype = vectype_out;
8212 : 3394 : if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
8213 : 3394 : TREE_TYPE (TREE_TYPE (def))))
8214 : 229 : rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
8215 : : TYPE_VECTOR_SUBPARTS
8216 : 458 : (vectype_out));
8217 : 3394 : def = vect_create_partial_epilog (def, rvectype,
8218 : : VECT_REDUC_INFO_CODE
8219 : : (reduc_info),
8220 : : &stmts);
8221 : : }
8222 : : /* The epilogue loop might use a different vector mode, like
8223 : : VNx2DI vs. V2DI. */
8224 : 3394 : if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
8225 : : {
8226 : 0 : tree reduc_type = build_vector_type_for_mode
8227 : 0 : (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
8228 : 0 : def = gimple_convert (&stmts, reduc_type, def);
8229 : : }
8230 : : /* Adjust the input so we pick up the partially reduced value
8231 : : for the skip edge in vect_create_epilog_for_reduction. */
8232 : 3394 : accumulator->reduc_input = def;
8233 : : /* And the reduction could be carried out using a different sign. */
8234 : 3394 : if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8235 : 229 : def = gimple_convert (&stmts, vectype_out, def);
8236 : 3394 : edge e;
8237 : 3394 : if ((e = loop_vinfo->main_loop_edge)
8238 : 3394 : || (e = loop_vinfo->skip_this_loop_edge))
8239 : : {
8240 : : /* While we'd like to insert on the edge this will split
8241 : : blocks and disturb bookkeeping, we also will eventually
8242 : : need this on the skip edge. Rely on sinking to
8243 : : fixup optimal placement and insert in the pred. */
8244 : 3207 : gimple_stmt_iterator gsi = gsi_last_bb (e->src);
8245 : : /* Insert before a cond that eventually skips the
8246 : : epilogue. */
8247 : 3207 : if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
8248 : 3194 : gsi_prev (&gsi);
8249 : 3207 : gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
8250 : : }
8251 : : else
8252 : 187 : gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
8253 : : stmts);
8254 : : }
8255 : 3396 : if (loop_vinfo->main_loop_edge)
8256 : 3209 : vec_initial_defs[0]
8257 : 3209 : = vect_get_main_loop_result (loop_vinfo, def,
8258 : 3209 : vec_initial_defs[0]);
8259 : : else
8260 : 187 : vec_initial_defs.safe_push (def);
8261 : : }
8262 : :
8263 : : /* Generate the reduction PHIs upfront. */
8264 : 47542 : for (i = 0; i < vec_num; i++)
8265 : : {
8266 : 25557 : tree vec_init_def = vec_initial_defs[i];
8267 : : /* Create the reduction-phi that defines the reduction
8268 : : operand. */
8269 : 25557 : gphi *new_phi = create_phi_node (vec_dest, loop->header);
8270 : 25557 : add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
8271 : : UNKNOWN_LOCATION);
8272 : :
8273 : : /* The loop-latch arg is set in epilogue processing. */
8274 : :
8275 : 25557 : slp_node->push_vec_def (new_phi);
8276 : : }
8277 : :
8278 : 21985 : return true;
8279 : 21985 : }
8280 : :
8281 : : /* Vectorizes LC PHIs. */
8282 : :
8283 : : bool
8284 : 156011 : vectorizable_lc_phi (loop_vec_info loop_vinfo,
8285 : : stmt_vec_info stmt_info,
8286 : : slp_tree slp_node)
8287 : : {
8288 : 156011 : if (!loop_vinfo
8289 : 156011 : || !is_a <gphi *> (stmt_info->stmt)
8290 : 187198 : || gimple_phi_num_args (stmt_info->stmt) != 1)
8291 : : return false;
8292 : :
8293 : 695 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8294 : 0 : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8295 : : return false;
8296 : :
8297 : : /* Deal with copies from externs or constants that are disguised as
8298 : : loop-closed PHI nodes (PR97886). */
8299 : 695 : if (!vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
8300 : : SLP_TREE_VECTYPE (slp_node)))
8301 : : {
8302 : 0 : if (dump_enabled_p ())
8303 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8304 : : "incompatible vector types for invariants\n");
8305 : 0 : return false;
8306 : : }
8307 : :
8308 : : /* ??? This can happen with data vs. mask uses of boolean. */
8309 : 695 : if (!useless_type_conversion_p (SLP_TREE_VECTYPE (slp_node),
8310 : 695 : SLP_TREE_VECTYPE
8311 : : (SLP_TREE_CHILDREN (slp_node)[0])))
8312 : : {
8313 : 0 : if (dump_enabled_p ())
8314 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8315 : : "missed mask promotion\n");
8316 : 0 : return false;
8317 : : }
8318 : :
8319 : 695 : SLP_TREE_TYPE (slp_node) = lc_phi_info_type;
8320 : 695 : return true;
8321 : : }
8322 : :
8323 : : bool
8324 : 438 : vect_transform_lc_phi (loop_vec_info loop_vinfo,
8325 : : stmt_vec_info stmt_info,
8326 : : slp_tree slp_node)
8327 : : {
8328 : :
8329 : 438 : tree vectype = SLP_TREE_VECTYPE (slp_node);
8330 : 438 : tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8331 : 438 : basic_block bb = gimple_bb (stmt_info->stmt);
8332 : 438 : edge e = single_pred_edge (bb);
8333 : 438 : tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8334 : 438 : auto_vec<tree> vec_oprnds;
8335 : 876 : vect_get_vec_defs (loop_vinfo, slp_node,
8336 : 438 : gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
8337 : 979 : for (unsigned i = 0; i < vec_oprnds.length (); i++)
8338 : : {
8339 : : /* Create the vectorized LC PHI node. */
8340 : 541 : gphi *new_phi = create_phi_node (vec_dest, bb);
8341 : 541 : add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
8342 : 541 : slp_node->push_vec_def (new_phi);
8343 : : }
8344 : :
8345 : 438 : return true;
8346 : 438 : }
8347 : :
8348 : : /* Vectorizes PHIs. */
8349 : :
8350 : : bool
8351 : 150258 : vectorizable_phi (bb_vec_info vinfo,
8352 : : stmt_vec_info stmt_info,
8353 : : slp_tree slp_node, stmt_vector_for_cost *cost_vec)
8354 : : {
8355 : 150258 : if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
8356 : : return false;
8357 : :
8358 : 78874 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
8359 : : return false;
8360 : :
8361 : 78874 : tree vectype = SLP_TREE_VECTYPE (slp_node);
8362 : :
8363 : 78874 : if (cost_vec) /* transformation not required. */
8364 : : {
8365 : : slp_tree child;
8366 : : unsigned i;
8367 : 198913 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
8368 : 135382 : if (!child)
8369 : : {
8370 : 0 : if (dump_enabled_p ())
8371 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8372 : : "PHI node with unvectorized backedge def\n");
8373 : 0 : return false;
8374 : : }
8375 : 135382 : else if (!vect_maybe_update_slp_op_vectype (child, vectype))
8376 : : {
8377 : 27 : if (dump_enabled_p ())
8378 : 2 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8379 : : "incompatible vector types for invariants\n");
8380 : 27 : return false;
8381 : : }
8382 : 135355 : else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8383 : 135355 : && !useless_type_conversion_p (vectype,
8384 : : SLP_TREE_VECTYPE (child)))
8385 : : {
8386 : : /* With bools we can have mask and non-mask precision vectors
8387 : : or different non-mask precisions. While pattern recog is
8388 : : supposed to guarantee consistency here, bugs in it can cause
8389 : : mismatches (PR103489 and PR103800 for example).
8390 : : Deal with them here instead of ICEing later. */
8391 : 18 : if (dump_enabled_p ())
8392 : 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8393 : : "incompatible vector type setup from "
8394 : : "bool pattern detection\n");
8395 : 18 : return false;
8396 : : }
8397 : :
8398 : : /* For single-argument PHIs assume coalescing which means zero cost
8399 : : for the scalar and the vector PHIs. This avoids artificially
8400 : : favoring the vector path (but may pessimize it in some cases). */
8401 : 63531 : if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
8402 : 53606 : record_stmt_cost (cost_vec, vect_get_num_copies (vinfo, slp_node),
8403 : : vector_stmt, slp_node, vectype, 0, vect_body);
8404 : 63531 : SLP_TREE_TYPE (slp_node) = phi_info_type;
8405 : 63531 : return true;
8406 : : }
8407 : :
8408 : 15298 : tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8409 : 15298 : basic_block bb = gimple_bb (stmt_info->stmt);
8410 : 15298 : tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8411 : 15298 : auto_vec<gphi *> new_phis;
8412 : 51414 : for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
8413 : : {
8414 : 36116 : slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8415 : :
8416 : : /* Skip not yet vectorized defs. */
8417 : 36520 : if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8418 : 36116 : && SLP_TREE_VEC_DEFS (child).is_empty ())
8419 : 404 : continue;
8420 : :
8421 : 35712 : auto_vec<tree> vec_oprnds;
8422 : 35712 : vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
8423 : 35712 : if (!new_phis.exists ())
8424 : : {
8425 : 15298 : new_phis.create (vec_oprnds.length ());
8426 : 32274 : for (unsigned j = 0; j < vec_oprnds.length (); j++)
8427 : : {
8428 : : /* Create the vectorized LC PHI node. */
8429 : 16976 : new_phis.quick_push (create_phi_node (vec_dest, bb));
8430 : 16976 : slp_node->push_vec_def (new_phis[j]);
8431 : : }
8432 : : }
8433 : 35712 : edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
8434 : 76492 : for (unsigned j = 0; j < vec_oprnds.length (); j++)
8435 : 40780 : add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
8436 : 35712 : }
8437 : : /* We should have at least one already vectorized child. */
8438 : 15298 : gcc_assert (new_phis.exists ());
8439 : :
8440 : 15298 : return true;
8441 : 15298 : }
8442 : :
8443 : : /* Vectorizes first order recurrences. An overview of the transformation
8444 : : is described below. Suppose we have the following loop.
8445 : :
8446 : : int t = 0;
8447 : : for (int i = 0; i < n; ++i)
8448 : : {
8449 : : b[i] = a[i] - t;
8450 : : t = a[i];
8451 : : }
8452 : :
8453 : : There is a first-order recurrence on 'a'. For this loop, the scalar IR
8454 : : looks (simplified) like:
8455 : :
8456 : : scalar.preheader:
8457 : : init = 0;
8458 : :
8459 : : scalar.body:
8460 : : i = PHI <0(scalar.preheader), i+1(scalar.body)>
8461 : : _2 = PHI <init(scalar.preheader), _1(scalar.body)>
8462 : : _1 = a[i]
8463 : : b[i] = _1 - _2
8464 : : if (i < n) goto scalar.body
8465 : :
8466 : : In this example, _2 is a recurrence because its value depends on the
8467 : : previous iteration. We vectorize this as (VF = 4)
8468 : :
8469 : : vector.preheader:
8470 : : vect_init = vect_cst(..., ..., ..., 0)
8471 : :
8472 : : vector.body
8473 : : i = PHI <0(vector.preheader), i+4(vector.body)>
8474 : : vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
8475 : : vect_2 = a[i, i+1, i+2, i+3];
8476 : : vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
8477 : : b[i, i+1, i+2, i+3] = vect_2 - vect_3
8478 : : if (..) goto vector.body
8479 : :
8480 : : In this function, vectorizable_recurr, we code generate both the
8481 : : vector PHI node and the permute since those together compute the
8482 : : vectorized value of the scalar PHI. We do not yet have the
8483 : : backedge value to fill in there nor into the vec_perm. Those
8484 : : are filled in vect_schedule_scc.
8485 : :
8486 : : TODO: Since the scalar loop does not have a use of the recurrence
8487 : : outside of the loop the natural way to implement peeling via
8488 : : vectorizing the live value doesn't work. For now peeling of loops
8489 : : with a recurrence is not implemented. For SLP the supported cases
8490 : : are restricted to those requiring a single vector recurrence PHI. */
8491 : :
8492 : : bool
8493 : 155358 : vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8494 : : slp_tree slp_node, stmt_vector_for_cost *cost_vec)
8495 : : {
8496 : 155358 : if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
8497 : : return false;
8498 : :
8499 : 30534 : gphi *phi = as_a<gphi *> (stmt_info->stmt);
8500 : :
8501 : : /* So far we only support first-order recurrence auto-vectorization. */
8502 : 30534 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
8503 : : return false;
8504 : :
8505 : 412 : tree vectype = SLP_TREE_VECTYPE (slp_node);
8506 : 412 : unsigned ncopies = vect_get_num_copies (loop_vinfo, slp_node);
8507 : 412 : poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8508 : 412 : unsigned dist = SLP_TREE_LANES (slp_node);
8509 : : /* We need to be able to make progress with a single vector. */
8510 : 412 : if (maybe_gt (dist * 2, nunits))
8511 : : {
8512 : 0 : if (dump_enabled_p ())
8513 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8514 : : "first order recurrence exceeds half of "
8515 : : "a vector\n");
8516 : 0 : return false;
8517 : : }
8518 : :
8519 : : /* We need to be able to build a { ..., a, b } init vector with
8520 : : dist number of distinct trailing values. Always possible
8521 : : when dist == 1 or when nunits is constant or when the initializations
8522 : : are uniform. */
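 : : /* For example (illustrative only): with a V4SI vector and dist == 2,
 : : where the two scalar PHIs start from a and b, the init vector
 : : built below is { 0, 0, a, b } - zeros in the leading lanes and
 : : the distinct initial values in the dist trailing lanes. */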
8523 : 412 : tree uniform_initval = NULL_TREE;
8524 : 412 : edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
8525 : 1672 : for (stmt_vec_info s : SLP_TREE_SCALAR_STMTS (slp_node))
8526 : : {
8527 : 448 : gphi *phi = as_a <gphi *> (s->stmt);
8528 : 448 : if (! uniform_initval)
8529 : 412 : uniform_initval = PHI_ARG_DEF_FROM_EDGE (phi, pe);
8530 : 36 : else if (! operand_equal_p (uniform_initval,
8531 : 36 : PHI_ARG_DEF_FROM_EDGE (phi, pe)))
8532 : : {
8533 : : uniform_initval = NULL_TREE;
8534 : : break;
8535 : : }
8536 : : }
8537 : 412 : if (!uniform_initval && !nunits.is_constant ())
8538 : : {
8539 : : if (dump_enabled_p ())
8540 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8541 : : "cannot build initialization vector for "
8542 : : "first order recurrence\n");
8543 : : return false;
8544 : : }
8545 : :
8546 : : /* First-order recurrence autovectorization needs to handle permutation
8547 : : with indices = [nunits-1, nunits, nunits+1, ...]. */
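 : : /* Concretely (a sketch, assuming V4SI): with dist == 1 the three
 : : encoded elements are { 3, 4, 5 }, which expands to the selector
 : : { 3, 4, 5, 6 } used in the overview above; with dist == 2 it
 : : would expand to { 2, 3, 4, 5 }. */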
8548 : 412 : vec_perm_builder sel (nunits, 1, 3);
8549 : 1648 : for (int i = 0; i < 3; ++i)
8550 : 1236 : sel.quick_push (nunits - dist + i);
8551 : 412 : vec_perm_indices indices (sel, 2, nunits);
8552 : :
8553 : 412 : if (cost_vec) /* transformation not required. */
8554 : : {
8555 : 370 : if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
8556 : : indices))
8557 : : return false;
8558 : :
8559 : : /* We eventually need to set a vector type on invariant
8560 : : arguments. */
8561 : : unsigned j;
8562 : : slp_tree child;
8563 : 774 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
8564 : 516 : if (!vect_maybe_update_slp_op_vectype (child, vectype))
8565 : : {
8566 : 0 : if (dump_enabled_p ())
8567 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8568 : : "incompatible vector types for "
8569 : : "invariants\n");
8570 : 0 : return false;
8571 : : }
8572 : :
8573 : : /* Verify we have set up compatible types. */
8574 : 258 : edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
8575 : 258 : slp_tree latch_def = SLP_TREE_CHILDREN (slp_node)[le->dest_idx];
8576 : 258 : tree latch_vectype = SLP_TREE_VECTYPE (latch_def);
8577 : 258 : if (!types_compatible_p (latch_vectype, vectype))
8578 : : return false;
8579 : :
8580 : : /* The recurrence costs the initialization vector and one permute
8581 : : for each copy. With SLP the prologue value is explicitly
8582 : : represented and costed separately. */
8583 : 226 : unsigned prologue_cost = 0;
8584 : 226 : unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
8585 : : slp_node, 0, vect_body);
8586 : 226 : if (dump_enabled_p ())
8587 : 50 : dump_printf_loc (MSG_NOTE, vect_location,
8588 : : "vectorizable_recurr: inside_cost = %d, "
8589 : : "prologue_cost = %d .\n", inside_cost,
8590 : : prologue_cost);
8591 : :
8592 : 226 : SLP_TREE_TYPE (slp_node) = recurr_info_type;
8593 : 226 : return true;
8594 : : }
8595 : :
8596 : 42 : tree vec_init;
8597 : 42 : if (! uniform_initval)
8598 : : {
8599 : 6 : vec<constructor_elt, va_gc> *v = NULL;
8600 : 6 : vec_alloc (v, nunits.to_constant ());
8601 : 33 : for (unsigned i = 0; i < nunits.to_constant () - dist; ++i)
8602 : 27 : CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
8603 : : build_zero_cst (TREE_TYPE (vectype)));
8604 : 39 : for (stmt_vec_info s : SLP_TREE_SCALAR_STMTS (slp_node))
8605 : : {
8606 : 21 : gphi *phi = as_a <gphi *> (s->stmt);
8607 : 21 : tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
8608 : 21 : if (!useless_type_conversion_p (TREE_TYPE (vectype),
8609 : 21 : TREE_TYPE (preheader)))
8610 : : {
8611 : 0 : gimple_seq stmts = NULL;
8612 : 0 : preheader = gimple_convert (&stmts,
8613 : 0 : TREE_TYPE (vectype), preheader);
8614 : 0 : gsi_insert_seq_on_edge_immediate (pe, stmts);
8615 : : }
8616 : 21 : CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, preheader);
8617 : : }
8618 : 6 : vec_init = build_constructor (vectype, v);
8619 : : }
8620 : : else
8621 : : vec_init = uniform_initval;
8622 : 42 : vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
8623 : :
8624 : : /* Create the vectorized first-order PHI node. */
8625 : 42 : tree vec_dest = vect_get_new_vect_var (vectype,
8626 : : vect_simple_var, "vec_recur_");
8627 : 42 : basic_block bb = gimple_bb (phi);
8628 : 42 : gphi *new_phi = create_phi_node (vec_dest, bb);
8629 : 42 : add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
8630 : :
8631 : : /* Insert shuffles for the first-order recurrence autovectorization.
8632 : : result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>. */
8633 : 42 : tree perm = vect_gen_perm_mask_checked (vectype, indices);
8634 : :
8635 : : /* Insert the required permute after the latch definition. The
8636 : : second and later operands are tentative and will be updated when we have
8637 : : vectorized the latch definition. */
8638 : 42 : edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
8639 : 42 : gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
8640 : 42 : gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
8641 : 42 : gsi_next (&gsi2);
8642 : :
8643 : 121 : for (unsigned i = 0; i < ncopies; ++i)
8644 : : {
8645 : 79 : vec_dest = make_ssa_name (vectype);
8646 : 79 : gassign *vperm
8647 : 121 : = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
8648 : 42 : i == 0 ? gimple_phi_result (new_phi) : NULL,
8649 : : NULL, perm);
8650 : 79 : vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
8651 : :
8652 : 79 : slp_node->push_vec_def (vperm);
8653 : : }
8654 : :
8655 : : return true;
8656 : 412 : }
8657 : :
8658 : : /* Return true if VECTYPE represents a vector that requires lowering
8659 : : by the vector lowering pass. */
8660 : :
8661 : : bool
8662 : 618352 : vect_emulated_vector_p (tree vectype)
8663 : : {
8664 : 1236704 : return (!VECTOR_MODE_P (TYPE_MODE (vectype))
8665 : 620981 : && (!VECTOR_BOOLEAN_TYPE_P (vectype)
8666 : 2611 : || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
8667 : : }
8668 : :
8669 : : /* Return true if we can emulate CODE on an integer mode representation
8670 : : of a vector. */
8671 : :
8672 : : bool
8673 : 10681 : vect_can_vectorize_without_simd_p (tree_code code)
8674 : : {
8675 : 10681 : switch (code)
8676 : : {
8677 : : case PLUS_EXPR:
8678 : : case MINUS_EXPR:
8679 : : case NEGATE_EXPR:
8680 : : case BIT_AND_EXPR:
8681 : : case BIT_IOR_EXPR:
8682 : : case BIT_XOR_EXPR:
8683 : : case BIT_NOT_EXPR:
8684 : : return true;
8685 : :
8686 : 9912 : default:
8687 : 9912 : return false;
8688 : : }
8689 : : }
8690 : :
8691 : : /* Likewise, but taking a code_helper. */
8692 : :
8693 : : bool
8694 : 235 : vect_can_vectorize_without_simd_p (code_helper code)
8695 : : {
8696 : 235 : return (code.is_tree_code ()
8697 : 235 : && vect_can_vectorize_without_simd_p (tree_code (code)));
8698 : : }
8699 : :
8700 : : /* Create vector init for vectorized iv. */
8701 : : static tree
8702 : 833 : vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
8703 : : tree step_expr, poly_uint64 nunits,
8704 : : tree vectype,
8705 : : enum vect_induction_op_type induction_type)
8706 : : {
8707 : 833 : unsigned HOST_WIDE_INT const_nunits;
8708 : 833 : tree vec_shift, vec_init, new_name;
8709 : 833 : unsigned i;
8710 : 833 : tree itype = TREE_TYPE (vectype);
8711 : :
8712 : : /* iv_loop is the loop to be vectorized. Create:
8713 : : vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr). */
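 : : /* Illustrative values (a sketch, not emitted literally): with
 : : nunits == 4, X = init_expr and S = step_expr the code below builds
 : :   neg: [X, -X, X, -X]
 : :   shr: [X, X >> S, X >> 2*S, X >> 3*S]
 : :   shl: [X, X << S, X << 2*S, X << 3*S]
 : :   mul: [X, X*S, X*S^2, X*S^3] (computed in the unsigned type). */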
8714 : 833 : new_name = gimple_convert (stmts, itype, init_expr);
8715 : 833 : switch (induction_type)
8716 : : {
8717 : 18 : case vect_step_op_shr:
8718 : 18 : case vect_step_op_shl:
8719 : : /* Build the Initial value from shift_expr. */
8720 : 18 : vec_init = gimple_build_vector_from_val (stmts,
8721 : : vectype,
8722 : : new_name);
8723 : 18 : vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
8724 : : build_zero_cst (itype), step_expr);
8725 : 18 : vec_init = gimple_build (stmts,
8726 : : (induction_type == vect_step_op_shr
8727 : : ? RSHIFT_EXPR : LSHIFT_EXPR),
8728 : : vectype, vec_init, vec_shift);
8729 : 18 : break;
8730 : :
8731 : 739 : case vect_step_op_neg:
8732 : 739 : {
8733 : 739 : vec_init = gimple_build_vector_from_val (stmts,
8734 : : vectype,
8735 : : new_name);
8736 : 739 : tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
8737 : : vectype, vec_init);
8738 : : /* The encoding has 2 interleaved stepped patterns. */
8739 : 739 : vec_perm_builder sel (nunits, 2, 3);
8740 : 739 : sel.quick_grow (6);
8741 : 3695 : for (i = 0; i < 3; i++)
8742 : : {
8743 : 2217 : sel[2 * i] = i;
8744 : 2217 : sel[2 * i + 1] = i + nunits;
8745 : : }
8746 : 739 : vec_perm_indices indices (sel, 2, nunits);
8747 : : /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
8748 : : fail when vec_init is const vector. In that situation vec_perm is not
8749 : : really needed. */
8750 : 739 : tree perm_mask_even
8751 : 739 : = vect_gen_perm_mask_any (vectype, indices);
8752 : 739 : vec_init = gimple_build (stmts, VEC_PERM_EXPR,
8753 : : vectype,
8754 : : vec_init, vec_neg,
8755 : : perm_mask_even);
8756 : 739 : }
8757 : 739 : break;
8758 : :
8759 : 76 : case vect_step_op_mul:
8760 : 76 : {
8761 : : /* Use an unsigned multiplication to avoid undefined integer overflow. */
8762 : 76 : gcc_assert (nunits.is_constant (&const_nunits));
8763 : 76 : tree utype = unsigned_type_for (itype);
8764 : 76 : tree uvectype = build_vector_type (utype,
8765 : 76 : TYPE_VECTOR_SUBPARTS (vectype));
8766 : 76 : new_name = gimple_convert (stmts, utype, new_name);
8767 : 76 : vec_init = gimple_build_vector_from_val (stmts,
8768 : : uvectype,
8769 : : new_name);
8770 : 76 : tree_vector_builder elts (uvectype, const_nunits, 1);
8771 : 76 : tree elt_step = build_one_cst (utype);
8772 : :
8773 : 76 : elts.quick_push (elt_step);
8774 : 660 : for (i = 1; i < const_nunits; i++)
8775 : : {
8776 : : /* Create: new_name_i = new_name + step_expr. */
8777 : 508 : elt_step = gimple_build (stmts, MULT_EXPR,
8778 : : utype, elt_step, step_expr);
8779 : 508 : elts.quick_push (elt_step);
8780 : : }
8781 : : /* Create a vector from [new_name_0, new_name_1, ...,
8782 : : new_name_nunits-1]. */
8783 : 76 : tree vec_mul = gimple_build_vector (stmts, &elts);
8784 : 76 : vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
8785 : : vec_init, vec_mul);
8786 : 76 : vec_init = gimple_convert (stmts, vectype, vec_init);
8787 : 76 : }
8788 : 76 : break;
8789 : :
8790 : 0 : default:
8791 : 0 : gcc_unreachable ();
8792 : : }
8793 : :
8794 : 833 : return vec_init;
8795 : : }
8796 : :
8797 : : /* Peel init_expr by skip_niter for induction_type. */
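 : : /* A worked example (illustrative only): peeling a vect_step_op_mul
 : : IV with init_expr 3, step_expr 5 and skip_niters 2 computes
 : : 3 * pow (5, 2) = 75 in the corresponding unsigned type, i.e. the
 : : value the scalar IV would have after two peeled iterations; the
 : : power is evaluated modulo 2^precision via mpz_powm below. */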
8798 : : tree
8799 : 84 : vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
8800 : : tree skip_niters, tree step_expr,
8801 : : enum vect_induction_op_type induction_type)
8802 : : {
8803 : 84 : gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
8804 : 84 : tree type = TREE_TYPE (init_expr);
8805 : 84 : unsigned prec = TYPE_PRECISION (type);
8806 : 84 : switch (induction_type)
8807 : : {
8808 : 0 : case vect_step_op_neg:
8809 : 0 : if (TREE_INT_CST_LOW (skip_niters) % 2)
8810 : 0 : init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
8811 : : /* else no change. */
8812 : : break;
8813 : :
8814 : 12 : case vect_step_op_shr:
8815 : 12 : case vect_step_op_shl:
8816 : 12 : skip_niters = gimple_convert (stmts, type, skip_niters);
8817 : 12 : step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
8818 : : /* When the shift amount >= precision, we need to avoid undefined
8819 : : behavior. In the original loop there is no UB, and according to the
8820 : : semantics init_expr should be 0 for lshr and ashl, and >>= (prec - 1)
8821 : : for ashr. */
8821 : 12 : if (!tree_fits_uhwi_p (step_expr)
8822 : 12 : || tree_to_uhwi (step_expr) >= prec)
8823 : : {
8824 : 6 : if (induction_type == vect_step_op_shl
8825 : 6 : || TYPE_UNSIGNED (type))
8826 : 4 : init_expr = build_zero_cst (type);
8827 : : else
8828 : 2 : init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
8829 : : init_expr,
8830 : 4 : wide_int_to_tree (type, prec - 1));
8831 : : }
8832 : : else
8833 : 8 : init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
8834 : : ? RSHIFT_EXPR : LSHIFT_EXPR),
8835 : : type, init_expr, step_expr);
8836 : : break;
8837 : :
8838 : 72 : case vect_step_op_mul:
8839 : 72 : {
8840 : 72 : tree utype = unsigned_type_for (type);
8841 : 72 : init_expr = gimple_convert (stmts, utype, init_expr);
8842 : 72 : wide_int skipn = wi::to_wide (skip_niters);
8843 : 72 : wide_int begin = wi::to_wide (step_expr);
8844 : 72 : auto_mpz base, exp, mod, res;
8845 : 72 : wi::to_mpz (begin, base, TYPE_SIGN (type));
8846 : 72 : wi::to_mpz (skipn, exp, UNSIGNED);
8847 : 72 : mpz_ui_pow_ui (mod, 2, TYPE_PRECISION (type));
8848 : 72 : mpz_powm (res, base, exp, mod);
8849 : 72 : begin = wi::from_mpz (utype, res, true);
8850 : 72 : tree mult_expr = wide_int_to_tree (utype, begin);
8851 : 72 : init_expr = gimple_build (stmts, MULT_EXPR, utype,
8852 : : init_expr, mult_expr);
8853 : 72 : init_expr = gimple_convert (stmts, type, init_expr);
8854 : 72 : }
8855 : 72 : break;
8856 : :
8857 : 0 : default:
8858 : 0 : gcc_unreachable ();
8859 : : }
8860 : :
8861 : 84 : return init_expr;
8862 : : }
8863 : :
8864 : : /* Create vector step for vectorized iv. */
8865 : : static tree
8866 : 1068 : vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
8867 : : poly_uint64 vf,
8868 : : enum vect_induction_op_type induction_type)
8869 : : {
8870 : 1068 : tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
8871 : 1068 : tree new_name = NULL;
8872 : : /* Step should be pow (step, vf) for mult induction. */
8873 : 1068 : if (induction_type == vect_step_op_mul)
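 : : /* E.g. (illustrative): for a mult IV with step_expr 2 and vf 4 the
 : : per-vector-iteration step computed below is pow (2, 4) = 16, since
 : : each vector iteration advances the scalar IV by vf multiplications. */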
8874 : : {
8875 : 76 : gcc_assert (vf.is_constant ());
8876 : 76 : wide_int begin = wi::to_wide (step_expr);
8877 : :
8878 : 584 : for (unsigned i = 0; i != vf.to_constant () - 1; i++)
8879 : 508 : begin = wi::mul (begin, wi::to_wide (step_expr));
8880 : :
8881 : 76 : new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
8882 : 76 : }
8883 : 992 : else if (induction_type == vect_step_op_neg)
8884 : : /* Do nothing. */
8885 : : ;
8886 : : else
8887 : 18 : new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
8888 : : expr, step_expr);
8889 : 1068 : return new_name;
8890 : : }
8891 : :
8892 : : static tree
8893 : 1068 : vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
8894 : : stmt_vec_info stmt_info,
8895 : : tree new_name, tree vectype,
8896 : : enum vect_induction_op_type induction_type)
8897 : : {
8898 : : /* No step is needed for neg induction. */
8899 : 1068 : if (induction_type == vect_step_op_neg)
8900 : : return NULL;
8901 : :
8902 : 94 : tree t = unshare_expr (new_name);
8903 : 94 : gcc_assert (CONSTANT_CLASS_P (new_name)
8904 : : || TREE_CODE (new_name) == SSA_NAME);
8905 : 94 : tree new_vec = build_vector_from_val (vectype, t);
8906 : 94 : tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
8907 : : new_vec, vectype, NULL);
8908 : 94 : return vec_step;
8909 : : }
8910 : :
8911 : : /* Update vectorized iv with vect_step, induc_def is init. */
8912 : : static tree
8913 : 1250 : vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
8914 : : tree induc_def, tree vec_step,
8915 : : enum vect_induction_op_type induction_type)
8916 : : {
8917 : 1250 : tree vec_def = induc_def;
8918 : 1250 : switch (induction_type)
8919 : : {
8920 : 76 : case vect_step_op_mul:
8921 : 76 : {
8922 : : /* Use an unsigned multiplication to avoid undefined integer overflow. */
8923 : 76 : tree uvectype
8924 : 76 : = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)),
8925 : 76 : TYPE_VECTOR_SUBPARTS (vectype));
8926 : 76 : vec_def = gimple_convert (stmts, uvectype, vec_def);
8927 : 76 : vec_step = gimple_convert (stmts, uvectype, vec_step);
8928 : 76 : vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
8929 : : vec_def, vec_step);
8930 : 76 : vec_def = gimple_convert (stmts, vectype, vec_def);
8931 : : }
8932 : 76 : break;
8933 : :
8934 : 12 : case vect_step_op_shr:
8935 : 12 : vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
8936 : : vec_def, vec_step);
8937 : 12 : break;
8938 : :
8939 : 6 : case vect_step_op_shl:
8940 : 6 : vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
8941 : : vec_def, vec_step);
8942 : 6 : break;
8943 : : case vect_step_op_neg:
8944 : : vec_def = induc_def;
8945 : : /* Do nothing. */
8946 : : break;
8947 : 0 : default:
8948 : 0 : gcc_unreachable ();
8949 : : }
8950 : :
8951 : 1250 : return vec_def;
8952 : :
8953 : : }
8954 : :
8955 : : /* Function vectorizable_nonlinear_induction
8956 : :
8957 : : Check if STMT_INFO performs a nonlinear induction computation that can be
8958 : : vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
8959 : : a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
8960 : : basic block.
8961 : : Return true if STMT_INFO is vectorizable in this way. */
8962 : :
8963 : : static bool
8964 : 10045 : vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
8965 : : stmt_vec_info stmt_info,
8966 : : slp_tree slp_node,
8967 : : stmt_vector_for_cost *cost_vec)
8968 : : {
8969 : 10045 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8970 : 10045 : unsigned ncopies;
8971 : 10045 : bool nested_in_vect_loop = false;
8972 : 10045 : class loop *iv_loop;
8973 : 10045 : tree vec_def;
8974 : 10045 : edge pe = loop_preheader_edge (loop);
8975 : 10045 : basic_block new_bb;
8976 : 10045 : tree vec_init, vec_step;
8977 : 10045 : tree new_name;
8978 : 10045 : gimple *new_stmt;
8979 : 10045 : gphi *induction_phi;
8980 : 10045 : tree induc_def, vec_dest;
8981 : 10045 : tree init_expr, step_expr;
8982 : 10045 : tree niters_skip;
8983 : 10045 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8984 : 10045 : unsigned i;
8985 : 10045 : gimple_stmt_iterator si;
8986 : :
8987 : 10045 : gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
8988 : :
8989 : 10045 : tree vectype = SLP_TREE_VECTYPE (slp_node);
8990 : 10045 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8991 : 10045 : enum vect_induction_op_type induction_type
8992 : : = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
8993 : :
8994 : 10045 : gcc_assert (induction_type > vect_step_op_add);
8995 : :
8996 : 10045 : ncopies = vect_get_num_copies (loop_vinfo, slp_node);
8997 : 10045 : gcc_assert (ncopies >= 1);
8998 : :
8999 : : /* FORNOW. Only handle nonlinear induction in the same loop. */
9000 : 10045 : if (nested_in_vect_loop_p (loop, stmt_info))
9001 : : {
9002 : 0 : if (dump_enabled_p ())
9003 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9004 : : "nonlinear induction in nested loop.\n");
9005 : 0 : return false;
9006 : : }
9007 : :
9008 : 10045 : iv_loop = loop;
9009 : 10045 : gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9010 : :
9011 : : /* TODO: Support multi-lane SLP for nonlinear iv. There should be a
9012 : : separate vector iv update for each iv and a permutation to generate
9013 : : the wanted vector iv. */
9014 : 10045 : if (SLP_TREE_LANES (slp_node) > 1)
9015 : : {
9016 : 0 : if (dump_enabled_p ())
9017 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9018 : : "SLP induction not supported for nonlinear"
9019 : : " induction.\n");
9020 : 0 : return false;
9021 : : }
9022 : :
9023 : 10045 : if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
9024 : : {
9025 : 0 : if (dump_enabled_p ())
9026 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9027 : : "floating point nonlinear induction vectorization"
9028 : : " not supported.\n");
9029 : 0 : return false;
9030 : : }
9031 : :
9032 : 10045 : step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9033 : 10045 : init_expr = vect_phi_initial_value (phi);
9034 : 10045 : gcc_assert (step_expr != NULL_TREE && init_expr != NULL
9035 : : && TREE_CODE (step_expr) == INTEGER_CST);
9036 : : /* step_expr should be aligned with init_expr,
9037 : : i.e. for uint64 a >> 1 the step is int, but a vector<uint64> shift is used. */
9038 : 10045 : step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
9039 : :
9040 : 10045 : if (TREE_CODE (init_expr) == INTEGER_CST)
9041 : 2490 : init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
9042 : 7555 : else if (!tree_nop_conversion_p (TREE_TYPE (vectype), TREE_TYPE (init_expr)))
9043 : : {
9044 : : /* INIT_EXPR could be a bit_field, bail out for such case. */
9045 : 4 : if (dump_enabled_p ())
9046 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9047 : : "nonlinear induction vectorization failed:"
9048 : : " component type of vectype is not a nop conversion"
9049 : : " from type of init_expr.\n");
9050 : 4 : return false;
9051 : : }
9052 : :
9053 : 10041 : switch (induction_type)
9054 : : {
9055 : 2207 : case vect_step_op_neg:
9056 : 2207 : if (maybe_eq (TYPE_VECTOR_SUBPARTS (vectype), 1u))
9057 : : return false;
9058 : 2203 : if (TREE_CODE (init_expr) != INTEGER_CST
9059 : 190 : && TREE_CODE (init_expr) != REAL_CST)
9060 : : {
9061 : : /* Check for backend support of NEGATE_EXPR and vec_perm. */
9062 : 190 : if (!directly_supported_p (NEGATE_EXPR, vectype))
9063 : 0 : return false;
9064 : :
9065 : : /* The encoding has 2 interleaved stepped patterns. */
9066 : 190 : vec_perm_builder sel (nunits, 2, 3);
9067 : 190 : machine_mode mode = TYPE_MODE (vectype);
9068 : 190 : sel.quick_grow (6);
9069 : 950 : for (i = 0; i < 3; i++)
9070 : : {
9071 : 570 : sel[i * 2] = i;
9072 : 570 : sel[i * 2 + 1] = i + nunits;
9073 : : }
9074 : 190 : vec_perm_indices indices (sel, 2, nunits);
9075 : 190 : if (!can_vec_perm_const_p (mode, mode, indices))
9076 : 0 : return false;
9077 : 190 : }
9078 : : break;
9079 : :
9080 : 724 : case vect_step_op_mul:
9081 : 724 : {
9082 : : /* Check for backend support of MULT_EXPR. */
9083 : 724 : if (!directly_supported_p (MULT_EXPR, vectype))
9084 : : return false;
9085 : :
9086 : : /* ?? How to construct the vector step for a variable-length vector:
9087 : : [ 1, step, pow (step, 2), pow (step, 3), .. ]. */
9088 : : if (!vf.is_constant ())
9089 : : return false;
9090 : : }
9091 : : break;
9092 : :
9093 : 7014 : case vect_step_op_shr:
9094 : : /* Check for backend support of RSHIFT_EXPR. */
9095 : 7014 : if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
9096 : : return false;
9097 : :
9098 : : /* Don't shift more than type precision to avoid UD. */
9099 : 26 : if (!tree_fits_uhwi_p (step_expr)
9100 : 26 : || maybe_ge (nunits * tree_to_uhwi (step_expr),
9101 : : TYPE_PRECISION (TREE_TYPE (init_expr))))
9102 : : return false;
9103 : : break;
9104 : :
9105 : 96 : case vect_step_op_shl:
9106 : : /* Check for backend support of LSHIFT_EXPR. */
9107 : 96 : if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
9108 : : return false;
9109 : :
9110 : : /* Don't shift more than type precision to avoid UD. */
9111 : 12 : if (!tree_fits_uhwi_p (step_expr)
9112 : 12 : || maybe_ge (nunits * tree_to_uhwi (step_expr),
9113 : : TYPE_PRECISION (TREE_TYPE (init_expr))))
9114 : : return false;
9115 : :
9116 : : break;
9117 : :
9118 : 0 : default:
9119 : 0 : gcc_unreachable ();
9120 : : }
9121 : :
9122 : 2811 : if (cost_vec) /* transformation not required. */
9123 : : {
9124 : 1978 : unsigned inside_cost = 0, prologue_cost = 0;
9125 : : /* loop cost for vec_loop. */
9127 : 1978 : inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9128 : : slp_node, 0, vect_body);
9129 : :
9130 : : /* Neg induction doesn't have any inside_cost. */
9132 : 1978 : if (induction_type == vect_step_op_neg)
9133 : 1464 : inside_cost = 0;
9134 : :
9135 : : /* prologue cost for vec_init and vec_step. */
9136 : 1978 : prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9137 : : slp_node, 0, vect_prologue);
9138 : :
9139 : 1978 : if (dump_enabled_p ())
9140 : 60 : dump_printf_loc (MSG_NOTE, vect_location,
9141 : : "vect_model_induction_cost: inside_cost = %d, "
9142 : : "prologue_cost = %d. \n", inside_cost,
9143 : : prologue_cost);
9144 : :
9145 : 1978 : SLP_TREE_TYPE (slp_node) = induc_vec_info_type;
9146 : 1978 : DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9147 : 1978 : return true;
9148 : : }
9149 : :
9150 : : /* Transform. */
9151 : :
9152 : : /* Compute a vector variable, initialized with the first VF values of
9153 : : the induction variable. E.g., for an iv with IV_PHI='X' and
9154 : : evolution S, for a vector of 4 units, we want to compute:
9155 : : [X, X + S, X + 2*S, X + 3*S]. */
9156 : :
9157 : 833 : if (dump_enabled_p ())
9158 : 32 : dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9159 : :
9160 : 833 : pe = loop_preheader_edge (iv_loop);
9161 : : /* Find the first insertion point in the BB. */
9162 : 833 : basic_block bb = gimple_bb (phi);
9163 : 833 : si = gsi_after_labels (bb);
9164 : :
9165 : 833 : gimple_seq stmts = NULL;
9166 : :
9167 : 833 : niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9168 : : /* If we are using the loop mask to "peel" for alignment then we need
9169 : : to adjust the start value here. */
9170 : 833 : if (niters_skip != NULL_TREE)
9171 : 0 : init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
9172 : : step_expr, induction_type);
9173 : :
9174 : 833 : vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
9175 : : step_expr, nunits, vectype,
9176 : : induction_type);
9177 : 833 : if (stmts)
9178 : : {
9179 : 162 : new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9180 : 162 : gcc_assert (!new_bb);
9181 : : }
9182 : :
9183 : 833 : stmts = NULL;
9184 : 833 : new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9185 : : vf, induction_type);
9186 : 833 : if (stmts)
9187 : : {
9188 : 0 : new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9189 : 0 : gcc_assert (!new_bb);
9190 : : }
9191 : :
9192 : 833 : vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9193 : : new_name, vectype,
9194 : : induction_type);
9195 : : /* Create the following def-use cycle:
9196 : : loop prolog:
9197 : : vec_init = ...
9198 : : vec_step = ...
9199 : : loop:
9200 : : vec_iv = PHI <vec_init, vec_loop>
9201 : : ...
9202 : : STMT
9203 : : ...
9204 : : vec_loop = vec_iv + vec_step; */
9205 : :
9206 : : /* Create the induction-phi that defines the induction-operand. */
9207 : 833 : vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9208 : 833 : induction_phi = create_phi_node (vec_dest, iv_loop->header);
9209 : 833 : induc_def = PHI_RESULT (induction_phi);
9210 : :
9211 : : /* Create the iv update inside the loop. */
9212 : 833 : stmts = NULL;
9213 : 833 : vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9214 : : induc_def, vec_step,
9215 : : induction_type);
9216 : :
9217 : 833 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9218 : 833 : new_stmt = SSA_NAME_DEF_STMT (vec_def);
9219 : :
9220 : : /* Set the arguments of the phi node: */
9221 : 833 : add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9222 : 833 : add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9223 : : UNKNOWN_LOCATION);
9224 : :
9225 : 833 : slp_node->push_vec_def (induction_phi);
9226 : :
9227 : : /* In case the vectorization factor (VF) is bigger than the number
9228 : : of elements that fit in a vectype (nunits), we have to generate
9229 : : more than one vector stmt, i.e. we need to "unroll" the
9230 : : vector stmt by a factor of VF/nunits. For more details see documentation
9231 : : in vectorizable_operation. */
9232 : :
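 : : /* Hedged example: with VF 8 and V4SI (nunits 4) we need ncopies == 2
 : : vectors per IV; the loop below derives each further copy from the
 : : previous one by applying a step that covers nunits scalar
 : : iterations, e.g. for a mult IV with step 2 the second copy is the
 : : first multiplied lane-wise by pow (2, 4) = 16. */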
9233 : 833 : if (ncopies > 1)
9234 : : {
9235 : 235 : stmts = NULL;
9236 : : /* FORNOW. This restriction should be relaxed. */
9237 : 235 : gcc_assert (!nested_in_vect_loop);
9238 : :
9239 : 235 : new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9240 : : nunits, induction_type);
9241 : :
9242 : 235 : vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9243 : : new_name, vectype,
9244 : : induction_type);
9245 : 235 : vec_def = induc_def;
9246 : 887 : for (i = 1; i < ncopies; i++)
9247 : : {
9248 : : /* vec_i = vec_prev + vec_step. */
9249 : 417 : stmts = NULL;
9250 : 417 : vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9251 : : vec_def, vec_step,
9252 : : induction_type);
9253 : 417 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9254 : 417 : new_stmt = SSA_NAME_DEF_STMT (vec_def);
9255 : 417 : slp_node->push_vec_def (new_stmt);
9256 : : }
9257 : : }
9258 : :
9259 : 833 : if (dump_enabled_p ())
9260 : 64 : dump_printf_loc (MSG_NOTE, vect_location,
9261 : : "transform induction: created def-use cycle: %G%G",
9262 : 32 : (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9263 : :
9264 : : return true;
9265 : : }
9266 : :
9267 : : /* Function vectorizable_induction
9268 : :
9269 : : Check if STMT_INFO performs an induction computation that can be vectorized.
9270 : : If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9271 : : phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9272 : : Return true if STMT_INFO is vectorizable in this way. */
9273 : :
9274 : : bool
9275 : 267389 : vectorizable_induction (loop_vec_info loop_vinfo,
9276 : : stmt_vec_info stmt_info,
9277 : : slp_tree slp_node, stmt_vector_for_cost *cost_vec)
9278 : : {
9279 : 267389 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9280 : 267389 : bool nested_in_vect_loop = false;
9281 : 267389 : class loop *iv_loop;
9282 : 267389 : tree vec_def;
9283 : 267389 : edge pe = loop_preheader_edge (loop);
9284 : 267389 : basic_block new_bb;
9285 : 267389 : tree vec_init = NULL_TREE, vec_step, t;
9286 : 267389 : tree new_name;
9287 : 267389 : gphi *induction_phi;
9288 : 267389 : tree induc_def, vec_dest;
9289 : 267389 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9290 : 267389 : unsigned i;
9291 : 267389 : tree expr;
9292 : 267389 : tree index_vectype = NULL_TREE;
9293 : 267389 : gimple_stmt_iterator si;
9294 : 267389 : enum vect_induction_op_type induction_type
9295 : : = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9296 : :
9297 : 291342 : gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9298 : 142565 : if (!phi)
9299 : : return false;
9300 : :
9301 : 142565 : if (!STMT_VINFO_RELEVANT_P (stmt_info))
9302 : : return false;
9303 : :
9304 : : /* Make sure it was recognized as induction computation. */
9305 : 142565 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
9306 : : return false;
9307 : :
9308 : : /* Handle nonlinear induction in a separate place. */
9309 : 138412 : if (induction_type != vect_step_op_add)
9310 : 10045 : return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
9311 : 10045 : slp_node, cost_vec);
9312 : :
9313 : 128367 : tree vectype = SLP_TREE_VECTYPE (slp_node);
9314 : 128367 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9315 : :
9316 : : /* FORNOW. These restrictions should be relaxed. */
9317 : 128367 : if (nested_in_vect_loop_p (loop, stmt_info))
9318 : : {
9319 : 584 : imm_use_iterator imm_iter;
9320 : 584 : use_operand_p use_p;
9321 : 584 : gimple *exit_phi;
9322 : 584 : edge latch_e;
9323 : 584 : tree loop_arg;
9324 : :
9325 : 584 : exit_phi = NULL;
9326 : 584 : latch_e = loop_latch_edge (loop->inner);
9327 : 584 : loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
9328 : 1210 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
9329 : : {
9330 : 636 : gimple *use_stmt = USE_STMT (use_p);
9331 : 636 : if (is_gimple_debug (use_stmt))
9332 : 36 : continue;
9333 : :
9334 : 600 : if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
9335 : : {
9336 : : exit_phi = use_stmt;
9337 : : break;
9338 : : }
9339 : : }
9340 : 584 : if (exit_phi)
9341 : : {
9342 : 10 : stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
9343 : 10 : if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
9344 : 6 : && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
9345 : : {
9346 : 4 : if (dump_enabled_p ())
9347 : 4 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9348 : : "inner-loop induction only used outside "
9349 : : "of the outer vectorized loop.\n");
9350 : 4 : return false;
9351 : : }
9352 : : }
9353 : :
9354 : 580 : nested_in_vect_loop = true;
9355 : 580 : iv_loop = loop->inner;
9356 : : }
9357 : : else
9358 : : iv_loop = loop;
9359 : 128363 : gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9360 : :
9361 : 128363 : if (!nunits.is_constant () && SLP_TREE_LANES (slp_node) != 1)
9362 : : {
9363 : : /* The current SLP code creates the step value element-by-element. */
9364 : : if (dump_enabled_p ())
9365 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9366 : : "SLP induction not supported for variable-length"
9367 : : " vectors.\n");
9368 : : return false;
9369 : : }
9370 : :
9371 : 128363 : if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
9372 : : {
9373 : 12 : if (dump_enabled_p ())
9374 : 12 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9375 : : "floating point induction vectorization disabled\n");
9376 : 12 : return false;
9377 : : }
9378 : :
9379 : 128351 : tree step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9380 : 128351 : gcc_assert (step_expr != NULL_TREE);
9381 : 256656 : if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
9382 : 256564 : && !type_has_mode_precision_p (TREE_TYPE (step_expr)))
9383 : : {
9384 : 12 : if (dump_enabled_p ())
9385 : 12 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9386 : : "bit-precision induction vectorization not "
9387 : : "supported.\n");
9388 : 12 : return false;
9389 : : }
9390 : 128339 : tree stept = TREE_TYPE (step_expr);
9391 : 128339 : tree step_vectype = get_same_sized_vectype (stept, vectype);
9392 : 128339 : stept = TREE_TYPE (step_vectype);
9393 : :
9394 : : /* Check for target support of the vectorized arithmetic used here. */
9395 : 128339 : if (!target_supports_op_p (step_vectype, PLUS_EXPR, optab_default)
9396 : 128339 : || !target_supports_op_p (step_vectype, MINUS_EXPR, optab_default))
9397 : 19772 : return false;
9398 : 108567 : if (!nunits.is_constant ())
9399 : : {
9400 : : if (!target_supports_op_p (step_vectype, MULT_EXPR, optab_default))
9401 : : return false;
9402 : : /* FLOAT_EXPR when computing VEC_INIT for float inductions. */
9403 : : if (SCALAR_FLOAT_TYPE_P (stept))
9404 : : {
9405 : : tree index_type = build_nonstandard_integer_type
9406 : : (GET_MODE_BITSIZE (SCALAR_TYPE_MODE (stept)), 1);
9407 : :
9408 : : index_vectype = build_vector_type (index_type, nunits);
9409 : : if (!can_float_p (TYPE_MODE (step_vectype),
9410 : : TYPE_MODE (index_vectype), 1))
9411 : : return false;
9412 : : }
9413 : : }
9414 : :
9415 : 108567 : unsigned nvects = vect_get_num_copies (loop_vinfo, slp_node);
9416 : 108567 : if (cost_vec) /* transformation not required. */
9417 : : {
9418 : 276291 : unsigned inside_cost = 0, prologue_cost = 0;
9419 : : /* We eventually need to set a vector type on invariant
9420 : : arguments. */
9421 : : unsigned j;
9422 : : slp_tree child;
9423 : 276291 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9424 : 184194 : if (!vect_maybe_update_slp_op_vectype
9425 : 184194 : (child, SLP_TREE_VECTYPE (slp_node)))
9426 : : {
9427 : 0 : if (dump_enabled_p ())
9428 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9429 : : "incompatible vector types for "
9430 : : "invariants\n");
9431 : 0 : return false;
9432 : : }
9433 : : /* loop cost for vec_loop. */
9434 : 92097 : inside_cost = record_stmt_cost (cost_vec, nvects,
9435 : : vector_stmt, slp_node, 0, vect_body);
9436 : : /* prologue cost for vec_init (if not nested) and step. */
9437 : 92097 : prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
9438 : : scalar_to_vec,
9439 : : slp_node, 0, vect_prologue);
9440 : 92097 : if (dump_enabled_p ())
9441 : 3987 : dump_printf_loc (MSG_NOTE, vect_location,
9442 : : "vect_model_induction_cost: inside_cost = %d, "
9443 : : "prologue_cost = %d .\n", inside_cost,
9444 : : prologue_cost);
9445 : :
9446 : 92097 : SLP_TREE_TYPE (slp_node) = induc_vec_info_type;
9447 : 92097 : DUMP_VECT_SCOPE ("vectorizable_induction");
9448 : 92097 : return true;
9449 : : }
9450 : :
9451 : : /* Transform. */
9452 : :
9453 : : /* Compute a vector variable, initialized with the first VF values of
9454 : : the induction variable. E.g., for an iv with IV_PHI='X' and
9455 : : evolution S, for a vector of 4 units, we want to compute:
9456 : : [X, X + S, X + 2*S, X + 3*S]. */
9457 : :
9458 : 16470 : if (dump_enabled_p ())
9459 : 2818 : dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9460 : :
9461 : 16470 : pe = loop_preheader_edge (iv_loop);
9462 : : /* Find the first insertion point in the BB. */
9463 : 16470 : basic_block bb = gimple_bb (phi);
9464 : 16470 : si = gsi_after_labels (bb);
9465 : :
9466 : : /* For SLP induction we have to generate several IVs as for example
9467 : : with group size 3 we need
9468 : : [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
9469 : : [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
9470 : 16470 : gimple_stmt_iterator incr_si;
9471 : 16470 : bool insert_after;
9472 : 16470 : standard_iv_increment_position (iv_loop, &incr_si, &insert_after);
9473 : :
9474 : : /* The initial values are vectorized, but any lanes > group_size
9475 : : need adjustment. */
9476 : 16470 : slp_tree init_node
9477 : 16470 : = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
9478 : :
9479 : : /* Gather steps. Since we do not vectorize inductions as
9480 : : cycles we have to reconstruct the step from SCEV data. */
9481 : 16470 : unsigned group_size = SLP_TREE_LANES (slp_node);
9482 : 16470 : tree *steps = XALLOCAVEC (tree, group_size);
9483 : 16470 : tree *inits = XALLOCAVEC (tree, group_size);
9484 : 16470 : stmt_vec_info phi_info;
9485 : 50650 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
9486 : : {
9487 : 17710 : steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
9488 : 17710 : if (!init_node)
9489 : 17534 : inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
9490 : : pe->dest_idx);
9491 : : }
9492 : :
9493 : : /* Now generate the IVs. */
9494 : 32940 : gcc_assert (multiple_p (nunits * nvects, group_size));
9495 : 16470 : unsigned nivs;
9496 : 16470 : unsigned HOST_WIDE_INT const_nunits;
9497 : 16470 : if (nested_in_vect_loop)
9498 : : nivs = nvects;
9499 : 16308 : else if (nunits.is_constant (&const_nunits))
9500 : : {
9501 : : /* Compute the number of distinct IVs we need. First reduce
9502 : : group_size if it is a multiple of const_nunits so we get
9503 : : one IV for a group_size of 4 but const_nunits 2. */
9504 : 16308 : unsigned group_sizep = group_size;
9505 : 16308 : if (group_sizep % const_nunits == 0)
9506 : 109 : group_sizep = group_sizep / const_nunits;
9507 : 16308 : nivs = least_common_multiple (group_sizep, const_nunits) / const_nunits;
9508 : : }
9509 : : else
9510 : : {
9511 : : gcc_assert (SLP_TREE_LANES (slp_node) == 1);
9512 : : nivs = 1;
9513 : : }
9514 : 16470 : gimple_seq init_stmts = NULL;
9515 : 16470 : tree lupdate_mul = NULL_TREE;
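 : : /* LUPDATE_MUL is the number of scalar iterations one vector loop iteration covers; the IV update on the latch adds STEP * LUPDATE_MUL. */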
9516 : 162 : if (!nested_in_vect_loop)
9517 : : {
9518 : 16308 : if (nunits.is_constant (&const_nunits))
9519 : : {
9520 : : /* The number of iterations covered in one vector iteration. */
9521 : 16308 : unsigned lup_mul = (nvects * const_nunits) / group_size;
9522 : 16308 : lupdate_mul
9523 : 16308 : = build_vector_from_val (step_vectype,
9524 : 16308 : SCALAR_FLOAT_TYPE_P (stept)
9525 : 27 : ? build_real_from_wide (stept, lup_mul,
9526 : : UNSIGNED)
9527 : 32589 : : build_int_cstu (stept, lup_mul));
9528 : : }
9529 : : else
9530 : : {
9531 : : if (SCALAR_FLOAT_TYPE_P (stept))
9532 : : {
9533 : : tree tem = build_int_cst (integer_type_node, vf);
9534 : : lupdate_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept, tem);
9535 : : }
9536 : : else
9537 : : lupdate_mul = build_int_cst (stept, vf);
9538 : : lupdate_mul = gimple_build_vector_from_val (&init_stmts, step_vectype,
9539 : : lupdate_mul);
9540 : : }
9541 : : }
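 : : /* When masking skips the first few scalar iterations, PEEL_MUL holds that skip count broadcast as a vector; it is subtracted from STEP_MUL below so the initial IV values match the first active lane. */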
9542 : 16470 : tree peel_mul = NULL_TREE;
9543 : 16470 : if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
9544 : : {
9545 : 0 : if (SCALAR_FLOAT_TYPE_P (stept))
9546 : 0 : peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
9547 : : LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9548 : : else
9549 : 0 : peel_mul = gimple_convert (&init_stmts, stept,
9550 : : LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9551 : 0 : peel_mul = gimple_build_vector_from_val (&init_stmts,
9552 : : step_vectype, peel_mul);
9553 : :
9554 : : /* If early break then we have to create a new PHI which we can use as
9555 : : an offset to adjust the induction reduction in early exits.
9556 : :
9557 : : This is because when peeling for alignment using masking, the first
9558 : : few elements of the vector can be inactive. As such if we find the
9559 : : entry in the first iteration we have adjust the starting point of
9560 : : the scalar code.
9561 : :
9562 : : We do this by creating a new scalar PHI that keeps track of whether
9563 : : we are in the first iteration of the loop (with the additional masking)
9564 : : or whether we have taken a loop iteration already.
9565 : :
9566 : : The generated sequence:
9567 : :
9568 : : pre-header:
9569 : : bb1:
9570 : : i_1 = <number of leading inactive elements>
9571 : :
9572 : : header:
9573 : : bb2:
9574 : : i_2 = PHI <i_1(bb1), 0(latch)>
9575 : : …
9576 : :
9577 : : early-exit:
9578 : : bb3:
9579 : : i_3 = iv_step * i_2 + PHI<vector-iv>
9580 : :
9581 : : The first part of the adjustment to create i_1 and i_2 are done here
9582 : : and the last part creating i_3 is done in
9583 : : vectorizable_live_operations when the induction extraction is
9584 : : materialized. */
9585 : 0 : if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
9586 : 0 : && !LOOP_VINFO_MASK_NITERS_PFA_OFFSET (loop_vinfo))
9587 : : {
9588 : 0 : auto skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9589 : 0 : tree ty_skip_niters = TREE_TYPE (skip_niters);
9590 : 0 : tree break_lhs_phi = vect_get_new_vect_var (ty_skip_niters,
9591 : : vect_scalar_var,
9592 : : "pfa_iv_offset");
9593 : 0 : gphi *nphi = create_phi_node (break_lhs_phi, bb);
9594 : 0 : add_phi_arg (nphi, skip_niters, pe, UNKNOWN_LOCATION);
9595 : 0 : add_phi_arg (nphi, build_zero_cst (ty_skip_niters),
9596 : : loop_latch_edge (iv_loop), UNKNOWN_LOCATION);
9597 : :
9598 : 0 : LOOP_VINFO_MASK_NITERS_PFA_OFFSET (loop_vinfo) = PHI_RESULT (nphi);
9599 : : }
9600 : : }
9601 : 16470 : tree step_mul = NULL_TREE;
9602 : 16470 : unsigned ivn;
9603 : 16470 : auto_vec<tree> vec_steps;
9604 : 33510 : for (ivn = 0; ivn < nivs; ++ivn)
9605 : : {
9606 : 17040 : gimple_seq stmts = NULL;
9607 : 17040 : bool invariant = true;
9608 : 17040 : if (nunits.is_constant (&const_nunits))
9609 : : {
9610 : 17040 : tree_vector_builder step_elts (step_vectype, const_nunits, 1);
9611 : 17040 : tree_vector_builder init_elts (vectype, const_nunits, 1);
9612 : 17040 : tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
9613 : 112614 : for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
9614 : : {
9615 : : /* The scalar steps of the IVs. */
9616 : 95574 : tree elt = steps[(ivn*const_nunits + eltn) % group_size];
9617 : 95574 : elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
9618 : 95574 : step_elts.quick_push (elt);
9619 : 95574 : if (!init_node)
9620 : : {
9621 : : /* The scalar inits of the IVs if not vectorized. */
9622 : 94612 : elt = inits[(ivn*const_nunits + eltn) % group_size];
9623 : 94612 : if (!useless_type_conversion_p (TREE_TYPE (vectype),
9624 : 94612 : TREE_TYPE (elt)))
9625 : 286 : elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
9626 : 286 : TREE_TYPE (vectype), elt);
9627 : 94612 : init_elts.quick_push (elt);
9628 : : }
9629 : : /* The number of steps to add to the initial values. */
9630 : 95574 : unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
9631 : 191148 : mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
9632 : 191050 : ? build_real_from_wide (stept, mul_elt,
9633 : : UNSIGNED)
9634 : 191050 : : build_int_cstu (stept, mul_elt));
9635 : : }
9636 : 17040 : vec_step = gimple_build_vector (&init_stmts, &step_elts);
9637 : 17040 : step_mul = gimple_build_vector (&init_stmts, &mul_elts);
9638 : 17040 : if (!init_node)
9639 : 16846 : vec_init = gimple_build_vector (&init_stmts, &init_elts);
9640 : 17040 : }
9641 : : else
9642 : : {
9643 : : if (init_node)
9644 : : ;
9645 : : else if (INTEGRAL_TYPE_P (TREE_TYPE (steps[0])))
9646 : : {
9647 : : new_name = gimple_convert (&init_stmts, stept, inits[0]);
9648 : : /* Build the initial value directly as a VEC_SERIES_EXPR. */
9649 : : vec_init = gimple_build (&init_stmts, VEC_SERIES_EXPR,
9650 : : step_vectype, new_name, steps[0]);
9651 : : if (!useless_type_conversion_p (vectype, step_vectype))
9652 : : vec_init = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
9653 : : vectype, vec_init);
9654 : : }
9655 : : else
9656 : : {
9657 : : /* Build:
9658 : : [base, base, base, ...]
9659 : : + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
9660 : : gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (steps[0])));
9661 : : gcc_assert (flag_associative_math);
9662 : : gcc_assert (index_vectype != NULL_TREE);
9663 : :
9664 : : tree index = build_index_vector (index_vectype, 0, 1);
9665 : : new_name = gimple_convert (&init_stmts, TREE_TYPE (steps[0]),
9666 : : inits[0]);
9667 : : tree base_vec = gimple_build_vector_from_val (&init_stmts,
9668 : : step_vectype,
9669 : : new_name);
9670 : : tree step_vec = gimple_build_vector_from_val (&init_stmts,
9671 : : step_vectype,
9672 : : steps[0]);
9673 : : vec_init = gimple_build (&init_stmts, FLOAT_EXPR,
9674 : : step_vectype, index);
9675 : : vec_init = gimple_build (&init_stmts, MULT_EXPR,
9676 : : step_vectype, vec_init, step_vec);
9677 : : vec_init = gimple_build (&init_stmts, PLUS_EXPR,
9678 : : step_vectype, vec_init, base_vec);
9679 : : if (!useless_type_conversion_p (vectype, step_vectype))
9680 : : vec_init = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
9681 : : vectype, vec_init);
9682 : : }
9683 : : /* iv_loop is nested in the loop to be vectorized. Generate:
9684 : : vec_step = [S, S, S, S] */
9685 : : t = unshare_expr (steps[0]);
9686 : : gcc_assert (CONSTANT_CLASS_P (t)
9687 : : || TREE_CODE (t) == SSA_NAME);
9688 : : vec_step = gimple_build_vector_from_val (&init_stmts,
9689 : : step_vectype, t);
9690 : : }
9691 : 17040 : vec_steps.safe_push (vec_step);
9692 : 17040 : if (peel_mul)
9693 : : {
9694 : 0 : if (!step_mul)
9695 : : {
9696 : 0 : gcc_assert (!nunits.is_constant ());
9697 : : step_mul = gimple_build (&init_stmts,
9698 : : MINUS_EXPR, step_vectype,
9699 : : build_zero_cst (step_vectype), peel_mul);
9700 : : }
9701 : : else
9702 : 0 : step_mul = gimple_build (&init_stmts,
9703 : : MINUS_EXPR, step_vectype,
9704 : : step_mul, peel_mul);
9705 : : }
9706 : :
9707 : : /* Create the induction-phi that defines the induction-operand. */
9708 : 17040 : vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
9709 : : "vec_iv_");
9710 : 17040 : induction_phi = create_phi_node (vec_dest, iv_loop->header);
9711 : 17040 : induc_def = PHI_RESULT (induction_phi);
9712 : :
9713 : : /* Create the iv update inside the loop */
9714 : 17040 : tree up = vec_step;
9715 : 17040 : if (lupdate_mul)
9716 : : {
9717 : 16846 : if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
9718 : : {
9719 : : /* When we're using the loop_len produced by SELECT_VL, the
9720 : : non-final iterations are not always processing VF
9721 : : elements. So instead of vectorizing the induction update as
9722 : :
9723 : : _21 = vect_vec_iv_.6_22 + { VF, ... };
9724 : :
9725 : : we should generate:
9726 : :
9727 : : _35 = .SELECT_VL (ivtmp_33, VF);
9728 : : vect_cst__22 = [vec_duplicate_expr] _35;
9729 : : _21 = vect_vec_iv_.6_22 + vect_cst__22; */
9730 : 0 : vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
9731 : 0 : tree len = vect_get_loop_len (loop_vinfo, NULL, lens, 1,
9732 : : vectype, 0, 0);
9733 : 0 : if (SCALAR_FLOAT_TYPE_P (stept))
9734 : 0 : expr = gimple_build (&stmts, FLOAT_EXPR, stept, len);
9735 : : else
9736 : 0 : expr = gimple_convert (&stmts, stept, len);
9737 : 0 : lupdate_mul = gimple_build_vector_from_val (&stmts, step_vectype,
9738 : : expr);
9739 : 0 : up = gimple_build (&stmts, MULT_EXPR,
9740 : : step_vectype, vec_step, lupdate_mul);
9741 : : }
9742 : : else
9743 : 16846 : up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9744 : : vec_step, lupdate_mul);
9745 : : }
9746 : 17040 : vec_def = gimple_convert (&stmts, step_vectype, induc_def);
9747 : 17040 : vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, up);
9748 : 17040 : vec_def = gimple_convert (&stmts, vectype, vec_def);
9749 : 17040 : insert_iv_increment (&incr_si, insert_after, stmts);
9750 : 17040 : add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9751 : : UNKNOWN_LOCATION);
9752 : :
9753 : 17040 : if (init_node)
9754 : 194 : vec_init = vect_get_slp_vect_def (init_node, ivn);
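 : : /* Bias the initial values by STEP_MUL steps of the IV, i.e. vec_init += vec_step * step_mul, so each lane starts at the scalar iteration it represents. */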
9755 : 17040 : if (!nested_in_vect_loop
9756 : 17040 : && step_mul
9757 : 17040 : && !integer_zerop (step_mul))
9758 : : {
9759 : 16408 : gcc_assert (invariant);
9760 : 16408 : vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
9761 : 16408 : up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9762 : : vec_step, step_mul);
9763 : 16408 : vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
9764 : : vec_def, up);
9765 : 16408 : vec_init = gimple_convert (&init_stmts, vectype, vec_def);
9766 : : }
9767 : :
9768 : : /* Set the arguments of the phi node: */
9769 : 17040 : add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9770 : :
9771 : 17040 : slp_node->push_vec_def (induction_phi);
9772 : : }
9773 : 16470 : if (!nested_in_vect_loop)
9774 : : {
9775 : : /* Fill up to the number of vectors we need for the whole group. */
9776 : 16308 : if (nunits.is_constant (&const_nunits))
9777 : 16308 : nivs = least_common_multiple (group_size, const_nunits) / const_nunits;
9778 : : else
9779 : : nivs = 1;
9780 : 16308 : vec_steps.reserve (nivs-ivn);
9781 : 32643 : for (; ivn < nivs; ++ivn)
9782 : : {
9783 : 27 : slp_node->push_vec_def (SLP_TREE_VEC_DEFS (slp_node)[0]);
9784 : 27 : vec_steps.quick_push (vec_steps[0]);
9785 : : }
9786 : : }
9787 : :
9788 : : /* Re-use IVs when we can. We are generating further vector
9789 : : stmts by adding VF' * stride to the IVs generated above. */
9790 : 16470 : if (ivn < nvects)
9791 : : {
9792 : 3904 : if (nunits.is_constant (&const_nunits))
9793 : : {
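 : : /* VFP is the number of scalar iterations covered by the NIVS vector defs generated above; advancing an IV by VFP steps yields the next set of defs. */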
9794 : 3904 : unsigned vfp = (least_common_multiple (group_size, const_nunits)
9795 : 3904 : / group_size);
9796 : 3904 : lupdate_mul
9797 : 3904 : = build_vector_from_val (step_vectype,
9798 : 3904 : SCALAR_FLOAT_TYPE_P (stept)
9799 : 8 : ? build_real_from_wide (stept,
9800 : 8 : vfp, UNSIGNED)
9801 : 7800 : : build_int_cstu (stept, vfp));
9802 : : }
9803 : : else
9804 : : {
9805 : : if (SCALAR_FLOAT_TYPE_P (stept))
9806 : : {
9807 : : tree tem = build_int_cst (integer_type_node, nunits);
9808 : : lupdate_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept, tem);
9809 : : }
9810 : : else
9811 : : lupdate_mul = build_int_cst (stept, nunits);
9812 : : lupdate_mul = gimple_build_vector_from_val (&init_stmts, step_vectype,
9813 : : lupdate_mul);
9814 : : }
9815 : 12472 : for (; ivn < nvects; ++ivn)
9816 : : {
9817 : 8568 : gimple *iv
9818 : 8568 : = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node)[ivn - nivs]);
9819 : 8568 : tree def = gimple_get_lhs (iv);
9820 : 8568 : if (ivn < 2*nivs)
9821 : 4006 : vec_steps[ivn - nivs]
9822 : 4006 : = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9823 : 4006 : vec_steps[ivn - nivs], lupdate_mul);
9824 : 8568 : gimple_seq stmts = NULL;
9825 : 8568 : def = gimple_convert (&stmts, step_vectype, def);
9826 : 25704 : def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
9827 : 8568 : def, vec_steps[ivn % nivs]);
9828 : 8568 : def = gimple_convert (&stmts, vectype, def);
9829 : 8568 : if (gimple_code (iv) == GIMPLE_PHI)
9830 : 4006 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9831 : : else
9832 : : {
9833 : 4562 : gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
9834 : 4562 : gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
9835 : : }
9836 : 8568 : slp_node->push_vec_def (def);
9837 : : }
9838 : : }
9839 : :
9840 : 16470 : new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
9841 : 16470 : gcc_assert (!new_bb);
9842 : :
9843 : 16470 : return true;
9844 : 16470 : }
9845 : :
9846 : : /* Function vectorizable_live_operation_1.
9847 : :
9848 : : Helper function for vectorizable_live_operation. Insert a loop-closed PHI for VEC_LHS in EXIT_BB, extract the live lane given by BITSIZE and BITSTART from it, convert the result to LHS_TYPE and return it; *EXIT_GSI is set to the point where the extraction code was inserted. */
9849 : :
9850 : : static tree
9851 : 5258 : vectorizable_live_operation_1 (loop_vec_info loop_vinfo, basic_block exit_bb,
9852 : : tree vectype, slp_tree slp_node,
9853 : : tree bitsize, tree bitstart, tree vec_lhs,
9854 : : tree lhs_type, gimple_stmt_iterator *exit_gsi)
9855 : : {
9856 : 5258 : gcc_assert (single_pred_p (exit_bb) || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
9857 : :
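 : : /* Create the (loop-closed) PHI for the vectorized live-out value in the exit block; all incoming edges carry VEC_LHS. */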
9858 : 5258 : tree vec_lhs_phi = copy_ssa_name (vec_lhs);
9859 : 5258 : gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
9860 : 10909 : for (unsigned i = 0; i < gimple_phi_num_args (phi); i++)
9861 : 5651 : SET_PHI_ARG_DEF (phi, i, vec_lhs);
9862 : :
9863 : 5258 : gimple_seq stmts = NULL;
9864 : 5258 : tree new_tree;
9865 : :
9866 : : /* If bitstart is 0 then we can use a BIT_FIELD_REF */
9867 : 5258 : if (integer_zerop (bitstart))
9868 : : {
9869 : 2721 : tree scalar_res = gimple_build (&stmts, BIT_FIELD_REF, TREE_TYPE (vectype),
9870 : : vec_lhs_phi, bitsize, bitstart);
9871 : :
9872 : : /* Convert the extracted vector element to the scalar type. */
9873 : 2721 : new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
9874 : : }
9875 : 2537 : else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
9876 : : {
9877 : : /* Emit:
9878 : :
9879 : : SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN - (BIAS + 1)>
9880 : :
9881 : : where VEC_LHS is the vectorized live-out result, LEN is the length of
9882 : : the vector, BIAS is the load-store bias. The bias should not be used
9883 : : at all since we are not using load/store operations, but LEN will be
9884 : : REALLEN + BIAS, so subtract it to get to the correct position. */
9885 : 0 : gcc_assert (SLP_TREE_LANES (slp_node) == 1);
9886 : 0 : gimple_seq tem = NULL;
9887 : 0 : gimple_stmt_iterator gsi = gsi_last (tem);
9888 : 0 : tree len = vect_get_loop_len (loop_vinfo, &gsi,
9889 : : &LOOP_VINFO_LENS (loop_vinfo),
9890 : : 1, vectype, 0, 1);
9891 : 0 : gimple_seq_add_seq (&stmts, tem);
9892 : :
9893 : : /* BIAS + 1. */
9894 : 0 : signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
9895 : 0 : tree bias_plus_one
9896 : 0 : = int_const_binop (PLUS_EXPR,
9897 : 0 : build_int_cst (TREE_TYPE (len), biasval),
9898 : 0 : build_one_cst (TREE_TYPE (len)));
9899 : :
9900 : : /* LAST_INDEX = LEN - (BIAS + 1). */
9901 : 0 : tree last_index = gimple_build (&stmts, MINUS_EXPR, TREE_TYPE (len),
9902 : : len, bias_plus_one);
9903 : :
9904 : : /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN - (BIAS + 1)>. */
9905 : 0 : tree scalar_res
9906 : 0 : = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
9907 : : vec_lhs_phi, last_index);
9908 : :
9909 : : /* Convert the extracted vector element to the scalar type. */
9910 : 0 : new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
9911 : : }
9912 : 2537 : else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
9913 : : {
9914 : : /* Emit:
9915 : :
9916 : : SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
9917 : :
9918 : : where VEC_LHS is the vectorized live-out result and MASK is
9919 : : the loop mask for the final iteration. */
9920 : 0 : gcc_assert (SLP_TREE_LANES (slp_node) == 1);
9921 : 0 : tree scalar_type = TREE_TYPE (vectype);
9922 : 0 : gimple_seq tem = NULL;
9923 : 0 : gimple_stmt_iterator gsi = gsi_last (tem);
9924 : 0 : tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
9925 : : &LOOP_VINFO_MASKS (loop_vinfo),
9926 : : 1, vectype, 0);
9927 : 0 : tree scalar_res;
9928 : 0 : gimple_seq_add_seq (&stmts, tem);
9929 : :
9930 : 0 : scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
9931 : : mask, vec_lhs_phi);
9932 : :
9933 : : /* Convert the extracted vector element to the scalar type. */
9934 : 0 : new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
9935 : : }
9936 : : else
9937 : : {
9938 : 2537 : tree bftype = TREE_TYPE (vectype);
9939 : 2537 : if (VECTOR_BOOLEAN_TYPE_P (vectype))
9940 : 87 : bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
9941 : 2537 : new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
9942 : 2537 : new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
9943 : : &stmts, true, NULL_TREE);
9944 : : }
9945 : :
9946 : 5258 : *exit_gsi = gsi_after_labels (exit_bb);
9947 : 5258 : if (stmts)
9948 : 5258 : gsi_insert_seq_before (exit_gsi, stmts, GSI_SAME_STMT);
9949 : :
9950 : 5258 : return new_tree;
9951 : : }
9952 : :
9953 : : /* Function vectorizable_live_operation.
9954 : :
9955 : : STMT_INFO computes a value that is used outside the loop. Check if
9956 : : it can be supported. */
9957 : :
9958 : : bool
9959 : 274660 : vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
9960 : : slp_tree slp_node, slp_instance slp_node_instance,
9961 : : int slp_index, bool vec_stmt_p,
9962 : : stmt_vector_for_cost *cost_vec)
9963 : : {
9964 : 274660 : loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
9965 : 274660 : imm_use_iterator imm_iter;
9966 : 274660 : tree lhs, lhs_type, bitsize;
9967 : 274660 : tree vectype = SLP_TREE_VECTYPE (slp_node);
9968 : 274660 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9969 : 274660 : gimple *use_stmt;
9970 : 274660 : use_operand_p use_p;
9971 : 274660 : auto_vec<tree> vec_oprnds;
9972 : 274660 : int vec_entry = 0;
9973 : 274660 : poly_uint64 vec_index = 0;
9974 : :
9975 : 274660 : gcc_assert (STMT_VINFO_LIVE_P (stmt_info)
9976 : : || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
9977 : :
9978 : : /* If a stmt of a reduction is live, vectorize it via
9979 : : vect_create_epilog_for_reduction. vectorizable_reduction assessed
9980 : : validity so just trigger the transform here. */
9981 : 274660 : if (vect_is_reduction (slp_node))
9982 : : {
9983 : 52972 : if (!vec_stmt_p)
9984 : : return true;
9985 : : /* For SLP reductions we vectorize the epilogue for all involved stmts
9986 : : together. For SLP reduction chains we only get here once. */
9987 : 22519 : if (SLP_INSTANCE_KIND (slp_node_instance) == slp_inst_kind_reduc_group
9988 : 20397 : && slp_index != 0)
9989 : : return true;
9990 : 22211 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
9991 : 22211 : if (VECT_REDUC_INFO_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
9992 : 22211 : || VECT_REDUC_INFO_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
9993 : : return true;
9994 : :
9995 : 21371 : if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
9996 : 21371 : || !LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
9997 : 21367 : vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
9998 : : slp_node_instance,
9999 : : LOOP_VINFO_IV_EXIT (loop_vinfo));
10000 : :
10001 : : /* If early break we only have to materialize the reduction on the merge
10002 : : block, but we have to find an alternate exit first. */
10003 : 21371 : if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
10004 : : {
10005 : 23 : slp_tree phis_node = slp_node_instance->reduc_phis;
10006 : 23 : stmt_info = SLP_TREE_REPRESENTATIVE (phis_node);
10007 : 69 : for (auto exit : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
10008 : 23 : if (exit != LOOP_VINFO_IV_EXIT (loop_vinfo))
10009 : : {
10010 : 23 : vect_create_epilog_for_reduction (loop_vinfo, stmt_info,
10011 : : phis_node, slp_node_instance,
10012 : : exit);
10013 : 23 : break;
10014 : 23 : }
10015 : 23 : if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
10016 : 4 : vect_create_epilog_for_reduction (loop_vinfo, stmt_info,
10017 : : phis_node, slp_node_instance,
10018 : : LOOP_VINFO_IV_EXIT (loop_vinfo));
10019 : : }
10020 : :
10021 : 21371 : return true;
10022 : : }
10023 : :
10024 : : /* If STMT is not relevant and it is a simple assignment and its inputs are
10025 : : invariant then it can remain in place, unvectorized. The original last
10026 : : scalar value that it computes will be used. */
10027 : 221688 : if (!STMT_VINFO_RELEVANT_P (stmt_info))
10028 : : {
10029 : 0 : gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
10030 : 0 : if (dump_enabled_p ())
10031 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
10032 : : "statement is simple and uses invariant. Leaving in "
10033 : : "place.\n");
10034 : 0 : return true;
10035 : : }
10036 : :
10037 : 221688 : gcc_assert (slp_index >= 0);
10038 : :
10039 : : /* Get the last occurrence of the scalar index from the concatenation of
10040 : : all the slp vectors. Calculate which slp vector it is and the index
10041 : : within. */
10042 : 221688 : int num_scalar = SLP_TREE_LANES (slp_node);
10043 : 221688 : int num_vec = vect_get_num_copies (vinfo, slp_node);
10044 : 221688 : poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
10045 : :
10046 : : /* Calculate which vector contains the result, and which lane of
10047 : : that vector we need. */
10048 : 221688 : if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
10049 : : {
10050 : : if (dump_enabled_p ())
10051 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10052 : : "Cannot determine which vector holds the"
10053 : : " final result.\n");
10054 : : return false;
10055 : : }
10056 : :
10057 : 221688 : if (!vec_stmt_p)
10058 : : {
10059 : : /* No transformation required. */
10060 : 179830 : if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
10061 : : {
10062 : 1 : if (SLP_TREE_LANES (slp_node) != 1)
10063 : : {
10064 : 0 : if (dump_enabled_p ())
10065 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10066 : : "can't operate on partial vectors "
10067 : : "because an SLP statement is live after "
10068 : : "the loop.\n");
10069 : 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10070 : : }
10071 : 1 : else if (num_vec > 1)
10072 : : {
10073 : 0 : if (dump_enabled_p ())
10074 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10075 : : "can't operate on partial vectors "
10076 : : "because ncopies is greater than 1.\n");
10077 : 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10078 : : }
10079 : : else
10080 : : {
10081 : 1 : if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
10082 : : OPTIMIZE_FOR_SPEED))
10083 : 0 : vect_record_loop_mask (loop_vinfo,
10084 : : &LOOP_VINFO_MASKS (loop_vinfo),
10085 : : 1, vectype, NULL);
10086 : 1 : else if (can_vec_extract_var_idx_p (
10087 : 1 : TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
10088 : 0 : vect_record_loop_len (loop_vinfo,
10089 : : &LOOP_VINFO_LENS (loop_vinfo),
10090 : : 1, vectype, 1);
10091 : : else
10092 : : {
10093 : 1 : if (dump_enabled_p ())
10094 : 0 : dump_printf_loc (
10095 : 0 : MSG_MISSED_OPTIMIZATION, vect_location,
10096 : : "can't operate on partial vectors "
10097 : : "because the target doesn't support extract "
10098 : : "last reduction.\n");
10099 : 1 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10100 : : }
10101 : : }
10102 : : }
10103 : : /* ??? Enable for loop costing as well. */
10104 : 1 : if (!loop_vinfo)
10105 : 90089 : record_stmt_cost (cost_vec, 1, vec_to_scalar, slp_node,
10106 : : 0, vect_epilogue);
10107 : 179830 : return true;
10108 : : }
10109 : :
10110 : : /* Use the lhs of the original scalar statement. */
10111 : 41858 : gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
10112 : 41858 : if (dump_enabled_p ())
10113 : 1472 : dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
10114 : : "stmt %G", stmt);
10115 : :
10116 : 41858 : lhs = gimple_get_lhs (stmt);
10117 : 41858 : lhs_type = TREE_TYPE (lhs);
10118 : :
10119 : 41858 : bitsize = vector_element_bits_tree (vectype);
10120 : :
10121 : : /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
10122 : 41858 : gcc_assert (!loop_vinfo
10123 : : || ((!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
10124 : : && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10125 : : || SLP_TREE_LANES (slp_node) == 1));
10126 : :
10127 : : /* Get the correct slp vectorized stmt. */
10128 : 41858 : tree vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
10129 : 41858 : gimple *vec_stmt = SSA_NAME_DEF_STMT (vec_lhs);
10130 : :
10131 : : /* In case we need to early break vectorize also get the first stmt. */
10132 : 41858 : tree vec_lhs0 = SLP_TREE_VEC_DEFS (slp_node)[0];
10133 : :
10134 : : /* Get entry to use. */
10135 : 41858 : tree bitstart = bitsize_int (vec_index);
10136 : 41858 : bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
10137 : :
10138 : 41858 : if (loop_vinfo)
10139 : : {
10140 : : /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
10141 : : requirement, insert one phi node for it. It looks like:
10142 : : loop;
10143 : : BB:
10144 : : # lhs' = PHI <lhs>
10145 : : ==>
10146 : : loop;
10147 : : BB:
10148 : : # vec_lhs' = PHI <vec_lhs>
10149 : : new_tree = lane_extract <vec_lhs', ...>;
10150 : : lhs' = new_tree; */
10151 : :
10152 : 5313 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10153 : : /* Check if we have a loop where the chosen exit is not the main exit;
10154 : : in these cases, for an early break, the scalar code restarts the iteration
10155 : : the vector code was executing. For the live values we therefore want the
10156 : : value at the start of the iteration rather than at the end. */
10157 : 5313 : edge main_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
10158 : 5313 : bool all_exits_as_early_p = LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo);
10159 : 22551 : FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10160 : 17238 : if (!is_gimple_debug (use_stmt)
10161 : 17238 : && !flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
10162 : 5258 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10163 : : {
10164 : 5258 : edge e = gimple_phi_arg_edge (as_a <gphi *> (use_stmt),
10165 : 5258 : phi_arg_index_from_use (use_p));
10166 : 5258 : gcc_assert (loop_exit_edge_p (loop, e));
10167 : 5258 : bool main_exit_edge = e == main_e;
10168 : 5258 : tree tmp_vec_lhs = vec_lhs;
10169 : 5258 : tree tmp_bitstart = bitstart;
10170 : :
10171 : : /* For an early exit where the exit is not in the BB that leads
10172 : : to the latch we are restarting the iteration in the
10173 : : scalar loop. So get the first live value. */
10174 : 13211 : bool early_break_first_element_p
10175 : 5258 : = (all_exits_as_early_p || !main_exit_edge)
10176 : 5258 : && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def;
10177 : 2695 : if (early_break_first_element_p)
10178 : : {
10179 : 2695 : tmp_vec_lhs = vec_lhs0;
10180 : 2695 : tmp_bitstart = build_zero_cst (TREE_TYPE (bitstart));
10181 : : }
10182 : :
10183 : 5258 : gimple_stmt_iterator exit_gsi;
10184 : 5258 : tree new_tree
10185 : 5258 : = vectorizable_live_operation_1 (loop_vinfo,
10186 : : e->dest, vectype,
10187 : : slp_node, bitsize,
10188 : : tmp_bitstart, tmp_vec_lhs,
10189 : : lhs_type, &exit_gsi);
10190 : :
10191 : 5258 : auto gsi = gsi_for_stmt (use_stmt);
10192 : 5258 : if (early_break_first_element_p
10193 : 2695 : && LOOP_VINFO_MASK_NITERS_PFA_OFFSET (loop_vinfo))
10194 : : {
10195 : 0 : tree step_expr
10196 : : = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
10197 : 0 : tree break_lhs_phi
10198 : : = LOOP_VINFO_MASK_NITERS_PFA_OFFSET (loop_vinfo);
10199 : 0 : tree ty_skip_niters = TREE_TYPE (break_lhs_phi);
10200 : 0 : gimple_seq iv_stmts = NULL;
10201 : :
10202 : : /* Now create the PHI for the outside loop usage to
10203 : : retrieve the value for the offset counter. */
10204 : 0 : tree rphi_step
10205 : 0 : = gimple_convert (&iv_stmts, ty_skip_niters, step_expr);
10206 : 0 : tree tmp2
10207 : 0 : = gimple_build (&iv_stmts, MULT_EXPR,
10208 : : ty_skip_niters, rphi_step,
10209 : : break_lhs_phi);
10210 : :
10211 : 0 : if (POINTER_TYPE_P (TREE_TYPE (new_tree)))
10212 : : {
10213 : 0 : tmp2 = gimple_convert (&iv_stmts, sizetype, tmp2);
10214 : 0 : tmp2 = gimple_build (&iv_stmts, POINTER_PLUS_EXPR,
10215 : 0 : TREE_TYPE (new_tree), new_tree,
10216 : : tmp2);
10217 : : }
10218 : : else
10219 : : {
10220 : 0 : tmp2 = gimple_convert (&iv_stmts, TREE_TYPE (new_tree),
10221 : : tmp2);
10222 : 0 : tmp2 = gimple_build (&iv_stmts, PLUS_EXPR,
10223 : 0 : TREE_TYPE (new_tree), new_tree,
10224 : : tmp2);
10225 : : }
10226 : :
10227 : 0 : new_tree = tmp2;
10228 : 0 : gsi_insert_seq_before (&exit_gsi, iv_stmts, GSI_SAME_STMT);
10229 : : }
10230 : :
10231 : 5258 : tree lhs_phi = gimple_phi_result (use_stmt);
10232 : 5258 : remove_phi_node (&gsi, false);
10233 : 5258 : gimple *copy = gimple_build_assign (lhs_phi, new_tree);
10234 : 5258 : gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
10235 : 5258 : break;
10236 : 5313 : }
10237 : :
10238 : : /* There are no further out-of-loop uses of lhs by LC-SSA construction. */
10239 : 17293 : FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10240 : 11980 : gcc_assert (is_gimple_debug (use_stmt)
10241 : 5313 : || flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)));
10242 : : }
10243 : : else
10244 : : {
10245 : : /* For basic-block vectorization simply insert the lane-extraction. */
10246 : 36545 : tree bftype = TREE_TYPE (vectype);
10247 : 36545 : if (VECTOR_BOOLEAN_TYPE_P (vectype))
10248 : 0 : bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10249 : 36545 : tree new_tree = build3 (BIT_FIELD_REF, bftype,
10250 : : vec_lhs, bitsize, bitstart);
10251 : 36545 : gimple_seq stmts = NULL;
10252 : 36545 : new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10253 : : &stmts, true, NULL_TREE);
10254 : 36545 : if (TREE_CODE (new_tree) == SSA_NAME
10255 : 73090 : && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
10256 : 2 : SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
10257 : 36545 : if (is_a <gphi *> (vec_stmt))
10258 : : {
10259 : 2896 : gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
10260 : 2896 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10261 : : }
10262 : : else
10263 : : {
10264 : 33649 : gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
10265 : 33649 : gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
10266 : : }
10267 : :
10268 : : /* Replace use of lhs with newly computed result. If the use stmt is a
10269 : : single arg PHI, just replace all uses of PHI result. It's necessary
10270 : : because the lcssa PHI defining lhs may come before the newly inserted stmt. */
10271 : 36545 : use_operand_p use_p;
10272 : 36545 : stmt_vec_info use_stmt_info;
10273 : 198618 : FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10274 : 162073 : if (!is_gimple_debug (use_stmt)
10275 : 162073 : && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
10276 : 109707 : || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
10277 : : {
10278 : : /* ??? This can happen when the live lane ends up being
10279 : : rooted in a vector construction code-generated by an
10280 : : external SLP node (and code-generation for that already
10281 : : happened). See gcc.dg/vect/bb-slp-47.c.
10282 : : Doing this is what would happen if that vector CTOR
10283 : : were not code-generated yet so it is not too bad.
10284 : : ??? In fact we'd likely want to avoid this situation
10285 : : in the first place. */
10286 : 63596 : if (TREE_CODE (new_tree) == SSA_NAME
10287 : 63332 : && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10288 : 63332 : && gimple_code (use_stmt) != GIMPLE_PHI
10289 : 119008 : && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
10290 : : use_stmt))
10291 : : {
10292 : 264 : if (dump_enabled_p ())
10293 : 20 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10294 : : "Using original scalar computation for "
10295 : : "live lane because use preceeds vector "
10296 : : "def\n");
10297 : 264 : continue;
10298 : : }
10299 : : /* ??? It can also happen that we end up pulling a def into
10300 : : a loop where replacing out-of-loop uses would require
10301 : : a new LC SSA PHI node. Retain the original scalar in
10302 : : those cases as well. PR98064. */
10303 : 64570 : if (TREE_CODE (new_tree) == SSA_NAME
10304 : 63068 : && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10305 : 63068 : && (gimple_bb (use_stmt)->loop_father
10306 : 63068 : != gimple_bb (vec_stmt)->loop_father)
10307 : 69974 : && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
10308 : 6906 : gimple_bb (use_stmt)->loop_father))
10309 : : {
10310 : 1502 : if (dump_enabled_p ())
10311 : 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10312 : : "Using original scalar computation for "
10313 : : "live lane because there is an out-of-loop "
10314 : : "definition for it\n");
10315 : 1502 : continue;
10316 : : }
10317 : 186604 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10318 : 62519 : SET_USE (use_p, new_tree);
10319 : 61566 : update_stmt (use_stmt);
10320 : 36545 : }
10321 : : }
10322 : :
10323 : : return true;
10324 : 274660 : }
10325 : :
10326 : : /* Given loop represented by LOOP_VINFO, return true if computation of
10327 : : LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
10328 : : otherwise. */
10329 : :
10330 : : static bool
10331 : 59211 : loop_niters_no_overflow (loop_vec_info loop_vinfo)
10332 : : {
10333 : : /* Constant case. */
10334 : 59211 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
10335 : : {
10336 : 34375 : tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
10337 : 34375 : tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
10338 : :
10339 : 34375 : gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
10340 : 34375 : gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
10341 : 34375 : if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
10342 : : return true;
10343 : : }
10344 : :
10345 : 24836 : widest_int max;
10346 : 24836 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10347 : : /* Check the upper bound of loop niters. */
10348 : 24836 : if (get_max_loop_iterations (loop, &max))
10349 : : {
10350 : 24836 : tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
10351 : 24836 : signop sgn = TYPE_SIGN (type);
10352 : 24836 : widest_int type_max = widest_int::from (wi::max_value (type), sgn);
10353 : 24836 : if (max < type_max)
10354 : 24625 : return true;
10355 : 24836 : }
10356 : : return false;
10357 : 24836 : }
10358 : :
10359 : : /* Return a mask type with half the number of elements as OLD_TYPE,
10360 : : given that it should have mode NEW_MODE. */
10361 : :
10362 : : tree
10363 : 3872 : vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
10364 : : {
10365 : 3872 : poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
10366 : 3872 : return build_truth_vector_type_for_mode (nunits, new_mode);
10367 : : }
10368 : :
10369 : : /* Return a mask type with twice as many elements as OLD_TYPE,
10370 : : given that it should have mode NEW_MODE. */
10371 : :
10372 : : tree
10373 : 5627 : vect_double_mask_nunits (tree old_type, machine_mode new_mode)
10374 : : {
10375 : 5627 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
10376 : 5627 : return build_truth_vector_type_for_mode (nunits, new_mode);
10377 : : }
10378 : :
10379 : : /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
10380 : : contain a sequence of NVECTORS masks that each control a vector of type
10381 : : VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
10382 : : these vector masks with the vector version of SCALAR_MASK. */
10383 : :
10384 : : void
10385 : 128 : vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
10386 : : unsigned int nvectors, tree vectype, tree scalar_mask)
10387 : : {
10388 : 128 : gcc_assert (nvectors != 0);
10389 : :
10390 : 128 : if (scalar_mask)
10391 : : {
10392 : 24 : scalar_cond_masked_key cond (scalar_mask, nvectors);
10393 : 24 : loop_vinfo->scalar_cond_masked_set.add (cond);
10394 : : }
10395 : :
10396 : 128 : masks->mask_set.add (std::make_pair (vectype, nvectors));
10397 : 128 : }
10398 : :
10399 : : /* Given a complete set of masks MASKS, extract mask number INDEX
10400 : : for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10401 : : where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
10402 : :
10403 : : See the comment above vec_loop_masks for more details about the mask
10404 : : arrangement. */
10405 : :
10406 : : tree
10407 : 166 : vect_get_loop_mask (loop_vec_info loop_vinfo,
10408 : : gimple_stmt_iterator *gsi, vec_loop_masks *masks,
10409 : : unsigned int nvectors, tree vectype, unsigned int index)
10410 : : {
10411 : 166 : if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
10412 : : == vect_partial_vectors_while_ult)
10413 : : {
10414 : 0 : rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
10415 : 0 : tree mask_type = rgm->type;
10416 : :
10417 : : /* Populate the rgroup's mask array, if this is the first time we've
10418 : : used it. */
10419 : 0 : if (rgm->controls.is_empty ())
10420 : : {
10421 : 0 : rgm->controls.safe_grow_cleared (nvectors, true);
10422 : 0 : for (unsigned int i = 0; i < nvectors; ++i)
10423 : : {
10424 : 0 : tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
10425 : : /* Provide a dummy definition until the real one is available. */
10426 : 0 : SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
10427 : 0 : rgm->controls[i] = mask;
10428 : : }
10429 : : }
10430 : :
10431 : 0 : tree mask = rgm->controls[index];
10432 : 0 : if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
10433 : 0 : TYPE_VECTOR_SUBPARTS (vectype)))
10434 : : {
10435 : : /* A loop mask for data type X can be reused for data type Y
10436 : : if X has N times more elements than Y and if Y's elements
10437 : : are N times bigger than X's. In this case each sequence
10438 : : of N elements in the loop mask will be all-zero or all-one.
10439 : : We can then view-convert the mask so that each sequence of
10440 : : N elements is replaced by a single element. */
10441 : 0 : gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
10442 : : TYPE_VECTOR_SUBPARTS (vectype)));
10443 : 0 : gimple_seq seq = NULL;
10444 : 0 : mask_type = truth_type_for (vectype);
10445 : 0 : mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
10446 : 0 : if (seq)
10447 : 0 : gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10448 : : }
10449 : 0 : return mask;
10450 : : }
10451 : 166 : else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
10452 : : == vect_partial_vectors_avx512)
10453 : : {
10454 : : /* The number of scalars per iteration and the number of vectors are
10455 : : both compile-time constants. */
10456 : 166 : unsigned int nscalars_per_iter
10457 : 166 : = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10458 : 166 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10459 : :
10460 : 166 : rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];
10461 : :
10462 : : /* The stored nV is dependent on the mask type produced. */
10463 : 166 : gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10464 : : TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
10465 : : == rgm->factor);
10466 : 166 : nvectors = rgm->factor;
10467 : :
10468 : : /* Populate the rgroup's mask array, if this is the first time we've
10469 : : used it. */
10470 : 166 : if (rgm->controls.is_empty ())
10471 : : {
10472 : 16 : rgm->controls.safe_grow_cleared (nvectors, true);
10473 : 98 : for (unsigned int i = 0; i < nvectors; ++i)
10474 : : {
10475 : 82 : tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
10476 : : /* Provide a dummy definition until the real one is available. */
10477 : 82 : SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
10478 : 82 : rgm->controls[i] = mask;
10479 : : }
10480 : : }
10481 : 166 : if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
10482 : : TYPE_VECTOR_SUBPARTS (vectype)))
10483 : 150 : return rgm->controls[index];
10484 : :
10485 : : /* Split the vector if needed. Since we are dealing with integer mode
10486 : : masks with AVX512 we can operate on the integer representation,
10487 : : shifting the whole vector to get at the required part. */
10488 : 16 : unsigned HOST_WIDE_INT factor;
10489 : 16 : bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
10490 : 16 : TYPE_VECTOR_SUBPARTS (vectype), &factor);
10491 : 0 : gcc_assert (ok);
10492 : 16 : gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
10493 : 16 : tree mask_type = truth_type_for (vectype);
10494 : 16 : gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
10495 : 16 : unsigned vi = index / factor;
10496 : 16 : unsigned vpart = index % factor;
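 : : /* Control vector VI packs FACTOR smaller masks; VPART selects which one, extracted below by shifting its bits into place. */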
10497 : 16 : tree vec = rgm->controls[vi];
10498 : 16 : gimple_seq seq = NULL;
10499 : 16 : vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
10500 : 16 : lang_hooks.types.type_for_mode
10501 : 16 : (TYPE_MODE (rgm->type), 1), vec);
10502 : : /* For integer mode masks simply shift the right bits into position. */
10503 : 16 : if (vpart != 0)
10504 : 12 : vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
10505 : : build_int_cst (integer_type_node,
10506 : 24 : (TYPE_VECTOR_SUBPARTS (vectype)
10507 : 12 : * vpart)));
10508 : 16 : vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
10509 : 16 : (TYPE_MODE (mask_type), 1), vec);
10510 : 16 : vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
10511 : 16 : if (seq)
10512 : 16 : gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10513 : 16 : return vec;
10514 : : }
10515 : : else
10516 : 0 : gcc_unreachable ();
10517 : : }
10518 : :
10519 : : /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
10520 : : lengths for controlling an operation on VECTYPE. The operation splits
10521 : : each element of VECTYPE into FACTOR separate subelements, measuring the
10522 : : length as a number of these subelements. */
10523 : :
10524 : : void
10525 : 0 : vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
10526 : : unsigned int nvectors, tree vectype, unsigned int factor)
10527 : : {
10528 : 0 : gcc_assert (nvectors != 0);
10529 : 0 : if (lens->length () < nvectors)
10530 : 0 : lens->safe_grow_cleared (nvectors, true);
10531 : 0 : rgroup_controls *rgl = &(*lens)[nvectors - 1];
10532 : :
10533 : : /* The number of scalars per iteration, scalar occupied bytes and
10534 : : the number of vectors are all compile-time constants. */
10535 : 0 : unsigned int nscalars_per_iter
10536 : 0 : = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10537 : 0 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10538 : :
10539 : 0 : if (rgl->max_nscalars_per_iter < nscalars_per_iter)
10540 : : {
10541 : : /* For now, we only support cases in which all loads and stores fall back
10542 : : to VnQI or none do. */
10543 : 0 : gcc_assert (!rgl->max_nscalars_per_iter
10544 : : || (rgl->factor == 1 && factor == 1)
10545 : : || (rgl->max_nscalars_per_iter * rgl->factor
10546 : : == nscalars_per_iter * factor));
10547 : 0 : rgl->max_nscalars_per_iter = nscalars_per_iter;
10548 : 0 : rgl->type = vectype;
10549 : 0 : rgl->factor = factor;
10550 : : }
10551 : 0 : }
10552 : :
10553 : : /* Given a complete set of lengths LENS, extract length number INDEX
10554 : : for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10555 : : where 0 <= INDEX < NVECTORS. Return a value that contains FACTOR
10556 : : multiplied by the number of elements that should be processed.
10557 : : Insert any set-up statements before GSI. */
10558 : :
10559 : : tree
10560 : 0 : vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
10561 : : vec_loop_lens *lens, unsigned int nvectors, tree vectype,
10562 : : unsigned int index, unsigned int factor)
10563 : : {
10564 : 0 : rgroup_controls *rgl = &(*lens)[nvectors - 1];
10565 : 0 : bool use_bias_adjusted_len =
10566 : 0 : LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
10567 : :
10568 : : /* Populate the rgroup's len array, if this is the first time we've
10569 : : used it. */
10570 : 0 : if (rgl->controls.is_empty ())
10571 : : {
10572 : 0 : rgl->controls.safe_grow_cleared (nvectors, true);
10573 : 0 : for (unsigned int i = 0; i < nvectors; ++i)
10574 : : {
10575 : 0 : tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
10576 : 0 : gcc_assert (len_type != NULL_TREE);
10577 : :
10578 : 0 : tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
10579 : :
10580 : : /* Provide a dummy definition until the real one is available. */
10581 : 0 : SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
10582 : 0 : rgl->controls[i] = len;
10583 : :
10584 : 0 : if (use_bias_adjusted_len)
10585 : : {
10586 : 0 : gcc_assert (i == 0);
10587 : 0 : tree adjusted_len =
10588 : 0 : make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
10589 : 0 : SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
10590 : 0 : rgl->bias_adjusted_ctrl = adjusted_len;
10591 : : }
10592 : : }
10593 : : }
10594 : :
10595 : 0 : if (use_bias_adjusted_len)
10596 : 0 : return rgl->bias_adjusted_ctrl;
10597 : :
10598 : 0 : tree loop_len = rgl->controls[index];
10599 : 0 : if (rgl->factor == 1 && factor == 1)
10600 : : {
10601 : 0 : poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
10602 : 0 : poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
10603 : 0 : if (maybe_ne (nunits1, nunits2))
10604 : : {
10605 : : /* A loop len for data type X can be reused for data type Y
10606 : : if X has N times more elements than Y and if Y's elements
10607 : : are N times bigger than X's. */
10608 : 0 : gcc_assert (multiple_p (nunits1, nunits2));
10609 : 0 : factor = exact_div (nunits1, nunits2).to_constant ();
10610 : 0 : tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
10611 : 0 : gimple_seq seq = NULL;
10612 : 0 : loop_len = gimple_build (&seq, EXACT_DIV_EXPR, iv_type, loop_len,
10613 : 0 : build_int_cst (iv_type, factor));
10614 : 0 : if (seq)
10615 : 0 : gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10616 : : }
10617 : : }
10618 : : return loop_len;
10619 : : }
10620 : :
10621 : : /* Generate the tree for the loop len mask and return it. Given LENS,
10622 : : NVECTORS, VECTYPE, INDEX and FACTOR, generate the len mask as below:
10623 : :
10624 : : tree len_mask = VCOND_MASK_LEN (compare_mask, ones, zero, len, bias)
10625 : : */
10626 : : tree
10627 : 0 : vect_gen_loop_len_mask (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
10628 : : gimple_stmt_iterator *cond_gsi, vec_loop_lens *lens,
10629 : : unsigned int nvectors, tree vectype, tree stmt,
10630 : : unsigned int index, unsigned int factor)
10631 : : {
10632 : 0 : tree all_one_mask = build_all_ones_cst (vectype);
10633 : 0 : tree all_zero_mask = build_zero_cst (vectype);
10634 : 0 : tree len = vect_get_loop_len (loop_vinfo, gsi, lens, nvectors, vectype, index,
10635 : : factor);
10636 : 0 : tree bias = build_int_cst (intQI_type_node,
10637 : 0 : LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo));
10638 : 0 : tree len_mask = make_temp_ssa_name (TREE_TYPE (stmt), NULL, "vec_len_mask");
10639 : 0 : gcall *call = gimple_build_call_internal (IFN_VCOND_MASK_LEN, 5, stmt,
10640 : : all_one_mask, all_zero_mask, len,
10641 : : bias);
10642 : 0 : gimple_call_set_lhs (call, len_mask);
10643 : 0 : gsi_insert_before (cond_gsi, call, GSI_SAME_STMT);
10644 : :
10645 : 0 : return len_mask;
10646 : : }
10647 : :
10648 : : /* Scale profiling counters by estimation for LOOP which is vectorized
10649 : : by factor VF.
10650 : : If FLAT is true, the loop we started with had unrealistically flat
10651 : : profile. */
10652 : :
10653 : : static void
10654 : 59211 : scale_profile_for_vect_loop (class loop *loop, edge exit_e, unsigned vf, bool flat)
10655 : : {
10656 : : /* For flat profiles do not scale down proportionally by VF and only
10657 : : cap by known iteration count bounds. */
10658 : 59211 : if (flat)
10659 : : {
10660 : 33618 : if (dump_file && (dump_flags & TDF_DETAILS))
10661 : 4958 : fprintf (dump_file,
10662 : : "Vectorized loop profile seems flat; not scaling iteration "
10663 : : "count down by the vectorization factor %i\n", vf);
10664 : 33618 : scale_loop_profile (loop, profile_probability::always (),
10665 : : get_likely_max_loop_iterations_int (loop));
10666 : 33618 : return;
10667 : : }
10668 : : /* The loop body executes VF times fewer iterations and the exit edge is taken VF times more often. */
10669 : 25593 : profile_count entry_count = loop_preheader_edge (loop)->count ();
10670 : :
10671 : : /* If we have unreliable loop profile avoid dropping entry
10672 : : count below the header count. This can happen since loops
10673 : : have unrealistically low trip counts. */
10674 : 25593 : while (vf > 1
10675 : 26809 : && loop->header->count > entry_count
10676 : 54462 : && loop->header->count < entry_count * vf)
10677 : : {
10678 : 2060 : if (dump_file && (dump_flags & TDF_DETAILS))
10679 : 149 : fprintf (dump_file,
10680 : : "Vectorization factor %i seems too large for profile "
10681 : : "prevoiusly believed to be consistent; reducing.\n", vf);
10682 : 2060 : vf /= 2;
10683 : : }
10684 : :
10685 : 25593 : if (entry_count.nonzero_p ())
10686 : 25593 : set_edge_probability_and_rescale_others
10687 : 25593 : (exit_e,
10688 : 25593 : entry_count.probability_in (loop->header->count / vf));
10689 : : /* Avoid producing a very large exit probability when we do not have
10690 : : a sensible profile. */
10691 : 0 : else if (exit_e->probability < profile_probability::always () / (vf * 2))
10692 : 0 : set_edge_probability_and_rescale_others (exit_e, exit_e->probability * vf);
10693 : 25593 : loop->latch->count = single_pred_edge (loop->latch)->count ();
10694 : :
10695 : 25593 : scale_loop_profile (loop, profile_probability::always () / vf,
10696 : : get_likely_max_loop_iterations_int (loop));
10697 : : }
10698 : :
10699 : : /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
10700 : : original loop that has now been vectorized.
10701 : :
10702 : : The inits of the data_references need to be advanced with the number of
10703 : : iterations of the main loop. This has been computed in vect_do_peeling and
10704 : : is stored in parameter ADVANCE.
10705 : :
10706 : : Since the loop_vec_info of this EPILOGUE was constructed for the original
10707 : : loop, its stmt_vec_infos all point to the original statements. These need
10708 : : to be updated to point to their corresponding copies.
10709 : :
10710 : : The data_reference's connections also need to be updated. Their
10711 : : corresponding dr_vec_info need to be reconnected to the EPILOGUE's
10712 : : stmt_vec_infos, their statements need to point to their corresponding
10713 : : copy. */
10714 : :
10715 : : static void
10716 : 6766 : update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
10717 : : {
10718 : 6766 : loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
10719 : 6766 : hash_map<tree,tree> mapping;
10720 : 6766 : gimple *orig_stmt, *new_stmt;
10721 : 6766 : gimple_stmt_iterator epilogue_gsi;
10722 : 6766 : gphi_iterator epilogue_phi_gsi;
10723 : 6766 : stmt_vec_info stmt_vinfo = NULL, related_vinfo;
10724 : 6766 : basic_block *epilogue_bbs = get_loop_body (epilogue);
10725 : 6766 : unsigned i;
10726 : :
10727 : 6766 : free (LOOP_VINFO_BBS (epilogue_vinfo));
10728 : 6766 : LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
10729 : 6766 : LOOP_VINFO_NBBS (epilogue_vinfo) = epilogue->num_nodes;
10730 : :
10731 : : /* The EPILOGUE loop is a copy of the original loop so they share the same
10732 : : gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
10733 : : point to the copied statements. */
10734 : 20298 : for (unsigned i = 0; i < epilogue->num_nodes; ++i)
10735 : : {
10736 : 13532 : for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
10737 : 34941 : !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
10738 : : {
10739 : 21409 : new_stmt = epilogue_phi_gsi.phi ();
10740 : :
10741 : 21409 : gcc_assert (gimple_uid (new_stmt) > 0);
10742 : 21409 : stmt_vinfo
10743 : 21409 : = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
10744 : :
10745 : 21409 : STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
10746 : : }
10747 : :
10748 : 27064 : for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
10749 : 135211 : !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
10750 : : {
10751 : 121679 : new_stmt = gsi_stmt (epilogue_gsi);
10752 : 121679 : if (is_gimple_debug (new_stmt))
10753 : 21812 : continue;
10754 : :
10755 : 99867 : gcc_assert (gimple_uid (new_stmt) > 0);
10756 : 99867 : stmt_vinfo
10757 : 99867 : = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
10758 : :
10759 : 99867 : STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
10760 : :
10761 : 99867 : related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
10762 : 99867 : if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
10763 : : {
10764 : 1826 : gimple *stmt = STMT_VINFO_STMT (related_vinfo);
10765 : : /* Set BB such that the assert in
10766 : : 'get_initial_defs_for_reduction' is able to determine that
10767 : : the BB of the related stmt is inside this loop. */
10768 : 1826 : gimple_set_bb (stmt,
10769 : : gimple_bb (new_stmt));
10770 : 1826 : related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
10771 : 1826 : gcc_assert (related_vinfo == NULL
10772 : : || related_vinfo == stmt_vinfo);
10773 : : }
10774 : : }
10775 : : }
10776 : :
10777 : 6766 : struct data_reference *dr;
10778 : 6766 : vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
10779 : 28901 : FOR_EACH_VEC_ELT (datarefs, i, dr)
10780 : : {
10781 : 22135 : orig_stmt = DR_STMT (dr);
10782 : 22135 : gcc_assert (gimple_uid (orig_stmt) > 0);
10783 : 22135 : stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
10784 : 22135 : DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
10785 : : }
10786 : :
10787 : :   /* Advance the data_references by the number of iterations of the previous
10788 : :      loop and its prologue.  */
10789 : 6766 : vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
10790 : :
10791 : : /* Remember the advancement made. */
10792 : 6766 : LOOP_VINFO_DRS_ADVANCED_BY (epilogue_vinfo) = advance;
10793 : 6766 : }
10794 : :
10795 : : /* When vectorizing early break statements, instructions that happen before
10796 : :    the early break in the current BB need to be moved to after the early
10797 : :    break.  This function deals with that and assumes that any validity
10798 : :    checks have already been performed.
10799 : :
10800 : :    While moving the instructions, any VUSEs encountered are corrected as the
10801 : :    statements are moved along.  The statements are inserted at the start of
10802 : :    LOOP_VINFO_EARLY_BRK_DEST_BB (after any labels).  */
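 : :
 : : /* Illustrative sketch (hypothetical GIMPLE, not from a real dump): before
 : :    the transform a store recorded in LOOP_VINFO_EARLY_BRK_STORES precedes
 : :    the early break,
 : :
 : :      a[i_1] = t_2;
 : :      if (c_3 != 0) goto early_exit; else goto next_bb;
 : :
 : :    afterwards the store has been sunk to the destination block, and the
 : :    loads recorded in LOOP_VINFO_EARLY_BRK_VUSES are rewired to the VUSE of
 : :    the last moved store, i.e. the memory state from before those stores:
 : :
 : :      if (c_3 != 0) goto early_exit; else goto next_bb;
 : :      ...
 : :      a[i_1] = t_2;   // now at the head of LOOP_VINFO_EARLY_BRK_DEST_BB  */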
10803 : :
10804 : : static void
10805 : 1418 : move_early_exit_stmts (loop_vec_info loop_vinfo)
10806 : : {
10807 : 1418 : DUMP_VECT_SCOPE ("move_early_exit_stmts");
10808 : :
10809 : 1418 : if (LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).is_empty ())
10810 : 1214 : return;
10811 : :
10812 : : /* Move all stmts that need moving. */
10813 : 204 : basic_block dest_bb = LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo);
10814 : 204 : gimple_stmt_iterator dest_gsi = gsi_after_labels (dest_bb);
10815 : :
10816 : 204 : tree last_seen_vuse = NULL_TREE;
10817 : 503 : for (gimple *stmt : LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo))
10818 : : {
10819 : : /* We have to update crossed degenerate virtual PHIs. Simply
10820 : : elide them. */
10821 : 299 : if (gphi *vphi = dyn_cast <gphi *> (stmt))
10822 : : {
10823 : 7 : tree vdef = gimple_phi_result (vphi);
10824 : 7 : tree vuse = gimple_phi_arg_def (vphi, 0);
10825 : 7 : imm_use_iterator iter;
10826 : 7 : use_operand_p use_p;
10827 : 7 : gimple *use_stmt;
10828 : 23 : FOR_EACH_IMM_USE_STMT (use_stmt, iter, vdef)
10829 : : {
10830 : 48 : FOR_EACH_IMM_USE_ON_STMT (use_p, iter)
10831 : 16 : SET_USE (use_p, vuse);
10832 : 7 : }
10833 : 7 : auto gsi = gsi_for_stmt (stmt);
10834 : 7 : remove_phi_node (&gsi, true);
10835 : 7 : last_seen_vuse = vuse;
10836 : 7 : continue;
10837 : 7 : }
10838 : :
10839 : :       /* Check whether the statement is still required for vectorization or
10840 : :          has been elided.  */
10841 : 292 : auto stmt_info = loop_vinfo->lookup_stmt (stmt);
10842 : 292 : if (!stmt_info)
10843 : 0 : continue;
10844 : :
10845 : 292 : if (dump_enabled_p ())
10846 : 147 : dump_printf_loc (MSG_NOTE, vect_location, "moving stmt %G", stmt);
10847 : :
10848 : 292 : gimple_stmt_iterator stmt_gsi = gsi_for_stmt (stmt);
10849 : 292 : gsi_move_before (&stmt_gsi, &dest_gsi, GSI_NEW_STMT);
10850 : 584 : last_seen_vuse = gimple_vuse (stmt);
10851 : : }
10852 : :
10853 : : /* Update all the stmts with their new reaching VUSES. */
10854 : 628 : for (auto p : LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo))
10855 : : {
10856 : 178 : if (dump_enabled_p ())
10857 : 142 : dump_printf_loc (MSG_NOTE, vect_location,
10858 : : "updating vuse to %T for load %G",
10859 : : last_seen_vuse, p);
10860 : 178 : gimple_set_vuse (p, last_seen_vuse);
10861 : 178 : update_stmt (p);
10862 : : }
10863 : :
10864 : : /* And update the LC PHIs on exits. */
10865 : 1026 : for (edge e : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
10866 : 414 : if (!dominated_by_p (CDI_DOMINATORS, e->src, dest_bb))
10867 : 218 : if (gphi *phi = get_virtual_phi (e->dest))
10868 : 422 : SET_PHI_ARG_DEF_ON_EDGE (phi, e, last_seen_vuse);
10869 : : }
10870 : :
10871 : : /* Function vect_transform_loop.
10872 : :
10873 : : The analysis phase has determined that the loop is vectorizable.
10874 : : Vectorize the loop - created vectorized stmts to replace the scalar
10875 : : stmts in the loop, and update the loop exit condition.
10876 : : Returns scalar epilogue loop if any. */
10877 : :
10878 : : class loop *
10879 : 59211 : vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
10880 : : {
10881 : 59211 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10882 : 59211 : class loop *epilogue = NULL;
10883 : 59211 : basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
10884 : 59211 : int nbbs = loop->num_nodes;
10885 : 59211 : int i;
10886 : 59211 : tree niters_vector = NULL_TREE;
10887 : 59211 : tree step_vector = NULL_TREE;
10888 : 59211 : tree niters_vector_mult_vf = NULL_TREE;
10889 : 59211 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
10890 : 59211 : unsigned int lowest_vf = constant_lower_bound (vf);
10891 : 59211 : gimple *stmt;
10892 : 59211 : bool check_profitability = false;
10893 : 59211 : unsigned int th;
10894 : 59211 : bool flat = maybe_flat_loop_profile (loop);
10895 : :
10896 : 59211 : DUMP_VECT_SCOPE ("vec_transform_loop");
10897 : :
10898 : 59211 : if (! LOOP_VINFO_EPILOGUE_P (loop_vinfo))
10899 : 52445 : loop_vinfo->shared->check_datarefs ();
10900 : :
10901 : :   /* Use the more conservative vectorization threshold.  If the number
10902 : :      of iterations is constant, assume the cost check has been performed
10903 : :      by our caller.  If the threshold makes all loops profitable that
10904 : :      run at least the (estimated) vectorization factor number of times,
10905 : :      checking is pointless, too.  */
10906 : 59211 : th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
10907 : 59211 : if (vect_apply_runtime_profitability_check_p (loop_vinfo))
10908 : : {
10909 : 18027 : if (dump_enabled_p ())
10910 : 162 : dump_printf_loc (MSG_NOTE, vect_location,
10911 : : "Profitability threshold is %d loop iterations.\n",
10912 : : th);
10913 : : check_profitability = true;
10914 : : }
10915 : :
10916 : : /* Make sure there exists a single-predecessor exit bb. Do this before
10917 : : versioning. */
10918 : 59211 : edge e = LOOP_VINFO_IV_EXIT (loop_vinfo);
10919 : 59211 : if (! single_pred_p (e->dest) && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
10920 : : {
10921 : 13028 : split_loop_exit_edge (e, true);
10922 : 13028 : if (dump_enabled_p ())
10923 : 1957 : dump_printf (MSG_NOTE, "split exit edge\n");
10924 : : }
10925 : :
10926 : : /* Version the loop first, if required, so the profitability check
10927 : : comes first. */
10928 : :
10929 : 59211 : if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
10930 : : {
10931 : 3685 : class loop *sloop
10932 : 3685 : = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
10933 : 3685 : sloop->force_vectorize = false;
10934 : 3685 : check_profitability = false;
10935 : : }
10936 : :
10937 : :   /* Make sure there exists a single-predecessor exit bb also on the
10938 : :      scalar loop copy.  Do this after versioning but before peeling
10939 : :      so the CFG structure is fine for both the scalar and the if-converted
10940 : :      loop, and so slpeel_duplicate_current_defs_from_edges faces matched
10941 : :      loop-closed PHI nodes on the exit.  */
10942 : 59211 : if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
10943 : : {
10944 : 7299 : e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
10945 : 7299 : if (! single_pred_p (e->dest))
10946 : : {
10947 : 7048 : split_loop_exit_edge (e, true);
10948 : 7048 : if (dump_enabled_p ())
10949 : 1107 : dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
10950 : : }
10951 : : }
10952 : :
10953 : 59211 : tree niters = vect_build_loop_niters (loop_vinfo);
10954 : 59211 : LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
10955 : 59211 : tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
10956 : 59211 : bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
10957 : 59211 : tree advance;
10958 : 59211 : drs_init_vec orig_drs_init;
10959 : :
10960 : 59211 : epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
10961 : : &step_vector, &niters_vector_mult_vf, th,
10962 : : check_profitability, niters_no_overflow,
10963 : : &advance);
10964 : 59211 : if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
10965 : 59211 : && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
10966 : : {
10967 : :       /* Ifcvt duplicates the loop preheader and loop body and produces a
10968 : :          basic block after the loop exit.  We need to scale all of that.  */
10969 : 91 : basic_block preheader
10970 : 91 : = loop_preheader_edge (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->src;
10971 : 91 : preheader->count
10972 : : = preheader->count.apply_probability
10973 : 91 : (LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
10974 : 91 : scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
10975 : : LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
10976 : 91 : LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo)->dest->count = preheader->count;
10977 : : }
10978 : :
10979 : 59211 : if (niters_vector == NULL_TREE)
10980 : : {
10981 : 25931 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
10982 : 25931 : && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
10983 : 52592 : && known_eq (lowest_vf, vf))
10984 : : {
10985 : 25928 : niters_vector
10986 : 25928 : = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
10987 : 25928 : LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
10988 : 25928 : step_vector = build_one_cst (TREE_TYPE (niters));
10989 : : }
10990 : 736 : else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
10991 : 1 : vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
10992 : : &step_vector, niters_no_overflow);
10993 : : else
10994 : : /* vect_do_peeling subtracted the number of peeled prologue
10995 : : iterations from LOOP_VINFO_NITERS. */
10996 : 735 : vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
10997 : : &niters_vector, &step_vector,
10998 : : niters_no_overflow);
10999 : : }
11000 : :
11001 : : /* 1) Make sure the loop header has exactly two entries
11002 : : 2) Make sure we have a preheader basic block. */
11003 : :
11004 : 59211 : gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
11005 : :
11006 : 59211 : split_edge (loop_preheader_edge (loop));
11007 : :
11008 : 59211 : if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11009 : : /* This will deal with any possible peeling. */
11010 : 1 : vect_prepare_for_masked_peels (loop_vinfo);
11011 : :
11012 : :   /* Handle any code motion that we need for early-break vectorization after
11013 : :      we've done peeling but just before we start vectorizing.  */
11014 : 59211 : if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11015 : 1418 : move_early_exit_stmts (loop_vinfo);
11016 : :
11017 : : /* Remove existing clobber stmts and prefetches. */
11018 : 180781 : for (i = 0; i < nbbs; i++)
11019 : : {
11020 : 121570 : basic_block bb = bbs[i];
11021 : 1049089 : for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);)
11022 : : {
11023 : 805949 : stmt = gsi_stmt (si);
11024 : 805949 : if (gimple_clobber_p (stmt)
11025 : 805949 : || gimple_call_builtin_p (stmt, BUILT_IN_PREFETCH))
11026 : : {
11027 : 90 : unlink_stmt_vdef (stmt);
11028 : 90 : gsi_remove (&si, true);
11029 : 90 : release_defs (stmt);
11030 : : }
11031 : : else
11032 : 805859 : gsi_next (&si);
11033 : : }
11034 : : }
11035 : :
11036 : : /* Schedule the SLP instances. */
11037 : 59211 : if (!loop_vinfo->slp_instances.is_empty ())
11038 : : {
11039 : 59211 : DUMP_VECT_SCOPE ("scheduling SLP instances");
11040 : 59211 : vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
11041 : : }
11042 : :
11043 : : /* Generate the loop invariant statements. */
11044 : 59211 : if (!gimple_seq_empty_p (LOOP_VINFO_INV_PATTERN_DEF_SEQ (loop_vinfo)))
11045 : : {
11046 : 74 : if (dump_enabled_p ())
11047 : 30 : dump_printf_loc (MSG_NOTE, vect_location,
11048 : : "------>generating loop invariant statements\n");
11049 : 74 : gimple_stmt_iterator gsi;
11050 : 74 : gsi = gsi_after_labels (loop_preheader_edge (loop)->src);
11051 : 74 : gsi_insert_seq_before (&gsi, LOOP_VINFO_INV_PATTERN_DEF_SEQ (loop_vinfo),
11052 : : GSI_CONTINUE_LINKING);
11053 : : }
11054 : :
11055 : : /* Stub out scalar statements that must not survive vectorization and
11056 : : were not picked as relevant in any SLP instance.
11057 : : Doing this here helps with grouped statements, or statements that
11058 : : are involved in patterns. */
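 : :
 : :   /* Two illustrative cases of what gets stubbed out (hypothetical GIMPLE,
 : :      not from a real dump): a scalar masked load whose result never became
 : :      a vector is replaced by a zero of its type,
 : :
 : :        _5 = .MASK_LOAD (p_2, 0B, mask_3);    -->    _5 = 0;
 : :
 : :      and a scalar conditional internal-fn call is replaced by its trailing
 : :      "else" argument,
 : :
 : :        _7 = .COND_ADD (mask_3, a_4, b_6, else_8);    -->    _7 = else_8;  */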
11059 : 180781 : for (i = 0; i < nbbs; i++)
11060 : : {
11061 : 121570 : basic_block bb = bbs[i];
11062 : 121570 : stmt_vec_info stmt_info;
11063 : 243140 : for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
11064 : 1615939 : !gsi_end_p (gsi); gsi_next (&gsi))
11065 : : {
11066 : 1494369 : gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
11067 : 5577 : if (!call || !gimple_call_internal_p (call))
11068 : 1489940 : continue;
11069 : 4429 : internal_fn ifn = gimple_call_internal_fn (call);
11070 : 4429 : if (ifn == IFN_MASK_LOAD)
11071 : : {
11072 : 584 : tree lhs = gimple_get_lhs (call);
11073 : 584 : if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11074 : : {
11075 : 0 : tree zero = build_zero_cst (TREE_TYPE (lhs));
11076 : 0 : gimple *new_stmt = gimple_build_assign (lhs, zero);
11077 : 0 : gsi_replace (&gsi, new_stmt, true);
11078 : : }
11079 : : }
11080 : 3845 : else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
11081 : : {
11082 : 1713 : tree lhs = gimple_get_lhs (call);
11083 : 1713 : if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11084 : : {
11085 : 0 : tree else_arg
11086 : 0 : = gimple_call_arg (call, gimple_call_num_args (call) - 1);
11087 : 0 : gimple *new_stmt = gimple_build_assign (lhs, else_arg);
11088 : 0 : gsi_replace (&gsi, new_stmt, true);
11089 : : }
11090 : : }
11091 : 2132 : else if (ifn == IFN_MASK_CALL
11092 : 4 : && (stmt_info = loop_vinfo->lookup_stmt (call))
11093 : 4 : && !STMT_VINFO_RELEVANT_P (stmt_info)
11094 : 2136 : && !STMT_VINFO_LIVE_P (stmt_info))
11095 : : {
11096 : 4 : gcc_assert (!gimple_call_lhs (stmt_info->stmt));
11097 : 4 : loop_vinfo->remove_stmt (stmt_info);
11098 : : }
11099 : : }
11100 : : }
11101 : :
11102 : :   /* The vectorization factor is always > 1, so if we use an IV increment of 1,
11103 : :      a zero NITERS becomes a nonzero NITERS_VECTOR.  */
11104 : 59211 : if (integer_onep (step_vector))
11105 : 59197 : niters_no_overflow = true;
11106 : 59211 : vect_set_loop_condition (loop, LOOP_VINFO_IV_EXIT (loop_vinfo), loop_vinfo,
11107 : : niters_vector, step_vector, niters_vector_mult_vf,
11108 : 59211 : !niters_no_overflow);
11109 : :
11110 : 59211 : unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
11111 : :
11112 : : /* True if the final iteration might not handle a full vector's
11113 : : worth of scalar iterations. */
11114 : 118422 : bool final_iter_may_be_partial
11115 : 59211 : = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11116 : 59211 : || LOOP_VINFO_EARLY_BREAKS (loop_vinfo);
11117 : :
11118 : : /* +1 to convert latch counts to loop iteration counts. */
11119 : 59211 : int bias_for_lowest = 1;
11120 : :
11121 : :   /* When peeling for gaps we take away one scalar iteration from the
11122 : :      vector loop, so we can adjust the upper bound by one scalar
11123 : :      iteration.  But only when we know the bound applies to the IV exit
11124 : :      test, which might not be true when we have multiple exits.  */
11125 : 59211 : if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11126 : 115245 : bias_for_lowest -= LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
11127 : :
11128 : 59211 : int bias_for_assumed = bias_for_lowest;
11129 : 59211 : int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
11130 : 59211 : if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
11131 : : {
11132 : : /* When the amount of peeling is known at compile time, the first
11133 : : iteration will have exactly alignment_npeels active elements.
11134 : : In the worst case it will have at least one. */
11135 : 1 : int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
11136 : 1 : bias_for_lowest += lowest_vf - min_first_active;
11137 : 1 : bias_for_assumed += assumed_vf - min_first_active;
11138 : : }
11139 : : /* In these calculations the "- 1" converts loop iteration counts
11140 : : back to latch counts. */
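 : :
 : :   /* Worked example (illustrative numbers only): with a latch-count upper
 : :      bound of 99 (at most 100 iterations), bias_for_lowest == 1,
 : :      lowest_vf == 8 and a final iteration that is always full, the new
 : :      bound is floor ((99 + 1) / 8) - 1 == 12 - 1 == 11 latch iterations
 : :      for the vector loop.  */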
11141 : 59211 : if (loop->any_upper_bound)
11142 : : {
11143 : 59211 : loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
11144 : 59211 : loop->nb_iterations_upper_bound
11145 : 59211 : = (final_iter_may_be_partial
11146 : 60643 : ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
11147 : 2864 : lowest_vf) - 1
11148 : 57779 : : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
11149 : 115558 : lowest_vf) - 1);
11150 : 59211 : if (main_vinfo
11151 : : /* Both peeling for alignment and peeling for gaps can end up
11152 : : with the scalar epilogue running for more than VF-1 iterations. */
11153 : 6766 : && !main_vinfo->peeling_for_alignment
11154 : 6718 : && !main_vinfo->peeling_for_gaps)
11155 : : {
11156 : 6561 : unsigned int bound;
11157 : 6561 : poly_uint64 main_iters
11158 : 6561 : = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
11159 : : LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
11160 : 6561 : main_iters
11161 : 6561 : = upper_bound (main_iters,
11162 : 6561 : LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
11163 : 13122 : if (can_div_away_from_zero_p (main_iters,
11164 : 6561 : LOOP_VINFO_VECT_FACTOR (loop_vinfo),
11165 : : &bound))
11166 : 6561 : loop->nb_iterations_upper_bound
11167 : 6561 : = wi::umin ((bound_wide_int) (bound - 1),
11168 : 6561 : loop->nb_iterations_upper_bound);
11169 : : }
11170 : : }
11171 : 59211 : if (loop->any_likely_upper_bound)
11172 : 59211 : loop->nb_iterations_likely_upper_bound
11173 : 59211 : = (final_iter_may_be_partial
11174 : 60643 : ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
11175 : 1432 : + bias_for_lowest, lowest_vf) - 1
11176 : 57779 : : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
11177 : 59211 : + bias_for_lowest, lowest_vf) - 1);
11178 : 59211 : if (loop->any_estimate)
11179 : 34026 : loop->nb_iterations_estimate
11180 : 34026 : = (final_iter_may_be_partial
11181 : 34813 : ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
11182 : 1574 : assumed_vf) - 1
11183 : 33239 : : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
11184 : 67265 : assumed_vf) - 1);
11185 : 59211 : scale_profile_for_vect_loop (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
11186 : : assumed_vf, flat);
11187 : :
11188 : 59211 : if (dump_enabled_p ())
11189 : : {
11190 : 10282 : if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
11191 : : {
11192 : 8969 : dump_printf_loc (MSG_NOTE, vect_location,
11193 : : "LOOP VECTORIZED\n");
11194 : 8969 : if (loop->inner)
11195 : 281 : dump_printf_loc (MSG_NOTE, vect_location,
11196 : : "OUTER LOOP VECTORIZED\n");
11197 : 8969 : dump_printf (MSG_NOTE, "\n");
11198 : : }
11199 : : else
11200 : 1313 : dump_printf_loc (MSG_NOTE, vect_location,
11201 : : "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
11202 : 1313 : GET_MODE_NAME (loop_vinfo->vector_mode));
11203 : : }
11204 : :
11205 : : /* Loops vectorized with a variable factor won't benefit from
11206 : : unrolling/peeling. */
11207 : 59211 : if (!vf.is_constant ())
11208 : : {
11209 : : loop->unroll = 1;
11210 : : if (dump_enabled_p ())
11211 : : dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
11212 : : " variable-length vectorization factor\n");
11213 : : }
11214 : :
11215 : :   /* When we have unrolled the loop due to a user-requested value we should
11216 : :      leave it up to the RTL unroll heuristics to determine whether it's still
11217 : :      worthwhile to unroll more.  */
11218 : 59211 : if (LOOP_VINFO_USER_UNROLL (loop_vinfo))
11219 : 40 : loop->unroll = 0;
11220 : :
11221 : : /* Free SLP instances here because otherwise stmt reference counting
11222 : : won't work. */
11223 : : slp_instance instance;
11224 : 148701 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
11225 : 89490 : vect_free_slp_instance (instance);
11226 : 59211 : LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
11227 : :   /* Clear the safelen field since its value is no longer valid after
11228 : :      vectorization: the vectorized loop can have loop-carried dependencies.  */
11229 : 59211 : loop->safelen = 0;
11230 : :
11231 : 59211 : if (epilogue)
11232 : : {
11233 : : /* Accumulate past advancements made. */
11234 : 6766 : if (LOOP_VINFO_DRS_ADVANCED_BY (loop_vinfo))
11235 : 88 : advance = fold_build2 (PLUS_EXPR, TREE_TYPE (advance),
11236 : : LOOP_VINFO_DRS_ADVANCED_BY (loop_vinfo),
11237 : : advance);
11238 : 6766 : update_epilogue_loop_vinfo (epilogue, advance);
11239 : :
11240 : 6766 : epilogue->simduid = loop->simduid;
11241 : 6766 : epilogue->force_vectorize = loop->force_vectorize;
11242 : 6766 : epilogue->dont_vectorize = false;
11243 : : }
11244 : :
11245 : 59211 : return epilogue;
11246 : 59211 : }
11247 : :
11248 : : /* The code below performs a simple optimization: revert if-conversion for
11249 : :    masked stores, i.e. if the mask of a store is all zeros, do not perform
11250 : :    the store and, if possible, also skip the producers of the stored values.
11251 : :    For example,
11252 : : for (i=0; i<n; i++)
11253 : : if (c[i])
11254 : : {
11255 : : p1[i] += 1;
11256 : : p2[i] = p3[i] +2;
11257 : : }
11258 : : this transformation will produce the following semi-hammock:
11259 : :
11260 : : if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
11261 : : {
11262 : : vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
11263 : : vect__12.22_172 = vect__11.19_170 + vect_cst__171;
11264 : : MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
11265 : : vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
11266 : : vect__19.28_184 = vect__18.25_182 + vect_cst__183;
11267 : : MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
11268 : : }
11269 : : */
11270 : :
11271 : : void
11272 : 493 : optimize_mask_stores (class loop *loop)
11273 : : {
11274 : 493 : basic_block *bbs = get_loop_body (loop);
11275 : 493 : unsigned nbbs = loop->num_nodes;
11276 : 493 : unsigned i;
11277 : 493 : basic_block bb;
11278 : 493 : class loop *bb_loop;
11279 : 493 : gimple_stmt_iterator gsi;
11280 : 493 : gimple *stmt;
11281 : 493 : auto_vec<gimple *> worklist;
11282 : 493 : auto_purge_vect_location sentinel;
11283 : :
11284 : 493 : vect_location = find_loop_location (loop);
11285 : : /* Pick up all masked stores in loop if any. */
11286 : 1972 : for (i = 0; i < nbbs; i++)
11287 : : {
11288 : 986 : bb = bbs[i];
11289 : 16259 : for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
11290 : 14287 : gsi_next (&gsi))
11291 : : {
11292 : 14287 : stmt = gsi_stmt (gsi);
11293 : 14287 : if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
11294 : 647 : worklist.safe_push (stmt);
11295 : : }
11296 : : }
11297 : :
11298 : 493 : free (bbs);
11299 : 493 : if (worklist.is_empty ())
11300 : 68 : return;
11301 : :
11302 : : /* Loop has masked stores. */
11303 : 1055 : while (!worklist.is_empty ())
11304 : : {
11305 : 630 : gimple *last, *last_store;
11306 : 630 : edge e, efalse;
11307 : 630 : tree mask;
11308 : 630 : basic_block store_bb, join_bb;
11309 : 630 : gimple_stmt_iterator gsi_to;
11310 : 630 : tree vdef, new_vdef;
11311 : 630 : gphi *phi;
11312 : 630 : tree vectype;
11313 : 630 : tree zero;
11314 : :
11315 : 630 : last = worklist.pop ();
11316 : 630 : mask = gimple_call_arg (last, 2);
11317 : 630 : bb = gimple_bb (last);
11318 : :       /* Create then_bb and the if-then structure in the CFG; then_bb belongs
11319 : :          to the same loop as if_bb.  It could be different from LOOP when a
11320 : :          two-level loop nest is vectorized and the mask_store belongs to the
11321 : :          inner one.  */
11322 : 630 : e = split_block (bb, last);
11323 : 630 : bb_loop = bb->loop_father;
11324 : 630 : gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
11325 : 630 : join_bb = e->dest;
11326 : 630 : store_bb = create_empty_bb (bb);
11327 : 630 : add_bb_to_loop (store_bb, bb_loop);
11328 : 630 : e->flags = EDGE_TRUE_VALUE;
11329 : 630 : efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
11330 : :       /* Put STORE_BB on the likely path.  */
11331 : 630 : efalse->probability = profile_probability::likely ();
11332 : 630 : e->probability = efalse->probability.invert ();
11333 : 630 : store_bb->count = efalse->count ();
11334 : 630 : make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
11335 : 630 : if (dom_info_available_p (CDI_DOMINATORS))
11336 : 630 : set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
11337 : 630 : if (dump_enabled_p ())
11338 : 299 : dump_printf_loc (MSG_NOTE, vect_location,
11339 : : "Create new block %d to sink mask stores.",
11340 : : store_bb->index);
11341 : : /* Create vector comparison with boolean result. */
11342 : 630 : vectype = TREE_TYPE (mask);
11343 : 630 : zero = build_zero_cst (vectype);
11344 : 630 : stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
11345 : 630 : gsi = gsi_last_bb (bb);
11346 : 630 : gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
11347 : : /* Create new PHI node for vdef of the last masked store:
11348 : : .MEM_2 = VDEF <.MEM_1>
11349 : : will be converted to
11350 : : .MEM.3 = VDEF <.MEM_1>
11351 : : and new PHI node will be created in join bb
11352 : : .MEM_2 = PHI <.MEM_1, .MEM_3>
11353 : : */
11354 : 630 : vdef = gimple_vdef (last);
11355 : 630 : new_vdef = make_ssa_name (gimple_vop (cfun), last);
11356 : 630 : gimple_set_vdef (last, new_vdef);
11357 : 630 : phi = create_phi_node (vdef, join_bb);
11358 : 630 : add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
11359 : :
11360 : : /* Put all masked stores with the same mask to STORE_BB if possible. */
11361 : 664 : while (true)
11362 : : {
11363 : 647 : gimple_stmt_iterator gsi_from;
11364 : 647 : gimple *stmt1 = NULL;
11365 : :
11366 : : /* Move masked store to STORE_BB. */
11367 : 647 : last_store = last;
11368 : 647 : gsi = gsi_for_stmt (last);
11369 : 647 : gsi_from = gsi;
11370 : : /* Shift GSI to the previous stmt for further traversal. */
11371 : 647 : gsi_prev (&gsi);
11372 : 647 : gsi_to = gsi_start_bb (store_bb);
11373 : 647 : gsi_move_before (&gsi_from, &gsi_to);
11374 : : /* Setup GSI_TO to the non-empty block start. */
11375 : 647 : gsi_to = gsi_start_bb (store_bb);
11376 : 647 : if (dump_enabled_p ())
11377 : 315 : dump_printf_loc (MSG_NOTE, vect_location,
11378 : : "Move stmt to created bb\n%G", last);
11379 : : /* Move all stored value producers if possible. */
11380 : 4426 : while (!gsi_end_p (gsi))
11381 : : {
11382 : 4425 : tree lhs;
11383 : 4425 : imm_use_iterator imm_iter;
11384 : 4425 : use_operand_p use_p;
11385 : 4425 : bool res;
11386 : :
11387 : : /* Skip debug statements. */
11388 : 4425 : if (is_gimple_debug (gsi_stmt (gsi)))
11389 : : {
11390 : 3 : gsi_prev (&gsi);
11391 : 2770 : continue;
11392 : : }
11393 : 4422 : stmt1 = gsi_stmt (gsi);
11394 : :               /* Do not consider statements writing to memory or having a
11395 : :                  volatile operand.  */
11396 : 8724 : if (gimple_vdef (stmt1)
11397 : 8724 : || gimple_has_volatile_ops (stmt1))
11398 : : break;
11399 : 4302 : gsi_from = gsi;
11400 : 4302 : gsi_prev (&gsi);
11401 : 4302 : lhs = gimple_get_lhs (stmt1);
11402 : 4302 : if (!lhs)
11403 : : break;
11404 : :
11405 : : /* LHS of vectorized stmt must be SSA_NAME. */
11406 : 4302 : if (TREE_CODE (lhs) != SSA_NAME)
11407 : : break;
11408 : :
11409 : 4302 : if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11410 : : {
11411 : : /* Remove dead scalar statement. */
11412 : 3059 : if (has_zero_uses (lhs))
11413 : : {
11414 : 2767 : gsi_remove (&gsi_from, true);
11415 : 2767 : release_defs (stmt1);
11416 : 2767 : continue;
11417 : : }
11418 : : }
11419 : :
11420 : : /* Check that LHS does not have uses outside of STORE_BB. */
11421 : 1535 : res = true;
11422 : 2634 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
11423 : : {
11424 : 1625 : gimple *use_stmt;
11425 : 1625 : use_stmt = USE_STMT (use_p);
11426 : 1625 : if (is_gimple_debug (use_stmt))
11427 : 0 : continue;
11428 : 1625 : if (gimple_bb (use_stmt) != store_bb)
11429 : : {
11430 : : res = false;
11431 : : break;
11432 : : }
11433 : : }
11434 : 1535 : if (!res)
11435 : : break;
11436 : :
11437 : 1009 : if (gimple_vuse (stmt1)
11438 : 1443 : && gimple_vuse (stmt1) != gimple_vuse (last_store))
11439 : : break;
11440 : :
11441 : : /* Can move STMT1 to STORE_BB. */
11442 : 1009 : if (dump_enabled_p ())
11443 : 529 : dump_printf_loc (MSG_NOTE, vect_location,
11444 : : "Move stmt to created bb\n%G", stmt1);
11445 : 1009 : gsi_move_before (&gsi_from, &gsi_to);
11446 : : /* Shift GSI_TO for further insertion. */
11447 : 2018 : gsi_prev (&gsi_to);
11448 : : }
11449 : : /* Put other masked stores with the same mask to STORE_BB. */
11450 : 647 : if (worklist.is_empty ()
11451 : 222 : || gimple_call_arg (worklist.last (), 2) != mask
11452 : 17 : || worklist.last () != stmt1)
11453 : : break;
11454 : 17 : last = worklist.pop ();
11455 : 17 : }
11456 : 1260 : add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
11457 : : }
11458 : 493 : }
11459 : :
11460 : : /* Decide whether it is possible to use a zero-based induction variable
11461 : : when vectorizing LOOP_VINFO with partial vectors. If it is, return
11462 : : the value that the induction variable must be able to hold in order
11463 : : to ensure that the rgroups eventually have no active vector elements.
11464 : : Return -1 otherwise. */
11465 : :
11466 : : widest_int
11467 : 62 : vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
11468 : : {
11469 : 62 : tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
11470 : 62 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11471 : 62 : unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
11472 : :
11473 : : /* Calculate the value that the induction variable must be able
11474 : : to hit in order to ensure that we end the loop with an all-false mask.
11475 : : This involves adding the maximum number of inactive trailing scalar
11476 : : iterations. */
11477 : 62 : widest_int iv_limit = -1;
11478 : 62 : if (max_loop_iterations (loop, &iv_limit))
11479 : : {
11480 : 62 : if (niters_skip)
11481 : : {
11482 : : /* Add the maximum number of skipped iterations to the
11483 : : maximum iteration count. */
11484 : 0 : if (TREE_CODE (niters_skip) == INTEGER_CST)
11485 : 0 : iv_limit += wi::to_widest (niters_skip);
11486 : : else
11487 : 0 : iv_limit += max_vf - 1;
11488 : : }
11489 : 62 : else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
11490 : : /* Make a conservatively-correct assumption. */
11491 : 2 : iv_limit += max_vf - 1;
11492 : :
11493 : : /* IV_LIMIT is the maximum number of latch iterations, which is also
11494 : : the maximum in-range IV value. Round this value down to the previous
11495 : : vector alignment boundary and then add an extra full iteration. */
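 : :       /* For example (illustrative numbers only): with a maximum latch
 : :          count of 1000 and a constant VF of 16 (so max_vf == 16), this
 : :          computes (1000 & -16) + 16 == 992 + 16 == 1008.  */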
11496 : 62 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11497 : 62 : iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
11498 : : }
11499 : 62 : return iv_limit;
11500 : : }
11501 : :
11502 : : /* For the given rgroup_controls RGC, check whether an induction variable
11503 : : would ever hit a value that produces a set of all-false masks or zero
11504 : : lengths before wrapping around. Return true if it's possible to wrap
11505 : : around before hitting the desirable value, otherwise return false. */
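 : :
 : : /* For instance (illustrative numbers only): with iv_limit == 1008 and an
 : :    rgroup needing nitems == 2 scalars per iteration, the IV must be able to
 : :    hold 1008 * 2 == 2016, which needs 11 bits; if the compare type has only
 : :    8 bits of precision the IV might wrap, so we return true.  */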
11506 : :
11507 : : bool
11508 : 0 : vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
11509 : : {
11510 : 0 : widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
11511 : :
11512 : 0 : if (iv_limit == -1)
11513 : : return true;
11514 : :
11515 : 0 : tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
11516 : 0 : unsigned int compare_precision = TYPE_PRECISION (compare_type);
11517 : 0 : unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
11518 : :
11519 : 0 : if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
11520 : : return true;
11521 : :
11522 : : return false;
11523 : 0 : }
|