Branch data Line data Source code
1 : : /* Loop Vectorization
2 : : Copyright (C) 2003-2025 Free Software Foundation, Inc.
3 : : Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 : : Ira Rosen <irar@il.ibm.com>
5 : :
6 : : This file is part of GCC.
7 : :
8 : : GCC is free software; you can redistribute it and/or modify it under
9 : : the terms of the GNU General Public License as published by the Free
10 : : Software Foundation; either version 3, or (at your option) any later
11 : : version.
12 : :
13 : : GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 : : WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 : : FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 : : for more details.
17 : :
18 : : You should have received a copy of the GNU General Public License
19 : : along with GCC; see the file COPYING3. If not see
20 : : <http://www.gnu.org/licenses/>. */
21 : :
22 : : #define INCLUDE_ALGORITHM
23 : : #include "config.h"
24 : : #include "system.h"
25 : : #include "coretypes.h"
26 : : #include "backend.h"
27 : : #include "target.h"
28 : : #include "rtl.h"
29 : : #include "tree.h"
30 : : #include "gimple.h"
31 : : #include "cfghooks.h"
32 : : #include "tree-pass.h"
33 : : #include "ssa.h"
34 : : #include "optabs-tree.h"
35 : : #include "memmodel.h"
36 : : #include "optabs.h"
37 : : #include "diagnostic-core.h"
38 : : #include "fold-const.h"
39 : : #include "stor-layout.h"
40 : : #include "cfganal.h"
41 : : #include "gimplify.h"
42 : : #include "gimple-iterator.h"
43 : : #include "gimplify-me.h"
44 : : #include "tree-ssa-loop-ivopts.h"
45 : : #include "tree-ssa-loop-manip.h"
46 : : #include "tree-ssa-loop-niter.h"
47 : : #include "tree-ssa-loop.h"
48 : : #include "cfgloop.h"
49 : : #include "tree-scalar-evolution.h"
50 : : #include "tree-vectorizer.h"
51 : : #include "gimple-fold.h"
52 : : #include "cgraph.h"
53 : : #include "tree-cfg.h"
54 : : #include "tree-if-conv.h"
55 : : #include "internal-fn.h"
56 : : #include "tree-vector-builder.h"
57 : : #include "vec-perm-indices.h"
58 : : #include "tree-eh.h"
59 : : #include "case-cfn-macros.h"
60 : : #include "langhooks.h"
61 : : #include "opts.h"
62 : :
63 : : /* Loop Vectorization Pass.
64 : :
65 : : This pass tries to vectorize loops.
66 : :
67 : : For example, the vectorizer transforms the following simple loop:
68 : :
69 : : short a[N]; short b[N]; short c[N]; int i;
70 : :
71 : : for (i=0; i<N; i++){
72 : : a[i] = b[i] + c[i];
73 : : }
74 : :
75 : : as if it was manually vectorized by rewriting the source code into:
76 : :
77 : : typedef int __attribute__((mode(V8HI))) v8hi;
78 : : short a[N]; short b[N]; short c[N]; int i;
79 : : v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
80 : : v8hi va, vb, vc;
81 : :
82 : : for (i=0; i<N/8; i++){
83 : : vb = pb[i];
84 : : vc = pc[i];
85 : : va = vb + vc;
86 : : pa[i] = va;
87 : : }
88 : :
89 : : The main entry to this pass is vectorize_loops(), in which
90 : : the vectorizer applies a set of analyses on a given set of loops,
91 : : followed by the actual vectorization transformation for the loops that
92 : : had successfully passed the analysis phase.
93 : : Throughout this pass we make a distinction between two types of
94 : : data: scalars (which are represented by SSA_NAMES), and memory references
95 : : ("data-refs"). These two types of data require different handling both
96 : : during analysis and transformation. The types of data-refs that the
97 : : vectorizer currently supports are ARRAY_REFS which base is an array DECL
98 : : (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
99 : : accesses are required to have a simple (consecutive) access pattern.
100 : :
101 : : Analysis phase:
102 : : ===============
103 : : The driver for the analysis phase is vect_analyze_loop().
104 : : It applies a set of analyses, some of which rely on the scalar evolution
105 : : analyzer (scev) developed by Sebastian Pop.
106 : :
107 : : During the analysis phase the vectorizer records some information
108 : : per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
109 : : loop, as well as general information about the loop as a whole, which is
110 : : recorded in a "loop_vec_info" struct attached to each loop.
111 : :
112 : : Transformation phase:
113 : : =====================
114 : : The loop transformation phase scans all the stmts in the loop, and
115 : : creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
116 : : the loop that needs to be vectorized. It inserts the vector code sequence
117 : : just before the scalar stmt S, and records a pointer to the vector code
118 : : in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
119 : : attached to S). This pointer will be used for the vectorization of following
120 : : stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
121 : : otherwise, we rely on dead code elimination for removing it.
122 : :
123 : : For example, say stmt S1 was vectorized into stmt VS1:
124 : :
125 : : VS1: vb = px[i];
126 : : S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
127 : : S2: a = b;
128 : :
129 : : To vectorize stmt S2, the vectorizer first finds the stmt that defines
130 : : the operand 'b' (S1), and gets the relevant vector def 'vb' from the
131 : : vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
132 : : resulting sequence would be:
133 : :
134 : : VS1: vb = px[i];
135 : : S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
136 : : VS2: va = vb;
137 : : S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
138 : :
139 : : Operands that are not SSA_NAMEs, are data-refs that appear in
140 : : load/store operations (like 'x[i]' in S1), and are handled differently.
141 : :
142 : : Target modeling:
143 : : =================
144 : : Currently the only target specific information that is used is the
145 : : size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
146 : : Targets that can support different sizes of vectors, for now will need
147 : : to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
148 : : flexibility will be added in the future.
149 : :
150 : : Since we only vectorize operations which vector form can be
151 : : expressed using existing tree codes, to verify that an operation is
152 : : supported, the vectorizer checks the relevant optab at the relevant
153 : : machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
154 : : the value found is CODE_FOR_nothing, then there's no target support, and
155 : : we can't vectorize the stmt.
156 : :
157 : : For additional information on this project see:
158 : : http://gcc.gnu.org/projects/tree-ssa/vectorization.html
159 : : */
160 : :
161 : : static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
162 : : unsigned *);
163 : : static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
164 : : gphi **);
165 : :
166 : :
167 : : /* Function vect_is_simple_iv_evolution.
168 : :
169 : : FORNOW: A simple evolution of an induction variables in the loop is
170 : : considered a polynomial evolution. */
171 : :
172 : : static bool
173 : 667176 : vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn,
174 : : stmt_vec_info stmt_info)
175 : : {
176 : 667176 : tree init_expr;
177 : 667176 : tree step_expr;
178 : 667176 : tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
179 : 667176 : basic_block bb;
180 : :
181 : : /* When there is no evolution in this loop, the evolution function
182 : : is not "simple". */
183 : 667176 : if (evolution_part == NULL_TREE)
184 : : return false;
185 : :
186 : : /* When the evolution is a polynomial of degree >= 2
187 : : the evolution function is not "simple". */
188 : 709246 : if (tree_is_chrec (evolution_part))
189 : : return false;
190 : :
191 : 617101 : step_expr = evolution_part;
192 : 617101 : init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
193 : :
194 : 617101 : if (dump_enabled_p ())
195 : 36569 : dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
196 : : step_expr, init_expr);
197 : :
198 : 617101 : STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = init_expr;
199 : 617101 : STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = step_expr;
200 : :
201 : 617101 : if (TREE_CODE (step_expr) != INTEGER_CST
202 : 49578 : && (TREE_CODE (step_expr) != SSA_NAME
203 : 41847 : || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
204 : 41685 : && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
205 : 7532 : || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
206 : 111 : && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
207 : 111 : || !flag_associative_math)))
208 : 659228 : && (TREE_CODE (step_expr) != REAL_CST
209 : 407 : || !flag_associative_math))
210 : : {
211 : 42070 : if (dump_enabled_p ())
212 : 2714 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
213 : : "step unknown.\n");
214 : 42070 : return false;
215 : : }
216 : :
217 : : return true;
218 : : }
219 : :
220 : : /* Function vect_is_nonlinear_iv_evolution
221 : :
222 : : Only support nonlinear induction for integer type
223 : : 1. neg
224 : : 2. mul by constant
225 : : 3. lshift/rshift by constant.
226 : :
227 : : For neg induction, return a fake step as integer -1. */
228 : : static bool
229 : 89804 : vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
230 : : gphi* loop_phi_node)
231 : : {
232 : 89804 : tree init_expr, ev_expr, result, op1, op2;
233 : 89804 : gimple* def;
234 : :
235 : 89804 : if (gimple_phi_num_args (loop_phi_node) != 2)
236 : : return false;
237 : :
238 : 89804 : init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
239 : 89804 : ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
240 : :
241 : : /* Support nonlinear induction only for integer type. */
242 : 89804 : if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
243 : : return false;
244 : :
245 : 66984 : result = PHI_RESULT (loop_phi_node);
246 : :
247 : 66984 : if (TREE_CODE (ev_expr) != SSA_NAME
248 : 64852 : || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
249 : 66984 : || !is_gimple_assign (def))
250 : : return false;
251 : :
252 : 59788 : enum tree_code t_code = gimple_assign_rhs_code (def);
253 : 59788 : tree step;
254 : 59788 : switch (t_code)
255 : : {
256 : 1790 : case NEGATE_EXPR:
257 : 1790 : if (gimple_assign_rhs1 (def) != result)
258 : : return false;
259 : 1790 : step = build_int_cst (TREE_TYPE (init_expr), -1);
260 : 1790 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
261 : 1790 : break;
262 : :
263 : 9718 : case RSHIFT_EXPR:
264 : 9718 : case LSHIFT_EXPR:
265 : 9718 : case MULT_EXPR:
266 : 9718 : op1 = gimple_assign_rhs1 (def);
267 : 9718 : op2 = gimple_assign_rhs2 (def);
268 : 9718 : if (TREE_CODE (op2) != INTEGER_CST
269 : 6179 : || op1 != result)
270 : : return false;
271 : 6048 : step = op2;
272 : 6048 : if (t_code == LSHIFT_EXPR)
273 : 193 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
274 : 5855 : else if (t_code == RSHIFT_EXPR)
275 : 5235 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
276 : : /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
277 : : else
278 : 620 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
279 : : break;
280 : :
281 : : default:
282 : : return false;
283 : : }
284 : :
285 : 7838 : STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = init_expr;
286 : 7838 : STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = step;
287 : :
288 : 7838 : return true;
289 : : }
290 : :
291 : : /* Returns true if Phi is a first-order recurrence. A first-order
292 : : recurrence is a non-reduction recurrence relation in which the value of
293 : : the recurrence in the current loop iteration equals a value defined in
294 : : the previous iteration. */
295 : :
296 : : static bool
297 : 21011 : vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
298 : : gphi *phi)
299 : : {
300 : : /* A nested cycle isn't vectorizable as first order recurrence. */
301 : 21011 : if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
302 : : return false;
303 : :
304 : : /* Ensure the loop latch definition is from within the loop. */
305 : 20845 : edge latch = loop_latch_edge (loop);
306 : 20845 : tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
307 : 20845 : if (TREE_CODE (ldef) != SSA_NAME
308 : 18432 : || SSA_NAME_IS_DEFAULT_DEF (ldef)
309 : 18404 : || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
310 : 38026 : || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
311 : 3977 : return false;
312 : :
313 : 16868 : tree def = gimple_phi_result (phi);
314 : :
315 : : /* Ensure every use_stmt of the phi node is dominated by the latch
316 : : definition. */
317 : 16868 : imm_use_iterator imm_iter;
318 : 16868 : use_operand_p use_p;
319 : 35964 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
320 : 18604 : if (!is_gimple_debug (USE_STMT (use_p))
321 : 36206 : && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
322 : 10637 : || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
323 : : USE_STMT (use_p))))
324 : 16376 : return false;
325 : :
326 : : /* First-order recurrence autovectorization needs shuffle vector. */
327 : 492 : tree scalar_type = TREE_TYPE (def);
328 : 492 : tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
329 : 492 : if (!vectype)
330 : : return false;
331 : :
332 : : return true;
333 : : }
334 : :
335 : : /* Function vect_analyze_scalar_cycles_1.
336 : :
337 : : Examine the cross iteration def-use cycles of scalar variables
338 : : in LOOP. LOOP_VINFO represents the loop that is now being
339 : : considered for vectorization (can be LOOP, or an outer-loop
340 : : enclosing LOOP). SLP indicates there will be some subsequent
341 : : slp analyses or not. */
342 : :
343 : : static void
344 : 325686 : vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
345 : : {
346 : 325686 : basic_block bb = loop->header;
347 : 325686 : auto_vec<stmt_vec_info, 64> worklist;
348 : 325686 : gphi_iterator gsi;
349 : :
350 : 325686 : DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
351 : :
352 : : /* First - identify all inductions. Reduction detection assumes that all the
353 : : inductions have been identified, therefore, this order must not be
354 : : changed. */
355 : 1170374 : for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
356 : : {
357 : 844688 : gphi *phi = gsi.phi ();
358 : 844688 : tree access_fn = NULL;
359 : 844688 : tree def = PHI_RESULT (phi);
360 : 844688 : stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
361 : :
362 : : /* Skip virtual phi's. The data dependences that are associated with
363 : : virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
364 : 1689376 : if (virtual_operand_p (def))
365 : 261825 : continue;
366 : :
367 : : /* Skip already analyzed inner loop PHIs of double reductions. */
368 : 668069 : if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_vinfo)))
369 : 893 : continue;
370 : :
371 : 667176 : if (dump_enabled_p ())
372 : 38451 : dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
373 : : (gimple *) phi);
374 : :
375 : 667176 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
376 : :
377 : : /* Analyze the evolution function. */
378 : 667176 : access_fn = analyze_scalar_evolution (loop, def);
379 : 667176 : if (dump_enabled_p ())
380 : 38451 : dump_printf_loc (MSG_NOTE, vect_location,
381 : : "Access function of PHI: %T\n", access_fn);
382 : 667176 : if (access_fn)
383 : 667176 : STRIP_NOPS (access_fn);
384 : :
385 : 751489 : if ((!access_fn
386 : 667176 : || !vect_is_simple_iv_evolution (loop->num, access_fn, stmt_vinfo)
387 : 575031 : || (LOOP_VINFO_LOOP (loop_vinfo) != loop
388 : 10387 : && (TREE_CODE (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo))
389 : : != INTEGER_CST)))
390 : : /* Only handle nonlinear iv for same loop. */
391 : 759327 : && (LOOP_VINFO_LOOP (loop_vinfo) != loop
392 : 89804 : || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo, phi)))
393 : : {
394 : 84313 : worklist.safe_push (stmt_vinfo);
395 : 84313 : continue;
396 : : }
397 : :
398 : 582863 : gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
399 : : != NULL_TREE);
400 : 582863 : gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
401 : :
402 : 582863 : if (dump_enabled_p ())
403 : 33947 : dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
404 : 582863 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
405 : :
406 : : /* Mark if we have a non-linear IV. */
407 : 582863 : LOOP_VINFO_NON_LINEAR_IV (loop_vinfo)
408 : 582863 : = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_vinfo) != vect_step_op_add;
409 : : }
410 : :
411 : :
412 : : /* Second - identify all reductions and nested cycles. */
413 : 409999 : while (worklist.length () > 0)
414 : : {
415 : 84313 : stmt_vec_info stmt_vinfo = worklist.pop ();
416 : 84313 : gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
417 : 84313 : tree def = PHI_RESULT (phi);
418 : :
419 : 84313 : if (dump_enabled_p ())
420 : 4504 : dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
421 : : (gimple *) phi);
422 : :
423 : 168626 : gcc_assert (!virtual_operand_p (def)
424 : : && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
425 : :
426 : 84313 : gphi *double_reduc;
427 : 84313 : stmt_vec_info reduc_stmt_info
428 : 84313 : = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc);
429 : 84313 : if (reduc_stmt_info && double_reduc)
430 : : {
431 : 982 : stmt_vec_info inner_phi_info
432 : 982 : = loop_vinfo->lookup_stmt (double_reduc);
433 : : /* ??? Pass down flag we're the inner loop of a double reduc. */
434 : 982 : stmt_vec_info inner_reduc_info
435 : 982 : = vect_is_simple_reduction (loop_vinfo, inner_phi_info, NULL);
436 : 982 : if (inner_reduc_info)
437 : : {
438 : 893 : STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
439 : 893 : STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
440 : 893 : STMT_VINFO_REDUC_DEF (inner_phi_info) = inner_reduc_info;
441 : 893 : STMT_VINFO_REDUC_DEF (inner_reduc_info) = inner_phi_info;
442 : 893 : if (dump_enabled_p ())
443 : 120 : dump_printf_loc (MSG_NOTE, vect_location,
444 : : "Detected double reduction.\n");
445 : :
446 : 893 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
447 : 893 : STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
448 : 893 : STMT_VINFO_DEF_TYPE (inner_phi_info) = vect_nested_cycle;
449 : : /* Make it accessible for SLP vectorization. */
450 : 893 : LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt_info);
451 : : }
452 : 89 : else if (dump_enabled_p ())
453 : 12 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
454 : : "Unknown def-use cycle pattern.\n");
455 : : }
456 : 83331 : else if (reduc_stmt_info)
457 : : {
458 : 62320 : if (loop != LOOP_VINFO_LOOP (loop_vinfo))
459 : : {
460 : 2181 : if (dump_enabled_p ())
461 : 357 : dump_printf_loc (MSG_NOTE, vect_location,
462 : : "Detected vectorizable nested cycle.\n");
463 : :
464 : 2181 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
465 : : }
466 : : else
467 : : {
468 : 60139 : STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
469 : 60139 : STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
470 : 60139 : if (dump_enabled_p ())
471 : 3575 : dump_printf_loc (MSG_NOTE, vect_location,
472 : : "Detected reduction.\n");
473 : :
474 : 60139 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
475 : 60139 : STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
476 : 60139 : LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt_info);
477 : : }
478 : : }
479 : 21011 : else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
480 : 486 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
481 : : else
482 : 20525 : if (dump_enabled_p ())
483 : 368 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
484 : : "Unknown def-use cycle pattern.\n");
485 : : }
486 : 325686 : }
487 : :
488 : :
489 : : /* Function vect_analyze_scalar_cycles.
490 : :
491 : : Examine the cross iteration def-use cycles of scalar variables, by
492 : : analyzing the loop-header PHIs of scalar variables. Classify each
493 : : cycle as one of the following: invariant, induction, reduction, unknown.
494 : : We do that for the loop represented by LOOP_VINFO, and also to its
495 : : inner-loop, if exists.
496 : : Examples for scalar cycles:
497 : :
498 : : Example1: reduction:
499 : :
500 : : loop1:
501 : : for (i=0; i<N; i++)
502 : : sum += a[i];
503 : :
504 : : Example2: induction:
505 : :
506 : : loop2:
507 : : for (i=0; i<N; i++)
508 : : a[i] = i; */
509 : :
510 : : static void
511 : 320411 : vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
512 : : {
513 : 320411 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
514 : :
515 : 320411 : vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
516 : :
517 : : /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
518 : : Reductions in such inner-loop therefore have different properties than
519 : : the reductions in the nest that gets vectorized:
520 : : 1. When vectorized, they are executed in the same order as in the original
521 : : scalar loop, so we can't change the order of computation when
522 : : vectorizing them.
523 : : 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
524 : : current checks are too strict. */
525 : :
526 : 320411 : if (loop->inner)
527 : 5275 : vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
528 : 320411 : }
529 : :
530 : : /* Function vect_get_loop_niters.
531 : :
532 : : Determine how many iterations the loop is executed and place it
533 : : in NUMBER_OF_ITERATIONS. Place the number of latch iterations
534 : : in NUMBER_OF_ITERATIONSM1. Place the condition under which the
535 : : niter information holds in ASSUMPTIONS.
536 : :
537 : : Return the loop exit conditions. */
538 : :
539 : :
540 : : static vec<gcond *>
541 : 265658 : vect_get_loop_niters (class loop *loop, const_edge main_exit, tree *assumptions,
542 : : tree *number_of_iterations, tree *number_of_iterationsm1)
543 : : {
544 : 265658 : auto_vec<edge> exits = get_loop_exit_edges (loop);
545 : 265658 : vec<gcond *> conds;
546 : 531316 : conds.create (exits.length ());
547 : 265658 : class tree_niter_desc niter_desc;
548 : 265658 : tree niter_assumptions, niter, may_be_zero;
549 : :
550 : 265658 : *assumptions = boolean_true_node;
551 : 265658 : *number_of_iterationsm1 = chrec_dont_know;
552 : 265658 : *number_of_iterations = chrec_dont_know;
553 : :
554 : 265658 : DUMP_VECT_SCOPE ("get_loop_niters");
555 : :
556 : 265658 : if (exits.is_empty ())
557 : 0 : return conds;
558 : :
559 : 265658 : if (dump_enabled_p ())
560 : 13951 : dump_printf_loc (MSG_NOTE, vect_location, "Loop has %d exits.\n",
561 : : exits.length ());
562 : :
563 : : edge exit;
564 : : unsigned int i;
565 : 640932 : FOR_EACH_VEC_ELT (exits, i, exit)
566 : : {
567 : 375274 : gcond *cond = get_loop_exit_condition (exit);
568 : 375274 : if (cond)
569 : 365123 : conds.safe_push (cond);
570 : :
571 : 375274 : if (dump_enabled_p ())
572 : 15010 : dump_printf_loc (MSG_NOTE, vect_location, "Analyzing exit %d...\n", i);
573 : :
574 : 375274 : if (exit != main_exit)
575 : 150473 : continue;
576 : :
577 : 265658 : may_be_zero = NULL_TREE;
578 : 265658 : if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
579 : 265658 : || chrec_contains_undetermined (niter_desc.niter))
580 : 40857 : continue;
581 : :
582 : 224801 : niter_assumptions = niter_desc.assumptions;
583 : 224801 : may_be_zero = niter_desc.may_be_zero;
584 : 224801 : niter = niter_desc.niter;
585 : :
586 : 224801 : if (may_be_zero && integer_zerop (may_be_zero))
587 : : may_be_zero = NULL_TREE;
588 : :
589 : 9663 : if (may_be_zero)
590 : : {
591 : 9663 : if (COMPARISON_CLASS_P (may_be_zero))
592 : : {
593 : : /* Try to combine may_be_zero with assumptions, this can simplify
594 : : computation of niter expression. */
595 : 9663 : if (niter_assumptions && !integer_nonzerop (niter_assumptions))
596 : 1081 : niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
597 : : niter_assumptions,
598 : : fold_build1 (TRUTH_NOT_EXPR,
599 : : boolean_type_node,
600 : : may_be_zero));
601 : : else
602 : 8582 : niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
603 : : build_int_cst (TREE_TYPE (niter), 0),
604 : : rewrite_to_non_trapping_overflow (niter));
605 : :
606 : 224801 : may_be_zero = NULL_TREE;
607 : : }
608 : 0 : else if (integer_nonzerop (may_be_zero))
609 : : {
610 : 0 : *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
611 : 0 : *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
612 : 0 : continue;
613 : : }
614 : : else
615 : 0 : continue;
616 : : }
617 : :
618 : : /* Loop assumptions are based off the normal exit. */
619 : 224801 : *assumptions = niter_assumptions;
620 : 224801 : *number_of_iterationsm1 = niter;
621 : :
622 : : /* We want the number of loop header executions which is the number
623 : : of latch executions plus one.
624 : : ??? For UINT_MAX latch executions this number overflows to zero
625 : : for loops like do { n++; } while (n != 0); */
626 : 224801 : if (niter && !chrec_contains_undetermined (niter))
627 : : {
628 : 224801 : niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter),
629 : : unshare_expr (niter),
630 : : build_int_cst (TREE_TYPE (niter), 1));
631 : 224801 : if (TREE_CODE (niter) == INTEGER_CST
632 : 121118 : && TREE_CODE (*number_of_iterationsm1) != INTEGER_CST)
633 : : {
634 : : /* If we manage to fold niter + 1 into INTEGER_CST even when
635 : : niter is some complex expression, ensure back
636 : : *number_of_iterationsm1 is an INTEGER_CST as well. See
637 : : PR113210. */
638 : 0 : *number_of_iterationsm1
639 : 0 : = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), niter,
640 : : build_minus_one_cst (TREE_TYPE (niter)));
641 : : }
642 : : }
643 : 224801 : *number_of_iterations = niter;
644 : : }
645 : :
646 : 265658 : if (dump_enabled_p ())
647 : 13951 : dump_printf_loc (MSG_NOTE, vect_location, "All loop exits successfully analyzed.\n");
648 : :
649 : 265658 : return conds;
650 : 265658 : }
651 : :
652 : : /* Determine the main loop exit for the vectorizer. */
653 : :
654 : : edge
655 : 499967 : vec_init_loop_exit_info (class loop *loop)
656 : : {
657 : : /* Before we begin we must first determine which exit is the main one and
658 : : which are auxilary exits. */
659 : 499967 : auto_vec<edge> exits = get_loop_exit_edges (loop);
660 : 499967 : if (exits.length () == 1)
661 : 318916 : return exits[0];
662 : :
663 : : /* If we have multiple exits we only support counting IV at the moment.
664 : : Analyze all exits and return the last one we can analyze. */
665 : 181051 : class tree_niter_desc niter_desc;
666 : 181051 : edge candidate = NULL;
667 : 1183908 : for (edge exit : exits)
668 : : {
669 : 650587 : if (!get_loop_exit_condition (exit))
670 : 151443 : continue;
671 : :
672 : 499144 : if (number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
673 : 499144 : && !chrec_contains_undetermined (niter_desc.niter))
674 : : {
675 : 137894 : tree may_be_zero = niter_desc.may_be_zero;
676 : 137894 : if ((integer_zerop (may_be_zero)
677 : : /* As we are handling may_be_zero that's not false by
678 : : rewriting niter to may_be_zero ? 0 : niter we require
679 : : an empty latch. */
680 : 661721 : || (single_pred_p (loop->latch)
681 : 10805 : && exit->src == single_pred (loop->latch)
682 : 2690 : && (integer_nonzerop (may_be_zero)
683 : 2690 : || COMPARISON_CLASS_P (may_be_zero))))
684 : 140584 : && (!candidate
685 : 7066 : || dominated_by_p (CDI_DOMINATORS, exit->src,
686 : 7066 : candidate->src)))
687 : : candidate = exit;
688 : : }
689 : : }
690 : :
691 : 181051 : return candidate;
692 : 181051 : }
693 : :
694 : : /* Function bb_in_loop_p
695 : :
696 : : Used as predicate for dfs order traversal of the loop bbs. */
697 : :
698 : : static bool
699 : 1315923 : bb_in_loop_p (const_basic_block bb, const void *data)
700 : : {
701 : 1315923 : const class loop *const loop = (const class loop *)data;
702 : 1315923 : if (flow_bb_inside_loop_p (loop, bb))
703 : : return true;
704 : : return false;
705 : : }
706 : :
707 : :
708 : : /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
709 : : stmt_vec_info structs for all the stmts in LOOP_IN. */
710 : :
711 : 415649 : _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
712 : : : vec_info (vec_info::loop, shared),
713 : 415649 : loop (loop_in),
714 : 415649 : num_itersm1 (NULL_TREE),
715 : 415649 : num_iters (NULL_TREE),
716 : 415649 : num_iters_unchanged (NULL_TREE),
717 : 415649 : num_iters_assumptions (NULL_TREE),
718 : 415649 : vector_costs (nullptr),
719 : 415649 : scalar_costs (nullptr),
720 : 415649 : th (0),
721 : 415649 : versioning_threshold (0),
722 : 415649 : vectorization_factor (0),
723 : 415649 : main_loop_edge (nullptr),
724 : 415649 : skip_main_loop_edge (nullptr),
725 : 415649 : skip_this_loop_edge (nullptr),
726 : 415649 : reusable_accumulators (),
727 : 415649 : suggested_unroll_factor (1),
728 : 415649 : max_vectorization_factor (0),
729 : 415649 : mask_skip_niters (NULL_TREE),
730 : 415649 : mask_skip_niters_pfa_offset (NULL_TREE),
731 : 415649 : rgroup_compare_type (NULL_TREE),
732 : 415649 : simd_if_cond (NULL_TREE),
733 : 415649 : partial_vector_style (vect_partial_vectors_none),
734 : 415649 : unaligned_dr (NULL),
735 : 415649 : peeling_for_alignment (0),
736 : 415649 : ptr_mask (0),
737 : 415649 : max_spec_read_amount (0),
738 : 415649 : nonlinear_iv (false),
739 : 415649 : ivexpr_map (NULL),
740 : 415649 : scan_map (NULL),
741 : 415649 : inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
742 : 415649 : vectorizable (false),
743 : 415649 : can_use_partial_vectors_p (true),
744 : 415649 : must_use_partial_vectors_p (false),
745 : 415649 : using_partial_vectors_p (false),
746 : 415649 : using_decrementing_iv_p (false),
747 : 415649 : using_select_vl_p (false),
748 : 415649 : allow_mutual_alignment (false),
749 : 415649 : partial_load_store_bias (0),
750 : 415649 : peeling_for_gaps (false),
751 : 415649 : peeling_for_niter (false),
752 : 415649 : early_breaks (false),
753 : 415649 : user_unroll (false),
754 : 415649 : no_data_dependencies (false),
755 : 415649 : has_mask_store (false),
756 : 415649 : scalar_loop_scaling (profile_probability::uninitialized ()),
757 : 415649 : scalar_loop (NULL),
758 : 415649 : main_loop_info (NULL),
759 : 415649 : orig_loop_info (NULL),
760 : 415649 : epilogue_vinfo (NULL),
761 : 415649 : drs_advanced_by (NULL_TREE),
762 : 415649 : vec_loop_iv_exit (NULL),
763 : 415649 : vec_epilogue_loop_iv_exit (NULL),
764 : 415649 : scalar_loop_iv_exit (NULL)
765 : : {
766 : : /* CHECKME: We want to visit all BBs before their successors (except for
767 : : latch blocks, for which this assertion wouldn't hold). In the simple
768 : : case of the loop forms we allow, a dfs order of the BBs would the same
769 : : as reversed postorder traversal, so we are safe. */
770 : :
771 : 415649 : bbs = XCNEWVEC (basic_block, loop->num_nodes);
772 : 831298 : nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p, bbs,
773 : 415649 : loop->num_nodes, loop);
774 : 415649 : gcc_assert (nbbs == loop->num_nodes);
775 : :
776 : 1501962 : for (unsigned int i = 0; i < nbbs; i++)
777 : : {
778 : 1086313 : basic_block bb = bbs[i];
779 : 1086313 : gimple_stmt_iterator si;
780 : :
781 : 2207279 : for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
782 : : {
783 : 1120966 : gimple *phi = gsi_stmt (si);
784 : 1120966 : gimple_set_uid (phi, 0);
785 : 1120966 : add_stmt (phi);
786 : : }
787 : :
788 : 9325594 : for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
789 : : {
790 : 7152968 : gimple *stmt = gsi_stmt (si);
791 : 7152968 : gimple_set_uid (stmt, 0);
792 : 7152968 : if (is_gimple_debug (stmt))
793 : 2712768 : continue;
794 : 4440200 : add_stmt (stmt);
795 : : /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
796 : : third argument is the #pragma omp simd if (x) condition, when 0,
797 : : loop shouldn't be vectorized, when non-zero constant, it should
798 : : be vectorized normally, otherwise versioned with vectorized loop
799 : : done if the condition is non-zero at runtime. */
800 : 4440200 : if (loop_in->simduid
801 : 43390 : && is_gimple_call (stmt)
802 : 4268 : && gimple_call_internal_p (stmt)
803 : 4141 : && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
804 : 4137 : && gimple_call_num_args (stmt) >= 3
805 : 103 : && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
806 : 4440303 : && (loop_in->simduid
807 : 103 : == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
808 : : {
809 : 103 : tree arg = gimple_call_arg (stmt, 2);
810 : 103 : if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
811 : 103 : simd_if_cond = arg;
812 : : else
813 : 0 : gcc_assert (integer_nonzerop (arg));
814 : : }
815 : : }
816 : : }
817 : 415649 : }
818 : :
819 : : /* Free all levels of rgroup CONTROLS. */
820 : :
821 : : void
822 : 1076294 : release_vec_loop_controls (vec<rgroup_controls> *controls)
823 : : {
824 : 1076294 : rgroup_controls *rgc;
825 : 1076294 : unsigned int i;
826 : 1092741 : FOR_EACH_VEC_ELT (*controls, i, rgc)
827 : 16447 : rgc->controls.release ();
828 : 1076294 : controls->release ();
829 : 1076294 : }
830 : :
831 : : /* Free all memory used by the _loop_vec_info, as well as all the
832 : : stmt_vec_info structs of all the stmts in the loop. */
833 : :
834 : 415649 : _loop_vec_info::~_loop_vec_info ()
835 : : {
836 : 415649 : free (bbs);
837 : :
838 : 415649 : release_vec_loop_controls (&masks.rgc_vec);
839 : 415649 : release_vec_loop_controls (&lens);
840 : 419457 : delete ivexpr_map;
841 : 415971 : delete scan_map;
842 : 415649 : delete scalar_costs;
843 : 415649 : delete vector_costs;
844 : 558573 : for (auto reduc_info : reduc_infos)
845 : 139160 : delete reduc_info;
846 : :
847 : : /* When we release an epiloge vinfo that we do not intend to use
848 : : avoid clearing AUX of the main loop which should continue to
849 : : point to the main loop vinfo since otherwise we'll leak that. */
850 : 415649 : if (loop->aux == this)
851 : 60610 : loop->aux = NULL;
852 : 831298 : }
853 : :
854 : : /* Return an invariant or register for EXPR and emit necessary
855 : : computations in the LOOP_VINFO loop preheader. */
856 : :
857 : : tree
858 : 19584 : cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
859 : : {
860 : 19584 : if (is_gimple_reg (expr)
861 : 19584 : || is_gimple_min_invariant (expr))
862 : 6416 : return expr;
863 : :
864 : 13168 : if (! loop_vinfo->ivexpr_map)
865 : 3808 : loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
866 : 13168 : tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
867 : 13168 : if (! cached)
868 : : {
869 : 8548 : gimple_seq stmts = NULL;
870 : 8548 : cached = force_gimple_operand (unshare_expr (expr),
871 : : &stmts, true, NULL_TREE);
872 : 8548 : if (stmts)
873 : : {
874 : 8408 : edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
875 : 8408 : gsi_insert_seq_on_edge_immediate (e, stmts);
876 : : }
877 : : }
878 : 13168 : return cached;
879 : : }
880 : :
881 : : /* Return true if we can use CMP_TYPE as the comparison type to produce
882 : : all masks required to mask LOOP_VINFO. */
883 : :
884 : : static bool
885 : 73977 : can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
886 : : {
887 : 73977 : rgroup_controls *rgm;
888 : 73977 : unsigned int i;
889 : 86500 : FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
890 : 86500 : if (rgm->type != NULL_TREE
891 : 86500 : && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
892 : : cmp_type, rgm->type,
893 : : OPTIMIZE_FOR_SPEED))
894 : : return false;
895 : : return true;
896 : : }
897 : :
898 : : /* Calculate the maximum number of scalars per iteration for every
899 : : rgroup in LOOP_VINFO. */
900 : :
901 : : static unsigned int
902 : 15648 : vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
903 : : {
904 : 15648 : unsigned int res = 1;
905 : 15648 : unsigned int i;
906 : 15648 : rgroup_controls *rgm;
907 : 38750 : FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
908 : 23102 : res = MAX (res, rgm->max_nscalars_per_iter);
909 : 15648 : return res;
910 : : }
911 : :
912 : : /* Calculate the minimum precision necessary to represent:
913 : :
914 : : MAX_NITERS * FACTOR
915 : :
916 : : as an unsigned integer, where MAX_NITERS is the maximum number of
917 : : loop header iterations for the original scalar form of LOOP_VINFO. */
918 : :
919 : : static unsigned
920 : 15648 : vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
921 : : {
922 : 15648 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
923 : :
924 : : /* Get the maximum number of iterations that is representable
925 : : in the counter type. */
926 : 15648 : tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
927 : 15648 : widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
928 : :
929 : : /* Get a more refined estimate for the number of iterations. */
930 : 15648 : widest_int max_back_edges;
931 : 15648 : if (max_loop_iterations (loop, &max_back_edges))
932 : 15648 : max_ni = wi::smin (max_ni, max_back_edges + 1);
933 : :
934 : : /* Work out how many bits we need to represent the limit. */
935 : 15648 : return wi::min_precision (max_ni * factor, UNSIGNED);
936 : 15648 : }
937 : :
938 : : /* True if the loop needs peeling or partial vectors when vectorized. */
939 : :
940 : : bool
941 : 119909 : vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
942 : : {
943 : 119909 : unsigned HOST_WIDE_INT const_vf;
944 : 119909 : HOST_WIDE_INT max_niter
945 : 119909 : = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
946 : :
947 : 119909 : unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
948 : 119909 : if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
949 : 15309 : th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
950 : : (loop_vinfo));
951 : :
952 : 119909 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
953 : 54881 : && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
954 : : {
955 : : /* Work out the (constant) number of iterations that need to be
956 : : peeled for reasons other than niters. */
957 : 54843 : unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
958 : 54843 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
959 : 389 : peel_niter += 1;
960 : 118762 : if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
961 : : LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
962 : : return true;
963 : : }
964 : 65066 : else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
965 : : /* ??? When peeling for gaps but not alignment, we could
966 : : try to check whether the (variable) niters is known to be
967 : : VF * N + 1. That's something of a niche case though. */
968 : 64819 : || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
969 : 63922 : || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
970 : 128988 : || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
971 : 127844 : < (unsigned) exact_log2 (const_vf))
972 : : /* In case of versioning, check if the maximum number of
973 : : iterations is greater than th. If they are identical,
974 : : the epilogue is unnecessary. */
975 : 62801 : && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
976 : 4281 : || ((unsigned HOST_WIDE_INT) max_niter
977 : : /* We'd like to use LOOP_VINFO_VERSIONING_THRESHOLD
978 : : but that's only computed later based on our result.
979 : : The following is the most conservative approximation. */
980 : 4281 : > (std::max ((unsigned HOST_WIDE_INT) th,
981 : 4281 : const_vf) / const_vf) * const_vf))))
982 : 63919 : return true;
983 : :
984 : : return false;
985 : : }
986 : :
987 : : /* Each statement in LOOP_VINFO can be masked where necessary. Check
988 : : whether we can actually generate the masks required. Return true if so,
989 : : storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
990 : :
991 : : static bool
992 : 15648 : vect_verify_full_masking (loop_vec_info loop_vinfo)
993 : : {
994 : 15648 : unsigned int min_ni_width;
995 : :
996 : : /* Use a normal loop if there are no statements that need masking.
997 : : This only happens in rare degenerate cases: it means that the loop
998 : : has no loads, no stores, and no live-out values. */
999 : 15648 : if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1000 : : return false;
1001 : :
1002 : : /* Produce the rgroup controls. */
1003 : 61596 : for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1004 : : {
1005 : 22974 : vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1006 : 22974 : tree vectype = mask.first;
1007 : 22974 : unsigned nvectors = mask.second;
1008 : :
1009 : 30300 : if (masks->rgc_vec.length () < nvectors)
1010 : 17308 : masks->rgc_vec.safe_grow_cleared (nvectors, true);
1011 : 22974 : rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
1012 : : /* The number of scalars per iteration and the number of vectors are
1013 : : both compile-time constants. */
1014 : 22974 : unsigned int nscalars_per_iter
1015 : 22974 : = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1016 : 22974 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1017 : :
1018 : 22974 : if (rgm->max_nscalars_per_iter < nscalars_per_iter)
1019 : : {
1020 : 18865 : rgm->max_nscalars_per_iter = nscalars_per_iter;
1021 : 18865 : rgm->type = truth_type_for (vectype);
1022 : 18865 : rgm->factor = 1;
1023 : : }
1024 : : }
1025 : :
1026 : 15648 : unsigned int max_nscalars_per_iter
1027 : 15648 : = vect_get_max_nscalars_per_iter (loop_vinfo);
1028 : :
1029 : : /* Work out how many bits we need to represent the limit. */
1030 : 15648 : min_ni_width
1031 : 15648 : = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1032 : :
1033 : : /* Find a scalar mode for which WHILE_ULT is supported. */
1034 : 15648 : opt_scalar_int_mode cmp_mode_iter;
1035 : 15648 : tree cmp_type = NULL_TREE;
1036 : 15648 : tree iv_type = NULL_TREE;
1037 : 15648 : widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1038 : 15648 : unsigned int iv_precision = UINT_MAX;
1039 : :
1040 : 15648 : if (iv_limit != -1)
1041 : 15648 : iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1042 : : UNSIGNED);
1043 : :
1044 : 125184 : FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1045 : : {
1046 : 109536 : unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1047 : 109536 : if (cmp_bits >= min_ni_width
1048 : 109536 : && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1049 : : {
1050 : 73977 : tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1051 : 73977 : if (this_type
1052 : 73977 : && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1053 : : {
1054 : : /* Although we could stop as soon as we find a valid mode,
1055 : : there are at least two reasons why that's not always the
1056 : : best choice:
1057 : :
1058 : : - An IV that's Pmode or wider is more likely to be reusable
1059 : : in address calculations than an IV that's narrower than
1060 : : Pmode.
1061 : :
1062 : : - Doing the comparison in IV_PRECISION or wider allows
1063 : : a natural 0-based IV, whereas using a narrower comparison
1064 : : type requires mitigations against wrap-around.
1065 : :
1066 : : Conversely, if the IV limit is variable, doing the comparison
1067 : : in a wider type than the original type can introduce
1068 : : unnecessary extensions, so picking the widest valid mode
1069 : : is not always a good choice either.
1070 : :
1071 : : Here we prefer the first IV type that's Pmode or wider,
1072 : : and the first comparison type that's IV_PRECISION or wider.
1073 : : (The comparison type must be no wider than the IV type,
1074 : : to avoid extensions in the vector loop.)
1075 : :
1076 : : ??? We might want to try continuing beyond Pmode for ILP32
1077 : : targets if CMP_BITS < IV_PRECISION. */
1078 : 0 : iv_type = this_type;
1079 : 0 : if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1080 : : cmp_type = this_type;
1081 : 0 : if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1082 : : break;
1083 : : }
1084 : : }
1085 : : }
1086 : :
1087 : 15648 : if (!cmp_type)
1088 : : {
1089 : 15648 : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
1090 : 15648 : return false;
1091 : : }
1092 : :
1093 : 0 : LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1094 : 0 : LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1095 : 0 : LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
1096 : 0 : return true;
1097 : 15648 : }
1098 : :
1099 : : /* Each statement in LOOP_VINFO can be masked where necessary. Check
1100 : : whether we can actually generate AVX512 style masks. Return true if so,
1101 : : storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE. */
1102 : :
1103 : : static bool
1104 : 15648 : vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
1105 : : {
1106 : : /* Produce differently organized rgc_vec and differently check
1107 : : we can produce masks. */
1108 : :
1109 : : /* Use a normal loop if there are no statements that need masking.
1110 : : This only happens in rare degenerate cases: it means that the loop
1111 : : has no loads, no stores, and no live-out values. */
1112 : 15648 : if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1113 : : return false;
1114 : :
1115 : : /* For the decrementing IV we need to represent all values in
1116 : : [0, niter + niter_skip] where niter_skip is the elements we
1117 : : skip in the first iteration for prologue peeling. */
1118 : 15648 : tree iv_type = NULL_TREE;
1119 : 15648 : widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1120 : 15648 : unsigned int iv_precision = UINT_MAX;
1121 : 15648 : if (iv_limit != -1)
1122 : 15648 : iv_precision = wi::min_precision (iv_limit, UNSIGNED);
1123 : :
1124 : : /* First compute the type for the IV we use to track the remaining
1125 : : scalar iterations. */
1126 : 15648 : opt_scalar_int_mode cmp_mode_iter;
1127 : 19895 : FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1128 : : {
1129 : 19895 : unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1130 : 19895 : if (cmp_bits >= iv_precision
1131 : 19895 : && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1132 : : {
1133 : 15648 : iv_type = build_nonstandard_integer_type (cmp_bits, true);
1134 : 15648 : if (iv_type)
1135 : : break;
1136 : : }
1137 : : }
1138 : 15648 : if (!iv_type)
1139 : : return false;
1140 : :
1141 : : /* Produce the rgroup controls. */
1142 : 61596 : for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1143 : : {
1144 : 22974 : vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1145 : 22974 : tree vectype = mask.first;
1146 : 22974 : unsigned nvectors = mask.second;
1147 : :
1148 : : /* The number of scalars per iteration and the number of vectors are
1149 : : both compile-time constants. */
1150 : 22974 : unsigned int nscalars_per_iter
1151 : 22974 : = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1152 : 22974 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1153 : :
1154 : : /* We index the rgroup_controls vector with nscalars_per_iter
1155 : : which we keep constant and instead have a varying nvectors,
1156 : : remembering the vector mask with the fewest nV. */
1157 : 30300 : if (masks->rgc_vec.length () < nscalars_per_iter)
1158 : 15705 : masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true);
1159 : 22974 : rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];
1160 : :
1161 : 22974 : if (!rgm->type || rgm->factor > nvectors)
1162 : : {
1163 : 17257 : rgm->type = truth_type_for (vectype);
1164 : 17257 : rgm->compare_type = NULL_TREE;
1165 : 17257 : rgm->max_nscalars_per_iter = nscalars_per_iter;
1166 : 17257 : rgm->factor = nvectors;
1167 : 17257 : rgm->bias_adjusted_ctrl = NULL_TREE;
1168 : : }
1169 : : }
1170 : :
1171 : : /* There is no fixed compare type we are going to use but we have to
1172 : : be able to get at one for each mask group. */
1173 : 15648 : unsigned int min_ni_width
1174 : 15648 : = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);
1175 : :
1176 : 15648 : bool ok = true;
1177 : 60241 : for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1178 : : {
1179 : 16405 : tree mask_type = rgc.type;
1180 : 16405 : if (!mask_type)
1181 : 681 : continue;
1182 : :
1183 : : /* For now vect_get_loop_mask only supports integer mode masks
1184 : : when we need to split it. */
1185 : 15724 : if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_INT
1186 : 15724 : || TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
1187 : : {
1188 : : ok = false;
1189 : : break;
1190 : : }
1191 : :
1192 : : /* If iv_type is usable as compare type use that - we can elide the
1193 : : saturation in that case. */
1194 : 12620 : if (TYPE_PRECISION (iv_type) >= min_ni_width)
1195 : : {
1196 : 12620 : tree cmp_vectype
1197 : 12620 : = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
1198 : 12620 : if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1199 : 4463 : rgc.compare_type = cmp_vectype;
1200 : : }
1201 : 12620 : if (!rgc.compare_type)
1202 : 23847 : FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1203 : : {
1204 : 23843 : unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1205 : 23843 : if (cmp_bits >= min_ni_width
1206 : 23843 : && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1207 : : {
1208 : 23831 : tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
1209 : 23831 : if (!cmp_type)
1210 : 0 : continue;
1211 : :
1212 : : /* Check whether we can produce the mask with cmp_type. */
1213 : 23831 : tree cmp_vectype
1214 : 23831 : = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
1215 : 23831 : if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1216 : : {
1217 : 8153 : rgc.compare_type = cmp_vectype;
1218 : 8153 : break;
1219 : : }
1220 : : }
1221 : : }
1222 : 12620 : if (!rgc.compare_type)
1223 : : {
1224 : : ok = false;
1225 : : break;
1226 : : }
1227 : : }
1228 : 15648 : if (!ok)
1229 : : {
1230 : 3108 : release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
1231 : 3108 : return false;
1232 : : }
1233 : :
1234 : 12540 : LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
1235 : 12540 : LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1236 : 12540 : LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
1237 : 12540 : return true;
1238 : 15648 : }
1239 : :
1240 : : /* Check whether we can use vector access with length based on precison
1241 : : comparison. So far, to keep it simple, we only allow the case that the
1242 : : precision of the target supported length is larger than the precision
1243 : : required by loop niters. */
1244 : :
1245 : : static bool
1246 : 6 : vect_verify_loop_lens (loop_vec_info loop_vinfo)
1247 : : {
1248 : 6 : if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1249 : : return false;
1250 : :
1251 : 0 : if (!VECTOR_MODE_P (loop_vinfo->vector_mode))
1252 : : return false;
1253 : :
1254 : 0 : machine_mode len_load_mode, len_store_mode;
1255 : 0 : if (!get_len_load_store_mode (loop_vinfo->vector_mode, true)
1256 : 0 : .exists (&len_load_mode))
1257 : 0 : return false;
1258 : 0 : if (!get_len_load_store_mode (loop_vinfo->vector_mode, false)
1259 : 0 : .exists (&len_store_mode))
1260 : 0 : return false;
1261 : :
1262 : 0 : signed char partial_load_bias = internal_len_load_store_bias
1263 : 0 : (IFN_LEN_LOAD, len_load_mode);
1264 : :
1265 : 0 : signed char partial_store_bias = internal_len_load_store_bias
1266 : 0 : (IFN_LEN_STORE, len_store_mode);
1267 : :
1268 : 0 : gcc_assert (partial_load_bias == partial_store_bias);
1269 : :
1270 : 0 : if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1271 : : return false;
1272 : :
1273 : : /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1274 : : len_loads with a length of zero. In order to avoid that we prohibit
1275 : : more than one loop length here. */
1276 : 0 : if (partial_load_bias == -1
1277 : 0 : && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1278 : : return false;
1279 : :
1280 : 0 : LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1281 : :
1282 : 0 : unsigned int max_nitems_per_iter = 1;
1283 : 0 : unsigned int i;
1284 : 0 : rgroup_controls *rgl;
1285 : : /* Find the maximum number of items per iteration for every rgroup. */
1286 : 0 : FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1287 : : {
1288 : 0 : unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1289 : 0 : max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1290 : : }
1291 : :
1292 : : /* Work out how many bits we need to represent the length limit. */
1293 : 0 : unsigned int min_ni_prec
1294 : 0 : = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1295 : :
1296 : : /* Now use the maximum of below precisions for one suitable IV type:
1297 : : - the IV's natural precision
1298 : : - the precision needed to hold: the maximum number of scalar
1299 : : iterations multiplied by the scale factor (min_ni_prec above)
1300 : : - the Pmode precision
1301 : :
1302 : : If min_ni_prec is less than the precision of the current niters,
1303 : : we perfer to still use the niters type. Prefer to use Pmode and
1304 : : wider IV to avoid narrow conversions. */
1305 : :
1306 : 0 : unsigned int ni_prec
1307 : 0 : = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1308 : 0 : min_ni_prec = MAX (min_ni_prec, ni_prec);
1309 : 0 : min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1310 : :
1311 : 0 : tree iv_type = NULL_TREE;
1312 : 0 : opt_scalar_int_mode tmode_iter;
1313 : 0 : FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1314 : : {
1315 : 0 : scalar_mode tmode = tmode_iter.require ();
1316 : 0 : unsigned int tbits = GET_MODE_BITSIZE (tmode);
1317 : :
1318 : : /* ??? Do we really want to construct one IV whose precision exceeds
1319 : : BITS_PER_WORD? */
1320 : 0 : if (tbits > BITS_PER_WORD)
1321 : : break;
1322 : :
1323 : : /* Find the first available standard integral type. */
1324 : 0 : if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1325 : : {
1326 : 0 : iv_type = build_nonstandard_integer_type (tbits, true);
1327 : 0 : break;
1328 : : }
1329 : : }
1330 : :
1331 : 0 : if (!iv_type)
1332 : : {
1333 : 0 : if (dump_enabled_p ())
1334 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1335 : : "can't vectorize with length-based partial vectors"
1336 : : " because there is no suitable iv type.\n");
1337 : 0 : return false;
1338 : : }
1339 : :
1340 : 0 : LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1341 : 0 : LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1342 : 0 : LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;
1343 : :
1344 : 0 : return true;
1345 : : }
1346 : :
1347 : : /* Calculate the cost of one scalar iteration of the loop. */
1348 : : static void
1349 : 285495 : vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1350 : : {
1351 : 285495 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1352 : 285495 : basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1353 : 285495 : int nbbs = loop->num_nodes, factor;
1354 : 285495 : int innerloop_iters, i;
1355 : :
1356 : 285495 : DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1357 : :
1358 : : /* Gather costs for statements in the scalar loop. */
1359 : :
1360 : : /* FORNOW. */
1361 : 285495 : innerloop_iters = 1;
1362 : 285495 : if (loop->inner)
1363 : 1261 : innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1364 : :
1365 : 1015567 : for (i = 0; i < nbbs; i++)
1366 : : {
1367 : 730072 : gimple_stmt_iterator si;
1368 : 730072 : basic_block bb = bbs[i];
1369 : :
1370 : 730072 : if (bb->loop_father == loop->inner)
1371 : : factor = innerloop_iters;
1372 : : else
1373 : 727550 : factor = 1;
1374 : :
1375 : 5818886 : for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1376 : : {
1377 : 4358742 : gimple *stmt = gsi_stmt (si);
1378 : 4358742 : stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1379 : :
1380 : 4358742 : if (!is_gimple_assign (stmt)
1381 : : && !is_gimple_call (stmt)
1382 : : && !is_a<gcond *> (stmt))
1383 : 1579602 : continue;
1384 : :
1385 : : /* Skip stmts that are not vectorized inside the loop. */
1386 : 2779140 : stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1387 : 2779140 : if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1388 : 1182299 : && (!STMT_VINFO_LIVE_P (vstmt_info)
1389 : 59 : || !VECTORIZABLE_CYCLE_DEF
1390 : : (STMT_VINFO_DEF_TYPE (vstmt_info))))
1391 : 1182299 : continue;
1392 : :
1393 : 1596841 : vect_cost_for_stmt kind;
1394 : 1596841 : if (STMT_VINFO_DATA_REF (stmt_info))
1395 : : {
1396 : 676757 : if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1397 : : kind = scalar_load;
1398 : : else
1399 : 240072 : kind = scalar_store;
1400 : : }
1401 : 920084 : else if (vect_nop_conversion_p (stmt_info))
1402 : 40764 : continue;
1403 : : else
1404 : : kind = scalar_stmt;
1405 : :
1406 : : /* We are using vect_prologue here to avoid scaling twice
1407 : : by the inner loop factor. */
1408 : 1556077 : record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1409 : : factor, kind, stmt_info, 0, vect_prologue);
1410 : : }
1411 : : }
1412 : :
1413 : : /* Now accumulate cost. */
1414 : 285495 : loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1415 : 285495 : add_stmt_costs (loop_vinfo->scalar_costs,
1416 : : &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1417 : 285495 : loop_vinfo->scalar_costs->finish_cost (nullptr);
1418 : 285495 : }
1419 : :
1420 : : /* Function vect_analyze_loop_form.
1421 : :
1422 : : Verify that certain CFG restrictions hold, including:
1423 : : - the loop has a pre-header
1424 : : - the loop has a single entry
1425 : : - nested loops can have only a single exit.
1426 : : - the loop exit condition is simple enough
1427 : : - the number of iterations can be analyzed, i.e, a countable loop. The
1428 : : niter could be analyzed under some assumptions. */
1429 : :
1430 : : opt_result
1431 : 466008 : vect_analyze_loop_form (class loop *loop, gimple *loop_vectorized_call,
1432 : : vect_loop_form_info *info)
1433 : : {
1434 : 466008 : DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1435 : :
1436 : 466008 : edge exit_e = vec_init_loop_exit_info (loop);
1437 : 466008 : if (!exit_e)
1438 : 58667 : return opt_result::failure_at (vect_location,
1439 : : "not vectorized:"
1440 : : " could not determine main exit from"
1441 : : " loop with multiple exits.\n");
1442 : 407341 : if (loop_vectorized_call)
1443 : : {
1444 : 26236 : tree arg = gimple_call_arg (loop_vectorized_call, 1);
1445 : 26236 : class loop *scalar_loop = get_loop (cfun, tree_to_shwi (arg));
1446 : 26236 : edge scalar_exit_e = vec_init_loop_exit_info (scalar_loop);
1447 : 26236 : if (!scalar_exit_e)
1448 : 0 : return opt_result::failure_at (vect_location,
1449 : : "not vectorized:"
1450 : : " could not determine main exit from"
1451 : : " loop with multiple exits.\n");
1452 : : }
1453 : :
1454 : 407341 : info->loop_exit = exit_e;
1455 : 407341 : if (dump_enabled_p ())
1456 : 15261 : dump_printf_loc (MSG_NOTE, vect_location,
1457 : : "using as main loop exit: %d -> %d [AUX: %p]\n",
1458 : 15261 : exit_e->src->index, exit_e->dest->index, exit_e->aux);
1459 : :
1460 : : /* Check if we have any control flow that doesn't leave the loop. */
1461 : 407341 : basic_block *bbs = get_loop_body (loop);
1462 : 1364116 : for (unsigned i = 0; i < loop->num_nodes; i++)
1463 : 1065225 : if (EDGE_COUNT (bbs[i]->succs) != 1
1464 : 1065225 : && (EDGE_COUNT (bbs[i]->succs) != 2
1465 : 630104 : || !loop_exits_from_bb_p (bbs[i]->loop_father, bbs[i])))
1466 : : {
1467 : 108450 : free (bbs);
1468 : 108450 : return opt_result::failure_at (vect_location,
1469 : : "not vectorized:"
1470 : : " unsupported control flow in loop.\n");
1471 : : }
1472 : 298891 : free (bbs);
1473 : :
1474 : : /* Different restrictions apply when we are considering an inner-most loop,
1475 : : vs. an outer (nested) loop.
1476 : : (FORNOW. May want to relax some of these restrictions in the future). */
1477 : :
1478 : 298891 : info->inner_loop_cond = NULL;
1479 : 298891 : if (!loop->inner)
1480 : : {
1481 : : /* Inner-most loop. */
1482 : :
1483 : 277379 : if (empty_block_p (loop->header))
1484 : 3 : return opt_result::failure_at (vect_location,
1485 : : "not vectorized: empty loop.\n");
1486 : : }
1487 : : else
1488 : : {
1489 : 21512 : class loop *innerloop = loop->inner;
1490 : 21512 : edge entryedge;
1491 : :
1492 : : /* Nested loop. We currently require that the loop is doubly-nested,
1493 : : contains a single inner loop with a single exit to the block
1494 : : with the single exit condition in the outer loop.
1495 : : Vectorizable outer-loops look like this:
1496 : :
1497 : : (pre-header)
1498 : : |
1499 : : header <---+
1500 : : | |
1501 : : inner-loop |
1502 : : | |
1503 : : tail ------+
1504 : : |
1505 : : (exit-bb)
1506 : :
1507 : : The inner-loop also has the properties expected of inner-most loops
1508 : : as described above. */
1509 : :
1510 : 21512 : if ((loop->inner)->inner || (loop->inner)->next)
1511 : 2977 : return opt_result::failure_at (vect_location,
1512 : : "not vectorized:"
1513 : : " multiple nested loops.\n");
1514 : :
1515 : 18535 : entryedge = loop_preheader_edge (innerloop);
1516 : 18535 : if (entryedge->src != loop->header
1517 : 18185 : || !single_exit (innerloop)
1518 : 29735 : || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1519 : 7620 : return opt_result::failure_at (vect_location,
1520 : : "not vectorized:"
1521 : : " unsupported outerloop form.\n");
1522 : :
1523 : : /* Analyze the inner-loop. */
1524 : 10915 : vect_loop_form_info inner;
1525 : 10915 : opt_result res = vect_analyze_loop_form (loop->inner, NULL, &inner);
1526 : 10915 : if (!res)
1527 : : {
1528 : 1182 : if (dump_enabled_p ())
1529 : 5 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1530 : : "not vectorized: Bad inner loop.\n");
1531 : 1182 : return res;
1532 : : }
1533 : :
1534 : : /* Don't support analyzing niter under assumptions for inner
1535 : : loop. */
1536 : 9733 : if (!integer_onep (inner.assumptions))
1537 : 283 : return opt_result::failure_at (vect_location,
1538 : : "not vectorized: Bad inner loop.\n");
1539 : :
1540 : 9450 : if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1541 : 1086 : return opt_result::failure_at (vect_location,
1542 : : "not vectorized: inner-loop count not"
1543 : : " invariant.\n");
1544 : :
1545 : 8364 : if (dump_enabled_p ())
1546 : 946 : dump_printf_loc (MSG_NOTE, vect_location,
1547 : : "Considering outer-loop vectorization.\n");
1548 : 8364 : info->inner_loop_cond = inner.conds[0];
1549 : 10915 : }
1550 : :
1551 : 285740 : if (EDGE_COUNT (loop->header->preds) != 2)
1552 : 0 : return opt_result::failure_at (vect_location,
1553 : : "not vectorized:"
1554 : : " too many incoming edges.\n");
1555 : :
1556 : : /* We assume that the latch is empty. */
1557 : 285740 : basic_block latch = loop->latch;
1558 : 285740 : do
1559 : : {
1560 : 285740 : if (!empty_block_p (latch)
1561 : 285740 : || !gimple_seq_empty_p (phi_nodes (latch)))
1562 : 20038 : return opt_result::failure_at (vect_location,
1563 : : "not vectorized: latch block not "
1564 : : "empty.\n");
1565 : 265702 : latch = single_pred (latch);
1566 : : }
1567 : 531404 : while (single_succ_p (latch));
1568 : :
1569 : : /* Make sure there is no abnormal exit. */
1570 : 265702 : auto_vec<edge> exits = get_loop_exit_edges (loop);
1571 : 1172391 : for (edge e : exits)
1572 : : {
1573 : 375329 : if (e->flags & EDGE_ABNORMAL)
1574 : 44 : return opt_result::failure_at (vect_location,
1575 : : "not vectorized:"
1576 : : " abnormal loop exit edge.\n");
1577 : : }
1578 : :
1579 : 265658 : info->conds
1580 : 265658 : = vect_get_loop_niters (loop, exit_e, &info->assumptions,
1581 : : &info->number_of_iterations,
1582 : 265658 : &info->number_of_iterationsm1);
1583 : 265658 : if (info->conds.is_empty ())
1584 : 36 : return opt_result::failure_at
1585 : 36 : (vect_location,
1586 : : "not vectorized: complicated exit condition.\n");
1587 : :
1588 : : /* Determine what the primary and alternate exit conds are. */
1589 : 630745 : for (unsigned i = 0; i < info->conds.length (); i++)
1590 : : {
1591 : 365123 : gcond *cond = info->conds[i];
1592 : 365123 : if (exit_e->src == gimple_bb (cond))
1593 : 265622 : std::swap (info->conds[0], info->conds[i]);
1594 : : }
1595 : :
1596 : 265622 : if (integer_zerop (info->assumptions)
1597 : 265622 : || !info->number_of_iterations
1598 : 531244 : || chrec_contains_undetermined (info->number_of_iterations))
1599 : 40821 : return opt_result::failure_at
1600 : 40821 : (info->conds[0],
1601 : : "not vectorized: number of iterations cannot be computed.\n");
1602 : :
1603 : 224801 : if (integer_zerop (info->number_of_iterations))
1604 : 14 : return opt_result::failure_at
1605 : 14 : (info->conds[0],
1606 : : "not vectorized: number of iterations = 0.\n");
1607 : :
1608 : 224787 : if (!(tree_fits_shwi_p (info->number_of_iterations)
1609 : 121097 : && tree_to_shwi (info->number_of_iterations) > 0))
1610 : : {
1611 : 103690 : if (dump_enabled_p ())
1612 : : {
1613 : 2401 : dump_printf_loc (MSG_NOTE, vect_location,
1614 : : "Symbolic number of iterations is ");
1615 : 2401 : dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1616 : 2401 : dump_printf (MSG_NOTE, "\n");
1617 : : }
1618 : : }
1619 : :
1620 : 224787 : if (!integer_onep (info->assumptions))
1621 : : {
1622 : 10338 : if (dump_enabled_p ())
1623 : : {
1624 : 63 : dump_printf_loc (MSG_NOTE, vect_location,
1625 : : "Loop to be versioned with niter assumption ");
1626 : 63 : dump_generic_expr (MSG_NOTE, TDF_SLIM, info->assumptions);
1627 : 63 : dump_printf (MSG_NOTE, "\n");
1628 : : }
1629 : : }
1630 : :
1631 : 224787 : return opt_result::success ();
1632 : 265702 : }
1633 : :
1634 : : /* Create a loop_vec_info for LOOP with SHARED and the
1635 : : vect_analyze_loop_form result. */
1636 : :
1637 : : loop_vec_info
1638 : 415649 : vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1639 : : const vect_loop_form_info *info,
1640 : : loop_vec_info orig_loop_info)
1641 : : {
1642 : 415649 : loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1643 : 415649 : LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1644 : 415649 : LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1645 : 415649 : LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1646 : 415649 : LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_info;
1647 : 415649 : if (orig_loop_info && LOOP_VINFO_EPILOGUE_P (orig_loop_info))
1648 : 171 : LOOP_VINFO_MAIN_LOOP_INFO (loop_vinfo)
1649 : 171 : = LOOP_VINFO_MAIN_LOOP_INFO (orig_loop_info);
1650 : : else
1651 : 415478 : LOOP_VINFO_MAIN_LOOP_INFO (loop_vinfo) = orig_loop_info;
1652 : : /* Also record the assumptions for versioning. */
1653 : 415649 : if (!integer_onep (info->assumptions) && !orig_loop_info)
1654 : 20574 : LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1655 : :
1656 : 1882091 : for (gcond *cond : info->conds)
1657 : : {
1658 : 635144 : stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (cond);
1659 : : /* Mark the statement as a condition. */
1660 : 635144 : STMT_VINFO_DEF_TYPE (loop_cond_info) = vect_condition_def;
1661 : : }
1662 : :
1663 : 635144 : for (unsigned i = 1; i < info->conds.length (); i ++)
1664 : 219495 : LOOP_VINFO_LOOP_CONDS (loop_vinfo).safe_push (info->conds[i]);
1665 : 415649 : LOOP_VINFO_LOOP_IV_COND (loop_vinfo) = info->conds[0];
1666 : :
1667 : 415649 : LOOP_VINFO_IV_EXIT (loop_vinfo) = info->loop_exit;
1668 : :
1669 : : /* Check to see if we're vectorizing multiple exits. */
1670 : 415649 : LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
1671 : 415649 : = !LOOP_VINFO_LOOP_CONDS (loop_vinfo).is_empty ();
1672 : :
1673 : 415649 : if (info->inner_loop_cond)
1674 : : {
1675 : : /* If we have an estimate on the number of iterations of the inner
1676 : : loop use that to limit the scale for costing, otherwise use
1677 : : --param vect-inner-loop-cost-factor literally. */
1678 : 8461 : widest_int nit;
1679 : 8461 : if (estimated_stmt_executions (loop->inner, &nit))
1680 : 7214 : LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1681 : 7214 : = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1682 : 8461 : }
1683 : :
1684 : 415649 : return loop_vinfo;
1685 : : }
1686 : :
1687 : :
1688 : :
1689 : : /* Return true if we know that the iteration count is smaller than the
1690 : : vectorization factor. Return false if it isn't, or if we can't be sure
1691 : : either way. */
1692 : :
1693 : : static bool
1694 : 112272 : vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
1695 : : {
1696 : 112272 : unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1697 : :
1698 : 112272 : HOST_WIDE_INT max_niter;
1699 : 112272 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1700 : 53487 : max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1701 : : else
1702 : 58785 : max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1703 : :
1704 : 112272 : if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1705 : 10616 : return true;
1706 : :
1707 : : return false;
1708 : : }
1709 : :
1710 : : /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1711 : : is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1712 : : definitely no, or -1 if it's worth retrying. */
1713 : :
1714 : : static int
1715 : 112280 : vect_analyze_loop_costing (loop_vec_info loop_vinfo,
1716 : : unsigned *suggested_unroll_factor)
1717 : : {
1718 : 112280 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1719 : 112280 : unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1720 : :
1721 : : /* Only loops that can handle partially-populated vectors can have iteration
1722 : : counts less than the vectorization factor. */
1723 : 112280 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
1724 : 112280 : && vect_known_niters_smaller_than_vf (loop_vinfo))
1725 : : {
1726 : 10606 : if (dump_enabled_p ())
1727 : 236 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1728 : : "not vectorized: iteration count smaller than "
1729 : : "vectorization factor.\n");
1730 : 10606 : return 0;
1731 : : }
1732 : :
1733 : : /* If we know the number of iterations we can do better, for the
1734 : : epilogue we can also decide whether the main loop leaves us
1735 : : with enough iterations, prefering a smaller vector epilog then
1736 : : also possibly used for the case we skip the vector loop. */
1737 : 101674 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1738 : : {
1739 : 44109 : widest_int scalar_niters
1740 : 44109 : = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
1741 : 44109 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1742 : : {
1743 : 2671 : loop_vec_info orig_loop_vinfo
1744 : : = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1745 : 2671 : loop_vec_info main_loop_vinfo
1746 : : = LOOP_VINFO_MAIN_LOOP_INFO (loop_vinfo);
1747 : 2671 : unsigned lowest_vf
1748 : 2671 : = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
1749 : 2671 : int prolog_peeling = 0;
1750 : 2671 : if (!vect_use_loop_mask_for_alignment_p (main_loop_vinfo))
1751 : 2671 : prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (main_loop_vinfo);
1752 : 2671 : if (prolog_peeling >= 0
1753 : 2671 : && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
1754 : : lowest_vf))
1755 : : {
1756 : 5332 : unsigned gap
1757 : 2666 : = LOOP_VINFO_PEELING_FOR_GAPS (main_loop_vinfo) ? 1 : 0;
1758 : 5332 : scalar_niters = ((scalar_niters - gap - prolog_peeling)
1759 : 5332 : % lowest_vf + gap);
1760 : : }
1761 : : }
1762 : : /* Reject vectorizing for a single scalar iteration, even if
1763 : : we could in principle implement that using partial vectors. */
1764 : 44109 : unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
1765 : 44109 : if (scalar_niters <= peeling_gap + 1)
1766 : : {
1767 : 784 : if (dump_enabled_p ())
1768 : 168 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1769 : : "not vectorized: loop only has a single "
1770 : : "scalar iteration.\n");
1771 : 784 : return 0;
1772 : : }
1773 : :
1774 : 43325 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1775 : : {
1776 : : /* Check that the loop processes at least one full vector. */
1777 : 43314 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1778 : 43314 : if (known_lt (scalar_niters, vf))
1779 : : {
1780 : 361 : if (dump_enabled_p ())
1781 : 293 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1782 : : "loop does not have enough iterations "
1783 : : "to support vectorization.\n");
1784 : 401 : return 0;
1785 : : }
1786 : :
1787 : : /* If we need to peel an extra epilogue iteration to handle data
1788 : : accesses with gaps, check that there are enough scalar iterations
1789 : : available.
1790 : :
1791 : : The check above is redundant with this one when peeling for gaps,
1792 : : but the distinction is useful for diagnostics. */
1793 : 42953 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1794 : 43243 : && known_le (scalar_niters, vf))
1795 : : {
1796 : 40 : if (dump_enabled_p ())
1797 : 9 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1798 : : "loop does not have enough iterations "
1799 : : "to support peeling for gaps.\n");
1800 : 40 : return 0;
1801 : : }
1802 : : }
1803 : 44109 : }
1804 : :
1805 : : /* If using the "very cheap" model. reject cases in which we'd keep
1806 : : a copy of the scalar code (even if we might be able to vectorize it). */
1807 : 100489 : if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
1808 : 100489 : && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1809 : 50658 : || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)))
1810 : : {
1811 : 721 : if (dump_enabled_p ())
1812 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1813 : : "some scalar iterations would need to be peeled\n");
1814 : 721 : return 0;
1815 : : }
1816 : :
1817 : 99768 : int min_profitable_iters, min_profitable_estimate;
1818 : 99768 : vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1819 : : &min_profitable_estimate,
1820 : : suggested_unroll_factor);
1821 : :
1822 : 99768 : if (min_profitable_iters < 0)
1823 : : {
1824 : 25793 : if (dump_enabled_p ())
1825 : 22 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1826 : : "not vectorized: vectorization not profitable.\n");
1827 : 25793 : if (dump_enabled_p ())
1828 : 22 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1829 : : "not vectorized: vector version will never be "
1830 : : "profitable.\n");
1831 : 25793 : return -1;
1832 : : }
1833 : :
1834 : 73975 : int min_scalar_loop_bound = (param_min_vect_loop_bound
1835 : 73975 : * assumed_vf);
1836 : :
1837 : : /* Use the cost model only if it is more conservative than user specified
1838 : : threshold. */
1839 : 73975 : unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1840 : : min_profitable_iters);
1841 : :
1842 : 73975 : LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1843 : :
1844 : 37751 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1845 : 111726 : && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1846 : : {
1847 : 410 : if (dump_enabled_p ())
1848 : 1 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1849 : : "not vectorized: vectorization not profitable.\n");
1850 : 410 : if (dump_enabled_p ())
1851 : 1 : dump_printf_loc (MSG_NOTE, vect_location,
1852 : : "not vectorized: iteration count smaller than user "
1853 : : "specified loop bound parameter or minimum profitable "
1854 : : "iterations (whichever is more conservative).\n");
1855 : 410 : return 0;
1856 : : }
1857 : :
1858 : : /* The static profitablity threshold min_profitable_estimate includes
1859 : : the cost of having to check at runtime whether the scalar loop
1860 : : should be used instead. If it turns out that we don't need or want
1861 : : such a check, the threshold we should use for the static estimate
1862 : : is simply the point at which the vector loop becomes more profitable
1863 : : than the scalar loop. */
1864 : 73565 : if (min_profitable_estimate > min_profitable_iters
1865 : 15471 : && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1866 : 15001 : && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1867 : 297 : && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1868 : 73862 : && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1869 : : {
1870 : 11 : if (dump_enabled_p ())
1871 : 6 : dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1872 : : " choice between the scalar and vector loops\n");
1873 : 11 : min_profitable_estimate = min_profitable_iters;
1874 : : }
1875 : :
1876 : : /* If the vector loop needs multiple iterations to be beneficial then
1877 : : things are probably too close to call, and the conservative thing
1878 : : would be to stick with the scalar code. */
1879 : 73565 : if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
1880 : 73565 : && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
1881 : : {
1882 : 8402 : if (dump_enabled_p ())
1883 : 177 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1884 : : "one iteration of the vector loop would be"
1885 : : " more expensive than the equivalent number of"
1886 : : " iterations of the scalar loop\n");
1887 : 8402 : return 0;
1888 : : }
1889 : :
1890 : 65163 : HOST_WIDE_INT estimated_niter;
1891 : :
1892 : : /* If we are vectorizing an epilogue then we know the maximum number of
1893 : : scalar iterations it will cover is at least one lower than the
1894 : : vectorization factor of the main loop. */
1895 : 65163 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1896 : 10716 : estimated_niter
1897 : 10716 : = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1898 : : else
1899 : : {
1900 : 54447 : estimated_niter = estimated_stmt_executions_int (loop);
1901 : 54447 : if (estimated_niter == -1)
1902 : 20474 : estimated_niter = likely_max_stmt_executions_int (loop);
1903 : : }
1904 : 31190 : if (estimated_niter != -1
1905 : 63550 : && ((unsigned HOST_WIDE_INT) estimated_niter
1906 : 63550 : < MAX (th, (unsigned) min_profitable_estimate)))
1907 : : {
1908 : 4345 : if (dump_enabled_p ())
1909 : 28 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1910 : : "not vectorized: estimated iteration count too "
1911 : : "small.\n");
1912 : 4345 : if (dump_enabled_p ())
1913 : 28 : dump_printf_loc (MSG_NOTE, vect_location,
1914 : : "not vectorized: estimated iteration count smaller "
1915 : : "than specified loop bound parameter or minimum "
1916 : : "profitable iterations (whichever is more "
1917 : : "conservative).\n");
1918 : 4345 : return -1;
1919 : : }
1920 : :
1921 : : return 1;
1922 : : }
1923 : :
1924 : : static opt_result
1925 : 222245 : vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1926 : : vec<data_reference_p> *datarefs)
1927 : : {
1928 : 680330 : for (unsigned i = 0; i < loop->num_nodes; i++)
1929 : 1005730 : for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1930 : 3783992 : !gsi_end_p (gsi); gsi_next (&gsi))
1931 : : {
1932 : 3325907 : gimple *stmt = gsi_stmt (gsi);
1933 : 3325907 : if (is_gimple_debug (stmt))
1934 : 1221613 : continue;
1935 : 2104422 : opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
1936 : : NULL, 0);
1937 : 2104422 : if (!res)
1938 : : {
1939 : 44908 : if (is_gimple_call (stmt) && loop->safelen)
1940 : : {
1941 : 398 : tree fndecl = gimple_call_fndecl (stmt), op;
1942 : 398 : if (fndecl == NULL_TREE
1943 : 398 : && gimple_call_internal_p (stmt, IFN_MASK_CALL))
1944 : : {
1945 : 0 : fndecl = gimple_call_arg (stmt, 0);
1946 : 0 : gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
1947 : 0 : fndecl = TREE_OPERAND (fndecl, 0);
1948 : 0 : gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
1949 : : }
1950 : 398 : if (fndecl != NULL_TREE)
1951 : : {
1952 : 361 : cgraph_node *node = cgraph_node::get (fndecl);
1953 : 361 : if (node != NULL && node->simd_clones != NULL)
1954 : : {
1955 : 129 : unsigned int j, n = gimple_call_num_args (stmt);
1956 : 539 : for (j = 0; j < n; j++)
1957 : : {
1958 : 282 : op = gimple_call_arg (stmt, j);
1959 : 282 : if (DECL_P (op)
1960 : 282 : || (REFERENCE_CLASS_P (op)
1961 : 0 : && get_base_address (op)))
1962 : : break;
1963 : : }
1964 : 129 : op = gimple_call_lhs (stmt);
1965 : : /* Ignore #pragma omp declare simd functions
1966 : : if they don't have data references in the
1967 : : call stmt itself. */
1968 : 257 : if (j == n
1969 : 129 : && !(op
1970 : 118 : && (DECL_P (op)
1971 : 118 : || (REFERENCE_CLASS_P (op)
1972 : 0 : && get_base_address (op)))))
1973 : 128 : continue;
1974 : : }
1975 : : }
1976 : : }
1977 : 44780 : return res;
1978 : : }
1979 : : /* If dependence analysis will give up due to the limit on the
1980 : : number of datarefs stop here and fail fatally. */
1981 : 3615914 : if (datarefs->length ()
1982 : 1556400 : > (unsigned)param_loop_max_datarefs_for_datadeps)
1983 : 0 : return opt_result::failure_at (stmt, "exceeded param "
1984 : : "loop-max-datarefs-for-datadeps\n");
1985 : : }
1986 : 177465 : return opt_result::success ();
1987 : : }
1988 : :
1989 : : /* Determine if operating on full vectors for LOOP_VINFO might leave
1990 : : some scalar iterations still to do. If so, decide how we should
1991 : : handle those scalar iterations. The possibilities are:
1992 : :
1993 : : (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
1994 : : In this case:
1995 : :
1996 : : LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
1997 : : LOOP_VINFO_PEELING_FOR_NITER == false
1998 : :
1999 : : (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2000 : : to handle the remaining scalar iterations. In this case:
2001 : :
2002 : : LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2003 : : LOOP_VINFO_PEELING_FOR_NITER == true
2004 : :
2005 : : The MASKED_P argument specifies to what extent
2006 : : param_vect_partial_vector_usage is to be honored. For MASKED_P == 0
2007 : : no partial vectors are to be used, for MASKED_P == -1 it's
2008 : : param_vect_partial_vector_usage that gets to decide whether we may
2009 : : consider partial vector usage. For MASKED_P == 1 partial vectors
2010 : : may be used if possible.
2011 : :
2012 : : */
2013 : :
2014 : : static opt_result
2015 : 113067 : vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
2016 : : int masked_p)
2017 : : {
2018 : : /* Determine whether there would be any scalar iterations left over. */
2019 : 113067 : bool need_peeling_or_partial_vectors_p
2020 : 113067 : = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2021 : :
2022 : : /* Decide whether to vectorize the loop with partial vectors. */
2023 : 113067 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2024 : 113067 : if (masked_p == 0
2025 : 113067 : || (masked_p == -1 && param_vect_partial_vector_usage == 0))
2026 : : /* If requested explicitly do not use partial vectors. */
2027 : : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2028 : 113 : else if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2029 : 39 : && LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo))
2030 : 0 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2031 : 113 : else if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2032 : 39 : && need_peeling_or_partial_vectors_p)
2033 : : {
2034 : : /* For partial-vector-usage=1, try to push the handling of partial
2035 : : vectors to the epilogue, with the main loop continuing to operate
2036 : : on full vectors.
2037 : :
2038 : : If we are unrolling we also do not want to use partial vectors. This
2039 : : is to avoid the overhead of generating multiple masks and also to
2040 : : avoid having to execute entire iterations of FALSE masked instructions
2041 : : when dealing with one or less full iterations.
2042 : :
2043 : : ??? We could then end up failing to use partial vectors if we
2044 : : decide to peel iterations into a prologue, and if the main loop
2045 : : then ends up processing fewer than VF iterations. */
2046 : 32 : if ((param_vect_partial_vector_usage == 1
2047 : 8 : || loop_vinfo->suggested_unroll_factor > 1)
2048 : 24 : && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2049 : 49 : && !vect_known_niters_smaller_than_vf (loop_vinfo))
2050 : : ;
2051 : : else
2052 : 25 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2053 : : }
2054 : :
2055 : 113067 : if (LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo)
2056 : 0 : && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2057 : 0 : return opt_result::failure_at (vect_location,
2058 : : "not vectorized: loop needs but cannot "
2059 : : "use partial vectors\n");
2060 : :
2061 : 113067 : if (dump_enabled_p ())
2062 : 11500 : dump_printf_loc (MSG_NOTE, vect_location,
2063 : : "operating on %s vectors%s.\n",
2064 : 11500 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2065 : : ? "partial" : "full",
2066 : 11500 : LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2067 : : ? " for epilogue loop" : "");
2068 : :
2069 : 113067 : LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2070 : 226134 : = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2071 : 113067 : && need_peeling_or_partial_vectors_p);
2072 : :
2073 : 113067 : return opt_result::success ();
2074 : : }
2075 : :
2076 : : /* Function vect_analyze_loop_2.
2077 : :
2078 : : Apply a set of analyses on LOOP specified by LOOP_VINFO, the different
2079 : : analyses will record information in some members of LOOP_VINFO. FATAL
2080 : : indicates if some analysis meets fatal error. If one non-NULL pointer
2081 : : SUGGESTED_UNROLL_FACTOR is provided, it's intent to be filled with one
2082 : : worked out suggested unroll factor, while one NULL pointer shows it's
2083 : : going to apply the suggested unroll factor.
2084 : : SINGLE_LANE_SLP_DONE_FOR_SUGGESTED_UF is to hold whether single-lane
2085 : : slp was forced when the suggested unroll factor was worked out. */
2086 : : static opt_result
2087 : 414948 : vect_analyze_loop_2 (loop_vec_info loop_vinfo, int masked_p, bool &fatal,
2088 : : unsigned *suggested_unroll_factor,
2089 : : bool& single_lane_slp_done_for_suggested_uf)
2090 : : {
2091 : 414948 : opt_result ok = opt_result::success ();
2092 : 414948 : int res;
2093 : 414948 : unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2094 : 414948 : loop_vec_info orig_loop_vinfo = NULL;
2095 : :
2096 : : /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2097 : : loop_vec_info of the first vectorized loop. */
2098 : 414948 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2099 : 17675 : orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2100 : : else
2101 : : orig_loop_vinfo = loop_vinfo;
2102 : 17675 : gcc_assert (orig_loop_vinfo);
2103 : :
2104 : : /* The first group of checks is independent of the vector size. */
2105 : 414948 : fatal = true;
2106 : :
2107 : 414948 : if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2108 : 414948 : && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2109 : 5 : return opt_result::failure_at (vect_location,
2110 : : "not vectorized: simd if(0)\n");
2111 : :
2112 : : /* Find all data references in the loop (which correspond to vdefs/vuses)
2113 : : and analyze their evolution in the loop. */
2114 : :
2115 : 414943 : loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2116 : :
2117 : : /* Gather the data references and count stmts in the loop. */
2118 : 414943 : if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2119 : : {
2120 : 222245 : opt_result res
2121 : 222245 : = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2122 : : &LOOP_VINFO_DATAREFS (loop_vinfo));
2123 : 222245 : if (!res)
2124 : : {
2125 : 44780 : if (dump_enabled_p ())
2126 : 1561 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2127 : : "not vectorized: loop contains function "
2128 : : "calls or data references that cannot "
2129 : : "be analyzed\n");
2130 : 44780 : return res;
2131 : : }
2132 : 177465 : loop_vinfo->shared->save_datarefs ();
2133 : : }
2134 : : else
2135 : 192698 : loop_vinfo->shared->check_datarefs ();
2136 : :
2137 : : /* Analyze the data references and also adjust the minimal
2138 : : vectorization factor according to the loads and stores. */
2139 : :
2140 : 370163 : ok = vect_analyze_data_refs (loop_vinfo, &fatal);
2141 : 370163 : if (!ok)
2142 : : {
2143 : 49752 : if (dump_enabled_p ())
2144 : 987 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2145 : : "bad data references.\n");
2146 : 49752 : return ok;
2147 : : }
2148 : :
2149 : : /* Check if we are applying unroll factor now. */
2150 : 320411 : bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2151 : 320411 : gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2152 : :
2153 : : /* When single-lane SLP was forced and we are applying suggested unroll
2154 : : factor, keep that decision here. */
2155 : 640822 : bool force_single_lane = (applying_suggested_uf
2156 : 320411 : && single_lane_slp_done_for_suggested_uf);
2157 : :
2158 : : /* Classify all cross-iteration scalar data-flow cycles.
2159 : : Cross-iteration cycles caused by virtual phis are analyzed separately. */
2160 : 320411 : vect_analyze_scalar_cycles (loop_vinfo);
2161 : :
2162 : 320411 : vect_pattern_recog (loop_vinfo);
2163 : :
2164 : : /* Analyze the access patterns of the data-refs in the loop (consecutive,
2165 : : complex, etc.). FORNOW: Only handle consecutive access pattern. */
2166 : :
2167 : 320411 : ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2168 : 320411 : if (!ok)
2169 : : {
2170 : 7058 : if (dump_enabled_p ())
2171 : 262 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2172 : : "bad data access.\n");
2173 : 7058 : return ok;
2174 : : }
2175 : :
2176 : : /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2177 : :
2178 : 313353 : ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2179 : 313353 : if (!ok)
2180 : : {
2181 : 13520 : if (dump_enabled_p ())
2182 : 304 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2183 : : "unexpected pattern.\n");
2184 : 13520 : return ok;
2185 : : }
2186 : :
2187 : : /* While the rest of the analysis below depends on it in some way. */
2188 : 299833 : fatal = false;
2189 : :
2190 : : /* Analyze data dependences between the data-refs in the loop
2191 : : and adjust the maximum vectorization factor according to
2192 : : the dependences.
2193 : : FORNOW: fail at the first data dependence that we encounter. */
2194 : :
2195 : 299833 : ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2196 : 299833 : if (!ok)
2197 : : {
2198 : 14338 : if (dump_enabled_p ())
2199 : 372 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2200 : : "bad data dependence.\n");
2201 : 14338 : return ok;
2202 : : }
2203 : 285495 : LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2204 : :
2205 : : /* Compute the scalar iteration cost. */
2206 : 285495 : vect_compute_single_scalar_iteration_cost (loop_vinfo);
2207 : :
2208 : 285495 : bool saved_can_use_partial_vectors_p
2209 : : = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2210 : :
2211 : : /* This is the point where we can re-start analysis with single-lane
2212 : : SLP forced. */
2213 : 406439 : start_over:
2214 : :
2215 : : /* Check the SLP opportunities in the loop, analyze and build
2216 : : SLP trees. */
2217 : 812878 : ok = vect_analyze_slp (loop_vinfo, loop_vinfo->stmt_vec_infos.length (),
2218 : : force_single_lane);
2219 : 406439 : if (!ok)
2220 : 26734 : return ok;
2221 : :
2222 : : /* If there are any SLP instances mark them as pure_slp and compute
2223 : : the overall vectorization factor. */
2224 : 379705 : if (!vect_make_slp_decision (loop_vinfo))
2225 : 38683 : return opt_result::failure_at (vect_location, "no stmts to vectorize.\n");
2226 : :
2227 : 341022 : if (dump_enabled_p ())
2228 : 17790 : dump_printf_loc (MSG_NOTE, vect_location, "Loop contains only SLP stmts\n");
2229 : :
2230 : : /* Dump the vectorization factor from the SLP decision. */
2231 : 341022 : if (dump_enabled_p ())
2232 : : {
2233 : 17790 : dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
2234 : 17790 : dump_dec (MSG_NOTE, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
2235 : 17790 : dump_printf (MSG_NOTE, "\n");
2236 : : }
2237 : :
2238 : : /* We don't expect to have to roll back to anything other than an empty
2239 : : set of rgroups. */
2240 : 341022 : gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2241 : :
2242 : : /* Apply the suggested unrolling factor, this was determined by the backend
2243 : : during finish_cost the first time we ran the analyzis for this
2244 : : vector mode. */
2245 : 341022 : if (applying_suggested_uf)
2246 : 247 : LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2247 : :
2248 : : /* Now the vectorization factor is final. */
2249 : 341022 : poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2250 : 341022 : gcc_assert (known_ne (vectorization_factor, 0U));
2251 : :
2252 : : /* Optimize the SLP graph with the vectorization factor fixed. */
2253 : 341022 : vect_optimize_slp (loop_vinfo);
2254 : :
2255 : : /* Gather the loads reachable from the SLP graph entries. */
2256 : 341022 : vect_gather_slp_loads (loop_vinfo);
2257 : :
2258 : 341022 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2259 : : {
2260 : 13419 : dump_printf_loc (MSG_NOTE, vect_location,
2261 : : "vectorization_factor = ");
2262 : 13419 : dump_dec (MSG_NOTE, vectorization_factor);
2263 : 13419 : dump_printf (MSG_NOTE, ", niters = %wd\n",
2264 : 13419 : LOOP_VINFO_INT_NITERS (loop_vinfo));
2265 : : }
2266 : :
2267 : 341022 : if (max_vf != MAX_VECTORIZATION_FACTOR
2268 : 341022 : && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2269 : 41 : return opt_result::failure_at (vect_location, "bad data dependence.\n");
2270 : :
2271 : 340981 : loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2272 : :
2273 : : /* Analyze the alignment of the data-refs in the loop.
2274 : : Fail if a data reference is found that cannot be vectorized. */
2275 : :
2276 : 340981 : ok = vect_analyze_data_refs_alignment (loop_vinfo);
2277 : 340981 : if (!ok)
2278 : : {
2279 : 0 : if (dump_enabled_p ())
2280 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2281 : : "bad data alignment.\n");
2282 : 0 : return ok;
2283 : : }
2284 : :
2285 : : /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2286 : : It is important to call pruning after vect_analyze_data_ref_accesses,
2287 : : since we use grouping information gathered by interleaving analysis. */
2288 : 340981 : ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2289 : 340981 : if (!ok)
2290 : 16739 : return ok;
2291 : :
2292 : : /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2293 : : vectorization, since we do not want to add extra peeling or
2294 : : add versioning for alignment. */
2295 : 324242 : if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2296 : : /* This pass will decide on using loop versioning and/or loop peeling in
2297 : : order to enhance the alignment of data references in the loop. */
2298 : 309411 : ok = vect_enhance_data_refs_alignment (loop_vinfo);
2299 : 324242 : if (!ok)
2300 : 0 : return ok;
2301 : :
2302 : : /* Analyze operations in the SLP instances. We can't simply
2303 : : remove unsupported SLP instances as this makes the above
2304 : : SLP kind detection invalid and might also affect the VF. */
2305 : 324242 : if (! vect_slp_analyze_operations (loop_vinfo))
2306 : : {
2307 : 211175 : ok = opt_result::failure_at (vect_location,
2308 : : "unsupported SLP instances\n");
2309 : 211175 : goto again;
2310 : : }
2311 : :
2312 : : /* For now, we don't expect to mix both masking and length approaches for one
2313 : : loop, disable it if both are recorded. */
2314 : 113067 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2315 : 15654 : && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2316 : 128715 : && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2317 : : {
2318 : 0 : if (dump_enabled_p ())
2319 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2320 : : "can't vectorize a loop with partial vectors"
2321 : : " because we don't expect to mix different"
2322 : : " approaches with partial vectors for the"
2323 : : " same loop.\n");
2324 : 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2325 : : }
2326 : :
2327 : : /* If we still have the option of using partial vectors,
2328 : : check whether we can generate the necessary loop controls. */
2329 : 113067 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
2330 : : {
2331 : 15654 : if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
2332 : : {
2333 : 15648 : if (!vect_verify_full_masking (loop_vinfo)
2334 : 15648 : && !vect_verify_full_masking_avx512 (loop_vinfo))
2335 : 3108 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2336 : : }
2337 : : else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
2338 : 6 : if (!vect_verify_loop_lens (loop_vinfo))
2339 : 6 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2340 : : }
2341 : :
2342 : : /* Decide whether this loop_vinfo should use partial vectors or peeling,
2343 : : assuming that the loop will be used as a main loop. We will redo
2344 : : this analysis later if we instead decide to use the loop as an
2345 : : epilogue loop. */
2346 : 113067 : ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, masked_p);
2347 : 113067 : if (!ok)
2348 : 0 : return ok;
2349 : :
2350 : : /* If we're vectorizing a loop that uses length "controls" and
2351 : : can iterate more than once, we apply decrementing IV approach
2352 : : in loop control. */
2353 : 113067 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2354 : 25 : && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
2355 : 0 : && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
2356 : 113067 : && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2357 : 0 : && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
2358 : : LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
2359 : 0 : LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
2360 : :
2361 : : /* If a loop uses length controls and has a decrementing loop control IV,
2362 : : we will normally pass that IV through a MIN_EXPR to calcaluate the
2363 : : basis for the length controls. E.g. in a loop that processes one
2364 : : element per scalar iteration, the number of elements would be
2365 : : MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
2366 : :
2367 : : This MIN_EXPR approach allows us to use pointer IVs with an invariant
2368 : : step, since only the final iteration of the vector loop can have
2369 : : inactive lanes.
2370 : :
2371 : : However, some targets have a dedicated instruction for calculating the
2372 : : preferred length, given the total number of elements that still need to
2373 : : be processed. This is encapsulated in the SELECT_VL internal function.
2374 : :
2375 : : If the target supports SELECT_VL, we can use it instead of MIN_EXPR
2376 : : to determine the basis for the length controls. However, unlike the
2377 : : MIN_EXPR calculation, the SELECT_VL calculation can decide to make
2378 : : lanes inactive in any iteration of the vector loop, not just the last
2379 : : iteration. This SELECT_VL approach therefore requires us to use pointer
2380 : : IVs with variable steps.
2381 : :
2382 : : Once we've decided how many elements should be processed by one
2383 : : iteration of the vector loop, we need to populate the rgroup controls.
2384 : : If a loop has multiple rgroups, we need to make sure that those rgroups
2385 : : "line up" (that is, they must be consistent about which elements are
2386 : : active and which aren't). This is done by vect_adjust_loop_lens_control.
2387 : :
2388 : : In principle, it would be possible to use vect_adjust_loop_lens_control
2389 : : on either the result of a MIN_EXPR or the result of a SELECT_VL.
2390 : : However:
2391 : :
2392 : : (1) In practice, it only makes sense to use SELECT_VL when a vector
2393 : : operation will be controlled directly by the result. It is not
2394 : : worth using SELECT_VL if it would only be the input to other
2395 : : calculations.
2396 : :
2397 : : (2) If we use SELECT_VL for an rgroup that has N controls, each associated
2398 : : pointer IV will need N updates by a variable amount (N-1 updates
2399 : : within the iteration and 1 update to move to the next iteration).
2400 : :
2401 : : Because of this, we prefer to use the MIN_EXPR approach whenever there
2402 : : is more than one length control.
2403 : :
2404 : : In addition, SELECT_VL always operates to a granularity of 1 unit.
2405 : : If we wanted to use it to control an SLP operation on N consecutive
2406 : : elements, we would need to make the SELECT_VL inputs measure scalar
2407 : : iterations (rather than elements) and then multiply the SELECT_VL
2408 : : result by N. But using SELECT_VL this way is inefficient because
2409 : : of (1) above.
2410 : :
2411 : : 2. We don't apply SELECT_VL on single-rgroup when both (1) and (2) are
2412 : : satisfied:
2413 : :
2414 : : (1). LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
2415 : : (2). LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
2416 : :
2417 : : Since SELECT_VL (variable step) will make SCEV analysis failed and then
2418 : : we will fail to gain benefits of following unroll optimizations. We prefer
2419 : : using the MIN_EXPR approach in this situation. */
2420 : 113067 : if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
2421 : : {
2422 : 0 : tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
2423 : 0 : if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
2424 : : OPTIMIZE_FOR_SPEED)
2425 : 0 : && LOOP_VINFO_LENS (loop_vinfo).length () == 1
2426 : 0 : && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1
2427 : 0 : && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2428 : : || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
2429 : 0 : LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
2430 : :
2431 : : /* If any of the SLP instances cover more than a single lane
2432 : : we cannot use .SELECT_VL at the moment, even if the number
2433 : : of lanes is uniform throughout the SLP graph. */
2434 : 0 : if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
2435 : 0 : for (slp_instance inst : LOOP_VINFO_SLP_INSTANCES (loop_vinfo))
2436 : 0 : if (SLP_TREE_LANES (SLP_INSTANCE_TREE (inst)) != 1
2437 : 0 : && !(SLP_INSTANCE_KIND (inst) == slp_inst_kind_store
2438 : 0 : && SLP_INSTANCE_TREE (inst)->ldst_lanes))
2439 : : {
2440 : 0 : LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
2441 : 0 : break;
2442 : : }
2443 : : }
2444 : :
2445 : : /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2446 : : to be able to handle fewer than VF scalars, or needs to have a lower VF
2447 : : than the main loop. */
2448 : 113067 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2449 : 12448 : && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2450 : : {
2451 : 12439 : poly_uint64 unscaled_vf
2452 : 12439 : = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
2453 : : orig_loop_vinfo->suggested_unroll_factor);
2454 : 12439 : if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
2455 : 285 : return opt_result::failure_at (vect_location,
2456 : : "Vectorization factor too high for"
2457 : : " epilogue loop.\n");
2458 : : }
2459 : :
2460 : : /* If the epilogue needs peeling for gaps but the main loop doesn't give
2461 : : up on the epilogue. */
2462 : 112782 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2463 : 12163 : && LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2464 : 62 : && (LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo)
2465 : : != LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)))
2466 : 4 : return opt_result::failure_at (vect_location,
2467 : : "Epilogue loop requires peeling for gaps "
2468 : : "but main loop does not.\n");
2469 : :
2470 : : /* If an epilogue loop is required make sure we can create one. */
2471 : 112778 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2472 : 111549 : || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2473 : 32687 : || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
2474 : : {
2475 : 81125 : if (dump_enabled_p ())
2476 : 5069 : dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2477 : 81125 : if (!vect_can_advance_ivs_p (loop_vinfo)
2478 : 161752 : || !slpeel_can_duplicate_loop_p (loop,
2479 : : LOOP_VINFO_IV_EXIT (loop_vinfo),
2480 : 80627 : LOOP_VINFO_IV_EXIT (loop_vinfo)))
2481 : : {
2482 : 498 : ok = opt_result::failure_at (vect_location,
2483 : : "not vectorized: can't create required "
2484 : : "epilog loop\n");
2485 : 498 : goto again;
2486 : : }
2487 : : }
2488 : :
2489 : : /* Check the costings of the loop make vectorizing worthwhile. */
2490 : 112280 : res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
2491 : 112280 : if (res < 0)
2492 : : {
2493 : 30138 : ok = opt_result::failure_at (vect_location,
2494 : : "Loop costings may not be worthwhile.\n");
2495 : 30138 : goto again;
2496 : : }
2497 : 82142 : if (!res)
2498 : 21324 : return opt_result::failure_at (vect_location,
2499 : : "Loop costings not worthwhile.\n");
2500 : :
2501 : : /* During peeling, we need to check if number of loop iterations is
2502 : : enough for both peeled prolog loop and vector loop. This check
2503 : : can be merged along with threshold check of loop versioning, so
2504 : : increase threshold for this case if necessary.
2505 : :
2506 : : If we are analyzing an epilogue we still want to check what its
2507 : : versioning threshold would be. If we decide to vectorize the epilogues we
2508 : : will want to use the lowest versioning threshold of all epilogues and main
2509 : : loop. This will enable us to enter a vectorized epilogue even when
2510 : : versioning the loop. We can't simply check whether the epilogue requires
2511 : : versioning though since we may have skipped some versioning checks when
2512 : : analyzing the epilogue. For instance, checks for alias versioning will be
2513 : : skipped when dealing with epilogues as we assume we already checked them
2514 : : for the main loop. So instead we always check the 'orig_loop_vinfo'. */
2515 : 60818 : if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2516 : : {
2517 : 5617 : poly_uint64 niters_th = 0;
2518 : 5617 : unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2519 : :
2520 : 5617 : if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2521 : : {
2522 : : /* Niters for peeled prolog loop. */
2523 : 5617 : if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2524 : : {
2525 : 126 : dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2526 : 126 : tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2527 : 126 : niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2528 : : }
2529 : : else
2530 : 5491 : niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2531 : : }
2532 : :
2533 : : /* Niters for at least one iteration of vectorized loop. */
2534 : 5617 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2535 : 5613 : niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2536 : : /* One additional iteration because of peeling for gap. */
2537 : 5617 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2538 : 59 : niters_th += 1;
2539 : :
2540 : : /* Use the same condition as vect_transform_loop to decide when to use
2541 : : the cost to determine a versioning threshold. */
2542 : 5617 : if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2543 : 5617 : && ordered_p (th, niters_th))
2544 : 3811 : niters_th = ordered_max (poly_uint64 (th), niters_th);
2545 : :
2546 : 5617 : LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2547 : : }
2548 : :
2549 : 60818 : gcc_assert (known_eq (vectorization_factor,
2550 : : LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2551 : :
2552 : 60818 : single_lane_slp_done_for_suggested_uf = force_single_lane;
2553 : :
2554 : : /* Ok to vectorize! */
2555 : 60818 : LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2556 : 60818 : return opt_result::success ();
2557 : :
2558 : 241811 : again:
2559 : : /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2560 : 241811 : gcc_assert (!ok);
2561 : :
2562 : : /* Try again with single-lane SLP. */
2563 : 241811 : if (force_single_lane)
2564 : 119820 : return ok;
2565 : :
2566 : : /* If we are applying suggested unroll factor, we don't need to
2567 : : re-try any more as we want to keep the SLP mode fixed. */
2568 : 121991 : if (applying_suggested_uf)
2569 : 6 : return ok;
2570 : :
2571 : : /* Likewise if the grouped loads or stores in the SLP cannot be handled
2572 : : via interleaving or lane instructions. */
2573 : : slp_instance instance;
2574 : : slp_tree node;
2575 : : unsigned i, j;
2576 : 482638 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2577 : : {
2578 : 361694 : if (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance)) != vect_internal_def)
2579 : 0 : continue;
2580 : :
2581 : 361694 : stmt_vec_info vinfo;
2582 : 361694 : vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2583 : 361694 : if (!vinfo || !STMT_VINFO_GROUPED_ACCESS (vinfo))
2584 : 359109 : continue;
2585 : 2585 : vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2586 : 2585 : unsigned int size = DR_GROUP_SIZE (vinfo);
2587 : 2585 : tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
2588 : 2585 : if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST
2589 : 4484 : && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2590 : 5162 : && ! vect_grouped_store_supported (vectype, size))
2591 : 678 : return opt_result::failure_at (vinfo->stmt,
2592 : : "unsupported grouped store\n");
2593 : 363925 : FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2594 : : {
2595 : 2065 : vinfo = SLP_TREE_REPRESENTATIVE (node);
2596 : 2065 : if (STMT_VINFO_GROUPED_ACCESS (vinfo))
2597 : : {
2598 : 1806 : vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2599 : 1806 : bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2600 : 1806 : size = DR_GROUP_SIZE (vinfo);
2601 : 1806 : vectype = SLP_TREE_VECTYPE (node);
2602 : 1806 : if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
2603 : 1806 : && ! vect_grouped_load_supported (vectype, single_element_p,
2604 : : size))
2605 : 363 : return opt_result::failure_at (vinfo->stmt,
2606 : : "unsupported grouped load\n");
2607 : : }
2608 : : }
2609 : : }
2610 : :
2611 : : /* Roll back state appropriately. Force single-lane SLP this time. */
2612 : 120944 : force_single_lane = true;
2613 : 120944 : if (dump_enabled_p ())
2614 : 3233 : dump_printf_loc (MSG_NOTE, vect_location,
2615 : : "re-trying with single-lane SLP\n");
2616 : :
2617 : : /* Reset the vectorization factor. */
2618 : 120944 : LOOP_VINFO_VECT_FACTOR (loop_vinfo) = 0;
2619 : : /* Free the SLP instances. */
2620 : 481581 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2621 : 360637 : vect_free_slp_instance (instance);
2622 : 120944 : LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2623 : : /* Reset SLP type to loop_vect on all stmts. */
2624 : 469992 : for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2625 : : {
2626 : 349048 : basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2627 : 349048 : for (gimple_stmt_iterator si = gsi_start_phis (bb);
2628 : 623690 : !gsi_end_p (si); gsi_next (&si))
2629 : : {
2630 : 274642 : stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2631 : 274642 : STMT_SLP_TYPE (stmt_info) = not_vect;
2632 : 274642 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2633 : 274642 : || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2634 : : {
2635 : : /* vectorizable_reduction adjusts reduction stmt def-types,
2636 : : restore them to that of the PHI. */
2637 : 19811 : STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2638 : 19811 : = STMT_VINFO_DEF_TYPE (stmt_info);
2639 : 19811 : STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2640 : : (STMT_VINFO_REDUC_DEF (stmt_info)))
2641 : 19811 : = STMT_VINFO_DEF_TYPE (stmt_info);
2642 : : }
2643 : : }
2644 : 698096 : for (gimple_stmt_iterator si = gsi_start_bb (bb);
2645 : 2139101 : !gsi_end_p (si); gsi_next (&si))
2646 : : {
2647 : 1790053 : if (is_gimple_debug (gsi_stmt (si)))
2648 : 654313 : continue;
2649 : 1135740 : stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2650 : 1135740 : STMT_SLP_TYPE (stmt_info) = not_vect;
2651 : 1135740 : if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2652 : : {
2653 : 210672 : stmt_vec_info pattern_stmt_info
2654 : : = STMT_VINFO_RELATED_STMT (stmt_info);
2655 : 210672 : if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
2656 : 0 : STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
2657 : :
2658 : 210672 : gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2659 : 210672 : STMT_SLP_TYPE (pattern_stmt_info) = not_vect;
2660 : 210672 : for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2661 : 428241 : !gsi_end_p (pi); gsi_next (&pi))
2662 : 217569 : STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2663 : 217569 : = not_vect;
2664 : : }
2665 : : }
2666 : : }
2667 : : /* Free optimized alias test DDRS. */
2668 : 120944 : LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2669 : 120944 : LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2670 : 120944 : LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2671 : : /* Reset target cost data. */
2672 : 120944 : delete loop_vinfo->vector_costs;
2673 : 120944 : loop_vinfo->vector_costs = nullptr;
2674 : : /* Reset accumulated rgroup information. */
2675 : 120944 : LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
2676 : 120944 : release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
2677 : 120944 : release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
2678 : : /* Reset assorted flags. */
2679 : 120944 : LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2680 : 120944 : LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2681 : 120944 : LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2682 : 120944 : LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2683 : 120944 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2684 : 120944 : = saved_can_use_partial_vectors_p;
2685 : 120944 : LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2686 : 120944 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2687 : 120944 : if (loop_vinfo->scan_map)
2688 : 122 : loop_vinfo->scan_map->empty ();
2689 : :
2690 : 120944 : goto start_over;
2691 : : }
2692 : :
2693 : : /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2694 : : to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2695 : : OLD_LOOP_VINFO is better unless something specifically indicates
2696 : : otherwise.
2697 : :
2698 : : Note that this deliberately isn't a partial order. */
2699 : :
2700 : : static bool
2701 : 0 : vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2702 : : loop_vec_info old_loop_vinfo)
2703 : : {
2704 : 0 : struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2705 : 0 : gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2706 : :
2707 : 0 : poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2708 : 0 : poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2709 : :
2710 : : /* Always prefer a VF of loop->simdlen over any other VF. */
2711 : 0 : if (loop->simdlen)
2712 : : {
2713 : 0 : bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2714 : 0 : bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2715 : 0 : if (new_simdlen_p != old_simdlen_p)
2716 : : return new_simdlen_p;
2717 : : }
2718 : :
2719 : 0 : const auto *old_costs = old_loop_vinfo->vector_costs;
2720 : 0 : const auto *new_costs = new_loop_vinfo->vector_costs;
2721 : 0 : if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
2722 : 0 : return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
2723 : :
2724 : 0 : return new_costs->better_main_loop_than_p (old_costs);
2725 : : }
2726 : :
2727 : : /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2728 : : true if we should. */
2729 : :
2730 : : static bool
2731 : 0 : vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2732 : : loop_vec_info old_loop_vinfo)
2733 : : {
2734 : 0 : if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2735 : : return false;
2736 : :
2737 : 0 : if (dump_enabled_p ())
2738 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
2739 : : "***** Preferring vector mode %s to vector mode %s\n",
2740 : 0 : GET_MODE_NAME (new_loop_vinfo->vector_mode),
2741 : 0 : GET_MODE_NAME (old_loop_vinfo->vector_mode));
2742 : : return true;
2743 : : }
2744 : :
2745 : : /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if ORIG_LOOP_VINFO is
2746 : : not NULL. When MASKED_P is not -1 override the default
2747 : : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P with it.
2748 : : Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance MODE_I to the next
2749 : : mode useful to analyze.
2750 : : Return the loop_vinfo on success and wrapped null on failure. */
2751 : :
2752 : : static opt_loop_vec_info
2753 : 414701 : vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
2754 : : const vect_loop_form_info *loop_form_info,
2755 : : loop_vec_info orig_loop_vinfo,
2756 : : const vector_modes &vector_modes, unsigned &mode_i,
2757 : : int masked_p,
2758 : : machine_mode &autodetected_vector_mode,
2759 : : bool &fatal)
2760 : : {
2761 : 414701 : loop_vec_info loop_vinfo
2762 : 414701 : = vect_create_loop_vinfo (loop, shared, loop_form_info, orig_loop_vinfo);
2763 : :
2764 : 414701 : machine_mode vector_mode = vector_modes[mode_i];
2765 : 414701 : loop_vinfo->vector_mode = vector_mode;
2766 : 414701 : unsigned int suggested_unroll_factor = 1;
2767 : 414701 : bool single_lane_slp_done_for_suggested_uf = false;
2768 : :
2769 : : /* Run the main analysis. */
2770 : 414701 : opt_result res = vect_analyze_loop_2 (loop_vinfo, masked_p, fatal,
2771 : : &suggested_unroll_factor,
2772 : : single_lane_slp_done_for_suggested_uf);
2773 : 414701 : if (dump_enabled_p ())
2774 : 19319 : dump_printf_loc (MSG_NOTE, vect_location,
2775 : : "***** Analysis %s with vector mode %s\n",
2776 : 19319 : res ? "succeeded" : "failed",
2777 : 19319 : GET_MODE_NAME (loop_vinfo->vector_mode));
2778 : :
2779 : 414701 : auto user_unroll = LOOP_VINFO_LOOP (loop_vinfo)->unroll;
2780 : 414701 : if (res && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2781 : : /* Check to see if the user wants to unroll or if the target wants to. */
2782 : 468467 : && (suggested_unroll_factor > 1 || user_unroll > 1))
2783 : : {
2784 : 261 : if (suggested_unroll_factor == 1)
2785 : : {
2786 : 44 : int assumed_vf = vect_vf_for_cost (loop_vinfo);
2787 : 44 : suggested_unroll_factor = user_unroll / assumed_vf;
2788 : 44 : if (suggested_unroll_factor > 1)
2789 : : {
2790 : 30 : if (dump_enabled_p ())
2791 : 20 : dump_printf_loc (MSG_NOTE, vect_location,
2792 : : "setting unroll factor to %d based on user requested "
2793 : : "unroll factor %d and suggested vectorization "
2794 : : "factor: %d\n",
2795 : : suggested_unroll_factor, user_unroll, assumed_vf);
2796 : : }
2797 : : }
2798 : :
2799 : 261 : if (suggested_unroll_factor > 1)
2800 : : {
2801 : 247 : if (dump_enabled_p ())
2802 : 44 : dump_printf_loc (MSG_NOTE, vect_location,
2803 : : "***** Re-trying analysis for unrolling"
2804 : : " with unroll factor %d and %s slp.\n",
2805 : : suggested_unroll_factor,
2806 : : single_lane_slp_done_for_suggested_uf
2807 : : ? "single-lane" : "");
2808 : 247 : loop_vec_info unroll_vinfo
2809 : 247 : = vect_create_loop_vinfo (loop, shared, loop_form_info, NULL);
2810 : 247 : unroll_vinfo->vector_mode = vector_mode;
2811 : 247 : unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
2812 : 247 : opt_result new_res
2813 : 247 : = vect_analyze_loop_2 (unroll_vinfo, masked_p, fatal, NULL,
2814 : : single_lane_slp_done_for_suggested_uf);
2815 : 247 : if (new_res)
2816 : : {
2817 : 201 : delete loop_vinfo;
2818 : 201 : loop_vinfo = unroll_vinfo;
2819 : : }
2820 : : else
2821 : 46 : delete unroll_vinfo;
2822 : : }
2823 : :
2824 : : /* Record that we have honored a user unroll factor. */
2825 : 261 : LOOP_VINFO_USER_UNROLL (loop_vinfo) = user_unroll > 1;
2826 : : }
2827 : :
2828 : : /* Remember the autodetected vector mode. */
2829 : 414701 : if (vector_mode == VOIDmode)
2830 : 214353 : autodetected_vector_mode = loop_vinfo->vector_mode;
2831 : :
2832 : : /* Advance mode_i, first skipping modes that would result in the
2833 : : same analysis result. */
2834 : 1879787 : while (mode_i + 1 < vector_modes.length ()
2835 : 1317739 : && vect_chooses_same_modes_p (loop_vinfo,
2836 : 585196 : vector_modes[mode_i + 1]))
2837 : : {
2838 : 317842 : if (dump_enabled_p ())
2839 : 16078 : dump_printf_loc (MSG_NOTE, vect_location,
2840 : : "***** The result for vector mode %s would"
2841 : : " be the same\n",
2842 : 16078 : GET_MODE_NAME (vector_modes[mode_i + 1]));
2843 : 317842 : mode_i += 1;
2844 : : }
2845 : 414701 : if (mode_i + 1 < vector_modes.length ()
2846 : 682055 : && vect_chooses_same_modes_p (autodetected_vector_mode,
2847 : 267354 : vector_modes[mode_i + 1]))
2848 : : {
2849 : 349 : if (dump_enabled_p ())
2850 : 10 : dump_printf_loc (MSG_NOTE, vect_location,
2851 : : "***** Skipping vector mode %s, which would"
2852 : : " repeat the analysis for %s\n",
2853 : 10 : GET_MODE_NAME (vector_modes[mode_i + 1]),
2854 : 10 : GET_MODE_NAME (autodetected_vector_mode));
2855 : 349 : mode_i += 1;
2856 : : }
2857 : 414701 : mode_i++;
2858 : :
2859 : 414701 : if (!res)
2860 : : {
2861 : 354084 : delete loop_vinfo;
2862 : 354084 : if (fatal)
2863 : 65369 : gcc_checking_assert (orig_loop_vinfo == NULL);
2864 : 354084 : return opt_loop_vec_info::propagate_failure (res);
2865 : : }
2866 : :
2867 : 60617 : return opt_loop_vec_info::success (loop_vinfo);
2868 : : }
2869 : :
2870 : : /* Function vect_analyze_loop.
2871 : :
2872 : : Apply a set of analyses on LOOP, and create a loop_vec_info struct
2873 : : for it. The different analyses will record information in the
2874 : : loop_vec_info struct. */
2875 : : opt_loop_vec_info
2876 : 479006 : vect_analyze_loop (class loop *loop, gimple *loop_vectorized_call,
2877 : : vec_info_shared *shared)
2878 : : {
2879 : 479006 : DUMP_VECT_SCOPE ("analyze_loop_nest");
2880 : :
2881 : 479006 : if (loop_outer (loop)
2882 : 479006 : && loop_vec_info_for_loop (loop_outer (loop))
2883 : 479507 : && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2884 : 501 : return opt_loop_vec_info::failure_at (vect_location,
2885 : : "outer-loop already vectorized.\n");
2886 : :
2887 : 478505 : if (!find_loop_nest (loop, &shared->loop_nest))
2888 : 24688 : return opt_loop_vec_info::failure_at
2889 : 24688 : (vect_location,
2890 : : "not vectorized: loop nest containing two or more consecutive inner"
2891 : : " loops cannot be vectorized\n");
2892 : :
2893 : : /* Analyze the loop form. */
2894 : 453817 : vect_loop_form_info loop_form_info;
2895 : 453817 : opt_result res = vect_analyze_loop_form (loop, loop_vectorized_call,
2896 : : &loop_form_info);
2897 : 453817 : if (!res)
2898 : : {
2899 : 239464 : if (dump_enabled_p ())
2900 : 1637 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2901 : : "bad loop form.\n");
2902 : 239464 : return opt_loop_vec_info::propagate_failure (res);
2903 : : }
2904 : 214353 : if (!integer_onep (loop_form_info.assumptions))
2905 : : {
2906 : : /* We consider to vectorize this loop by versioning it under
2907 : : some assumptions. In order to do this, we need to clear
2908 : : existing information computed by scev and niter analyzer. */
2909 : 10055 : scev_reset_htab ();
2910 : 10055 : free_numbers_of_iterations_estimates (loop);
2911 : : /* Also set flag for this loop so that following scev and niter
2912 : : analysis are done under the assumptions. */
2913 : 10055 : loop_constraint_set (loop, LOOP_C_FINITE);
2914 : : }
2915 : : else
2916 : : /* Clear the existing niter information to make sure the nonwrapping flag
2917 : : will be calculated and set propriately. */
2918 : 204298 : free_numbers_of_iterations_estimates (loop);
2919 : :
2920 : 214353 : auto_vector_modes vector_modes;
2921 : : /* Autodetect first vector size we try. */
2922 : 214353 : vector_modes.safe_push (VOIDmode);
2923 : 214353 : unsigned int autovec_flags
2924 : 428706 : = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2925 : 214353 : loop->simdlen != 0);
2926 : 214353 : bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2927 : 214353 : && !unlimited_cost_model (loop));
2928 : 214353 : machine_mode autodetected_vector_mode = VOIDmode;
2929 : 214353 : opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2930 : 214353 : unsigned int mode_i = 0;
2931 : 214353 : unsigned HOST_WIDE_INT simdlen = loop->simdlen;
2932 : :
2933 : : /* Keep track of the VF for each mode. Initialize all to 0 which indicates
2934 : : a mode has not been analyzed. */
2935 : 214353 : auto_vec<poly_uint64, 8> cached_vf_per_mode;
2936 : 2163598 : for (unsigned i = 0; i < vector_modes.length (); ++i)
2937 : 867446 : cached_vf_per_mode.safe_push (0);
2938 : :
2939 : : /* First determine the main loop vectorization mode, either the first
2940 : : one that works, starting with auto-detecting the vector mode and then
2941 : : following the targets order of preference, or the one with the
2942 : : lowest cost if pick_lowest_cost_p. */
2943 : 579699 : while (1)
2944 : : {
2945 : 397026 : bool fatal;
2946 : 397026 : unsigned int last_mode_i = mode_i;
2947 : : /* Set cached VF to -1 prior to analysis, which indicates a mode has
2948 : : failed. */
2949 : 397026 : cached_vf_per_mode[last_mode_i] = -1;
2950 : 397026 : opt_loop_vec_info loop_vinfo
2951 : 397026 : = vect_analyze_loop_1 (loop, shared, &loop_form_info,
2952 : : NULL, vector_modes, mode_i, -1,
2953 : : autodetected_vector_mode, fatal);
2954 : 397026 : if (fatal)
2955 : : break;
2956 : :
2957 : 331657 : if (loop_vinfo)
2958 : : {
2959 : : /* Analyzis has been successful so update the VF value. The
2960 : : VF should always be a multiple of unroll_factor and we want to
2961 : : capture the original VF here. */
2962 : 53766 : cached_vf_per_mode[last_mode_i]
2963 : 53766 : = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2964 : 53766 : loop_vinfo->suggested_unroll_factor);
2965 : : /* Once we hit the desired simdlen for the first time,
2966 : : discard any previous attempts. */
2967 : 53766 : if (simdlen
2968 : 53766 : && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
2969 : : {
2970 : 47 : delete first_loop_vinfo;
2971 : : first_loop_vinfo = opt_loop_vec_info::success (NULL);
2972 : : simdlen = 0;
2973 : : }
2974 : 53719 : else if (pick_lowest_cost_p
2975 : 0 : && first_loop_vinfo
2976 : 53719 : && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
2977 : : {
2978 : : /* Pick loop_vinfo over first_loop_vinfo. */
2979 : 0 : delete first_loop_vinfo;
2980 : 0 : first_loop_vinfo = opt_loop_vec_info::success (NULL);
2981 : : }
2982 : 53766 : if (first_loop_vinfo == NULL)
2983 : : first_loop_vinfo = loop_vinfo;
2984 : : else
2985 : : {
2986 : 2 : delete loop_vinfo;
2987 : 2 : loop_vinfo = opt_loop_vec_info::success (NULL);
2988 : : }
2989 : :
2990 : : /* Commit to first_loop_vinfo if we have no reason to try
2991 : : alternatives. */
2992 : 53766 : if (!simdlen && !pick_lowest_cost_p)
2993 : : break;
2994 : : }
2995 : 277900 : if (mode_i == vector_modes.length ()
2996 : 277900 : || autodetected_vector_mode == VOIDmode)
2997 : : break;
2998 : :
2999 : : /* Try the next biggest vector size. */
3000 : 182673 : if (dump_enabled_p ())
3001 : 3737 : dump_printf_loc (MSG_NOTE, vect_location,
3002 : : "***** Re-trying analysis with vector mode %s\n",
3003 : 3737 : GET_MODE_NAME (vector_modes[mode_i]));
3004 : 182673 : }
3005 : 214353 : if (!first_loop_vinfo)
3006 : 160594 : return opt_loop_vec_info::propagate_failure (res);
3007 : :
3008 : 53759 : if (dump_enabled_p ())
3009 : 9071 : dump_printf_loc (MSG_NOTE, vect_location,
3010 : : "***** Choosing vector mode %s\n",
3011 : 9071 : GET_MODE_NAME (first_loop_vinfo->vector_mode));
3012 : :
3013 : : /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3014 : : enabled, SIMDUID is not set, it is the innermost loop and we have
3015 : : either already found the loop's SIMDLEN or there was no SIMDLEN to
3016 : : begin with.
3017 : : TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3018 : 53759 : bool vect_epilogues = (!simdlen
3019 : 53757 : && loop->inner == NULL
3020 : 53242 : && param_vect_epilogues_nomask
3021 : 52201 : && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3022 : : /* No code motion support for multiple epilogues so for now
3023 : : not supported when multiple exits. */
3024 : 25438 : && !LOOP_VINFO_EARLY_BREAKS (first_loop_vinfo)
3025 : 25010 : && !loop->simduid
3026 : 77356 : && loop_cost_model (loop) > VECT_COST_MODEL_VERY_CHEAP);
3027 : 53759 : if (!vect_epilogues)
3028 : 41159 : return first_loop_vinfo;
3029 : :
3030 : : /* Now analyze first_loop_vinfo for epilogue vectorization. */
3031 : :
3032 : : /* For epilogues start the analysis from the first mode. The motivation
3033 : : behind starting from the beginning comes from cases where the VECTOR_MODES
3034 : : array may contain length-agnostic and length-specific modes. Their
3035 : : ordering is not guaranteed, so we could end up picking a mode for the main
3036 : : loop that is after the epilogue's optimal mode. */
3037 : 12600 : int masked_p = -1;
3038 : 12600 : if (!unlimited_cost_model (loop)
3039 : 12600 : && (first_loop_vinfo->vector_costs->suggested_epilogue_mode (masked_p)
3040 : : != VOIDmode))
3041 : : {
3042 : 4 : vector_modes[0]
3043 : 4 : = first_loop_vinfo->vector_costs->suggested_epilogue_mode (masked_p);
3044 : 4 : cached_vf_per_mode[0] = 0;
3045 : : }
3046 : : else
3047 : 12596 : vector_modes[0] = autodetected_vector_mode;
3048 : 12600 : mode_i = 0;
3049 : :
3050 : 12633 : bool supports_partial_vectors = (param_vect_partial_vector_usage != 0
3051 : 12600 : || masked_p == 1);
3052 : : if (supports_partial_vectors
3053 : 33 : && !partial_vectors_supported_p ()
3054 : 33 : && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (first_loop_vinfo))
3055 : : supports_partial_vectors = false;
3056 : 12600 : poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3057 : :
3058 : 12600 : loop_vec_info orig_loop_vinfo = first_loop_vinfo;
3059 : 12788 : do
3060 : : {
3061 : : /* Let the user override what the target suggests. */
3062 : 12694 : if (OPTION_SET_P (param_vect_partial_vector_usage))
3063 : 40 : masked_p = -1;
3064 : :
3065 : 43505 : while (1)
3066 : : {
3067 : : /* If the target does not support partial vectors we can shorten the
3068 : : number of modes to analyze for the epilogue as we know we can't
3069 : : pick a mode that would lead to a VF at least as big as the
3070 : : FIRST_VINFO_VF. */
3071 : 56951 : if (!supports_partial_vectors
3072 : 43505 : && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3073 : : {
3074 : 13471 : mode_i++;
3075 : 26942 : if (mode_i == vector_modes.length ())
3076 : : break;
3077 : 25805 : continue;
3078 : : }
3079 : : /* We would need an exhaustive search to find all modes we
3080 : : skipped but that would lead to the same result as the
3081 : : analysis it was skipped for and where we'd could check
3082 : : cached_vf_per_mode against.
3083 : : Check for the autodetected mode, which is the common
3084 : : situation on x86 which does not perform cost comparison. */
3085 : 42393 : if (!supports_partial_vectors
3086 : 30025 : && maybe_ge (cached_vf_per_mode[0], first_vinfo_vf)
3087 : 59520 : && vect_chooses_same_modes_p (autodetected_vector_mode,
3088 : 29486 : vector_modes[mode_i]))
3089 : : {
3090 : 12359 : mode_i++;
3091 : 24718 : if (mode_i == vector_modes.length ())
3092 : : break;
3093 : 12359 : continue;
3094 : : }
3095 : :
3096 : 17675 : if (dump_enabled_p ())
3097 : 3132 : dump_printf_loc (MSG_NOTE, vect_location,
3098 : : "***** Re-trying epilogue analysis with vector "
3099 : 3132 : "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3100 : :
3101 : 17675 : bool fatal;
3102 : 17675 : opt_loop_vec_info loop_vinfo
3103 : 17675 : = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3104 : : orig_loop_vinfo,
3105 : : vector_modes, mode_i, masked_p,
3106 : : autodetected_vector_mode, fatal);
3107 : 17675 : if (fatal)
3108 : : break;
3109 : :
3110 : 17675 : if (loop_vinfo)
3111 : : {
3112 : 6851 : if (pick_lowest_cost_p
3113 : 0 : && orig_loop_vinfo->epilogue_vinfo
3114 : 6851 : && vect_joust_loop_vinfos (loop_vinfo,
3115 : 0 : orig_loop_vinfo->epilogue_vinfo))
3116 : : {
3117 : 0 : gcc_assert (vect_epilogues);
3118 : 0 : delete orig_loop_vinfo->epilogue_vinfo;
3119 : 0 : orig_loop_vinfo->epilogue_vinfo = nullptr;
3120 : : }
3121 : 6851 : if (!orig_loop_vinfo->epilogue_vinfo)
3122 : 6851 : orig_loop_vinfo->epilogue_vinfo = loop_vinfo;
3123 : : else
3124 : : {
3125 : 0 : delete loop_vinfo;
3126 : 0 : loop_vinfo = opt_loop_vec_info::success (NULL);
3127 : : }
3128 : :
3129 : : /* For now only allow one epilogue loop, but allow
3130 : : pick_lowest_cost_p to replace it, so commit to the
3131 : : first epilogue if we have no reason to try alternatives. */
3132 : 6851 : if (!pick_lowest_cost_p)
3133 : : break;
3134 : : }
3135 : :
3136 : : /* Revert back to the default from the suggested prefered
3137 : : epilogue vectorization mode. */
3138 : 10824 : masked_p = -1;
3139 : 21648 : if (mode_i == vector_modes.length ())
3140 : : break;
3141 : : }
3142 : :
3143 : 12694 : orig_loop_vinfo = orig_loop_vinfo->epilogue_vinfo;
3144 : 12694 : if (!orig_loop_vinfo)
3145 : : break;
3146 : :
3147 : : /* When we selected a first vectorized epilogue, see if the target
3148 : : suggests to have another one. */
3149 : 6851 : masked_p = -1;
3150 : 6851 : if (!unlimited_cost_model (loop)
3151 : 3988 : && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (orig_loop_vinfo)
3152 : 10832 : && (orig_loop_vinfo->vector_costs->suggested_epilogue_mode (masked_p)
3153 : : != VOIDmode))
3154 : : {
3155 : 188 : vector_modes[0]
3156 : 94 : = orig_loop_vinfo->vector_costs->suggested_epilogue_mode (masked_p);
3157 : 94 : cached_vf_per_mode[0] = 0;
3158 : 94 : mode_i = 0;
3159 : : }
3160 : : else
3161 : : break;
3162 : 94 : }
3163 : : while (1);
3164 : :
3165 : 12600 : if (first_loop_vinfo->epilogue_vinfo)
3166 : : {
3167 : 6762 : poly_uint64 lowest_th
3168 : 6762 : = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3169 : 6762 : loop_vec_info epilog_vinfo = first_loop_vinfo->epilogue_vinfo;
3170 : 6851 : do
3171 : : {
3172 : 6851 : poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (epilog_vinfo);
3173 : 6851 : gcc_assert (!LOOP_REQUIRES_VERSIONING (epilog_vinfo)
3174 : : || maybe_ne (lowest_th, 0U));
3175 : : /* Keep track of the known smallest versioning threshold. */
3176 : 6851 : if (ordered_p (lowest_th, th))
3177 : 6851 : lowest_th = ordered_min (lowest_th, th);
3178 : 6851 : epilog_vinfo = epilog_vinfo->epilogue_vinfo;
3179 : : }
3180 : 6851 : while (epilog_vinfo);
3181 : 6762 : LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3182 : 6762 : if (dump_enabled_p ())
3183 : 1365 : dump_printf_loc (MSG_NOTE, vect_location,
3184 : : "***** Choosing epilogue vector mode %s\n",
3185 : 1365 : GET_MODE_NAME
3186 : : (first_loop_vinfo->epilogue_vinfo->vector_mode));
3187 : : }
3188 : :
3189 : 12600 : return first_loop_vinfo;
3190 : 668170 : }
3191 : :
3192 : : /* Return true if there is an in-order reduction function for CODE, storing
3193 : : it in *REDUC_FN if so. */
3194 : :
3195 : : static bool
3196 : 4716 : fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3197 : : {
3198 : : /* We support MINUS_EXPR by negating the operand. This also preserves an
3199 : : initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
3200 : : (-0.0) = -0.0. */
3201 : 4716 : if (code == PLUS_EXPR || code == MINUS_EXPR)
3202 : : {
3203 : 4040 : *reduc_fn = IFN_FOLD_LEFT_PLUS;
3204 : 0 : return true;
3205 : : }
3206 : : return false;
3207 : : }
3208 : :
3209 : : /* Function reduction_fn_for_scalar_code
3210 : :
3211 : : Input:
3212 : : CODE - tree_code of a reduction operations.
3213 : :
3214 : : Output:
3215 : : REDUC_FN - the corresponding internal function to be used to reduce the
3216 : : vector of partial results into a single scalar result, or IFN_LAST
3217 : : if the operation is a supported reduction operation, but does not have
3218 : : such an internal function.
3219 : :
3220 : : Return FALSE if CODE currently cannot be vectorized as reduction. */
3221 : :
3222 : : bool
3223 : 2013826 : reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3224 : : {
3225 : 2013826 : if (code.is_tree_code ())
3226 : 2013772 : switch (tree_code (code))
3227 : : {
3228 : 14205 : case MAX_EXPR:
3229 : 14205 : *reduc_fn = IFN_REDUC_MAX;
3230 : 14205 : return true;
3231 : :
3232 : 50598 : case MIN_EXPR:
3233 : 50598 : *reduc_fn = IFN_REDUC_MIN;
3234 : 50598 : return true;
3235 : :
3236 : 1085743 : case PLUS_EXPR:
3237 : 1085743 : *reduc_fn = IFN_REDUC_PLUS;
3238 : 1085743 : return true;
3239 : :
3240 : 255432 : case BIT_AND_EXPR:
3241 : 255432 : *reduc_fn = IFN_REDUC_AND;
3242 : 255432 : return true;
3243 : :
3244 : 285253 : case BIT_IOR_EXPR:
3245 : 285253 : *reduc_fn = IFN_REDUC_IOR;
3246 : 285253 : return true;
3247 : :
3248 : 43614 : case BIT_XOR_EXPR:
3249 : 43614 : *reduc_fn = IFN_REDUC_XOR;
3250 : 43614 : return true;
3251 : :
3252 : 278927 : case MULT_EXPR:
3253 : 278927 : case MINUS_EXPR:
3254 : 278927 : *reduc_fn = IFN_LAST;
3255 : 278927 : return true;
3256 : :
3257 : : default:
3258 : : return false;
3259 : : }
3260 : : else
3261 : 54 : switch (combined_fn (code))
3262 : : {
3263 : 30 : CASE_CFN_FMAX:
3264 : 30 : *reduc_fn = IFN_REDUC_FMAX;
3265 : 30 : return true;
3266 : :
3267 : 24 : CASE_CFN_FMIN:
3268 : 24 : *reduc_fn = IFN_REDUC_FMIN;
3269 : 24 : return true;
3270 : :
3271 : : default:
3272 : : return false;
3273 : : }
3274 : : }
3275 : :
3276 : : /* Set *SBOOL_FN to the corresponding function working on vector masks
3277 : : for REDUC_FN. Return true if that exists, false otherwise. */
3278 : :
3279 : : static bool
3280 : 0 : sbool_reduction_fn_for_fn (internal_fn reduc_fn, internal_fn *sbool_fn)
3281 : : {
3282 : 0 : switch (reduc_fn)
3283 : : {
3284 : 0 : case IFN_REDUC_AND:
3285 : 0 : *sbool_fn = IFN_REDUC_SBOOL_AND;
3286 : 0 : return true;
3287 : 0 : case IFN_REDUC_IOR:
3288 : 0 : *sbool_fn = IFN_REDUC_SBOOL_IOR;
3289 : 0 : return true;
3290 : 0 : case IFN_REDUC_XOR:
3291 : 0 : *sbool_fn = IFN_REDUC_SBOOL_XOR;
3292 : 0 : return true;
3293 : : default:
3294 : : return false;
3295 : : }
3296 : : }
3297 : :
3298 : : /* If there is a neutral value X such that a reduction would not be affected
3299 : : by the introduction of additional X elements, return that X, otherwise
3300 : : return null. CODE is the code of the reduction and SCALAR_TYPE is type
3301 : : of the scalar elements. If the reduction has just a single initial value
3302 : : then INITIAL_VALUE is that value, otherwise it is null.
3303 : : If AS_INITIAL is TRUE the value is supposed to be used as initial value.
3304 : : In that case no signed zero is returned. */
3305 : :
3306 : : tree
3307 : 76907 : neutral_op_for_reduction (tree scalar_type, code_helper code,
3308 : : tree initial_value, bool as_initial)
3309 : : {
3310 : 76907 : if (code.is_tree_code ())
3311 : 76853 : switch (tree_code (code))
3312 : : {
3313 : 11421 : case DOT_PROD_EXPR:
3314 : 11421 : case SAD_EXPR:
3315 : 11421 : case MINUS_EXPR:
3316 : 11421 : case BIT_IOR_EXPR:
3317 : 11421 : case BIT_XOR_EXPR:
3318 : 11421 : return build_zero_cst (scalar_type);
3319 : 59461 : case WIDEN_SUM_EXPR:
3320 : 59461 : case PLUS_EXPR:
3321 : 59461 : if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
3322 : 56 : return build_real (scalar_type, dconstm0);
3323 : : else
3324 : 59405 : return build_zero_cst (scalar_type);
3325 : :
3326 : 2003 : case MULT_EXPR:
3327 : 2003 : return build_one_cst (scalar_type);
3328 : :
3329 : 1434 : case BIT_AND_EXPR:
3330 : 1434 : return build_all_ones_cst (scalar_type);
3331 : :
3332 : : case MAX_EXPR:
3333 : : case MIN_EXPR:
3334 : : return initial_value;
3335 : :
3336 : 444 : default:
3337 : 444 : return NULL_TREE;
3338 : : }
3339 : : else
3340 : 54 : switch (combined_fn (code))
3341 : : {
3342 : : CASE_CFN_FMIN:
3343 : : CASE_CFN_FMAX:
3344 : : return initial_value;
3345 : :
3346 : 0 : default:
3347 : 0 : return NULL_TREE;
3348 : : }
3349 : : }
3350 : :
3351 : : /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3352 : : STMT is printed with a message MSG. */
3353 : :
3354 : : static void
3355 : 489 : report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3356 : : {
3357 : 489 : dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3358 : 489 : }
3359 : :
3360 : : /* Return true if we need an in-order reduction for operation CODE
3361 : : on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
3362 : : overflow must wrap. */
3363 : :
3364 : : bool
3365 : 6432399 : needs_fold_left_reduction_p (tree type, code_helper code)
3366 : : {
3367 : : /* CHECKME: check for !flag_finite_math_only too? */
3368 : 6432399 : if (SCALAR_FLOAT_TYPE_P (type))
3369 : : {
3370 : 546978 : if (code.is_tree_code ())
3371 : 546928 : switch (tree_code (code))
3372 : : {
3373 : : case MIN_EXPR:
3374 : : case MAX_EXPR:
3375 : : return false;
3376 : :
3377 : 545289 : default:
3378 : 545289 : return !flag_associative_math;
3379 : : }
3380 : : else
3381 : 50 : switch (combined_fn (code))
3382 : : {
3383 : : CASE_CFN_FMIN:
3384 : : CASE_CFN_FMAX:
3385 : : return false;
3386 : :
3387 : 2 : default:
3388 : 2 : return !flag_associative_math;
3389 : : }
3390 : : }
3391 : :
3392 : 5885421 : if (INTEGRAL_TYPE_P (type))
3393 : 5884592 : return (!code.is_tree_code ()
3394 : 5884592 : || !operation_no_trapping_overflow (type, tree_code (code)));
3395 : :
3396 : 829 : if (SAT_FIXED_POINT_TYPE_P (type))
3397 : : return true;
3398 : :
3399 : : return false;
3400 : : }
3401 : :
3402 : : /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
3403 : : has a handled computation expression. Store the main reduction
3404 : : operation in *CODE. */
3405 : :
3406 : : static bool
3407 : 64285 : check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3408 : : tree loop_arg, code_helper *code,
3409 : : vec<std::pair<ssa_op_iter, use_operand_p> > &path,
3410 : : bool inner_loop_of_double_reduc)
3411 : : {
3412 : 64285 : auto_bitmap visited;
3413 : 64285 : tree lookfor = PHI_RESULT (phi);
3414 : 64285 : ssa_op_iter curri;
3415 : 64285 : use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3416 : 135068 : while (USE_FROM_PTR (curr) != loop_arg)
3417 : 6498 : curr = op_iter_next_use (&curri);
3418 : 64285 : curri.i = curri.numops;
3419 : 620230 : do
3420 : : {
3421 : 620230 : path.safe_push (std::make_pair (curri, curr));
3422 : 620230 : tree use = USE_FROM_PTR (curr);
3423 : 620230 : if (use == lookfor)
3424 : : break;
3425 : 556235 : gimple *def = SSA_NAME_DEF_STMT (use);
3426 : 556235 : if (gimple_nop_p (def)
3427 : 556235 : || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3428 : : {
3429 : 470724 : pop:
3430 : 470724 : do
3431 : : {
3432 : 470724 : std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3433 : 470724 : curri = x.first;
3434 : 470724 : curr = x.second;
3435 : 515955 : do
3436 : 515955 : curr = op_iter_next_use (&curri);
3437 : : /* Skip already visited or non-SSA operands (from iterating
3438 : : over PHI args). */
3439 : : while (curr != NULL_USE_OPERAND_P
3440 : 1031910 : && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3441 : 177382 : || ! bitmap_set_bit (visited,
3442 : 177382 : SSA_NAME_VERSION
3443 : : (USE_FROM_PTR (curr)))));
3444 : : }
3445 : 941448 : while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3446 : 158454 : if (curr == NULL_USE_OPERAND_P)
3447 : : break;
3448 : : }
3449 : : else
3450 : : {
3451 : 466760 : if (gimple_code (def) == GIMPLE_PHI)
3452 : 48682 : curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3453 : : else
3454 : 418078 : curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3455 : : while (curr != NULL_USE_OPERAND_P
3456 : 561044 : && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3457 : 488243 : || ! bitmap_set_bit (visited,
3458 : 488243 : SSA_NAME_VERSION
3459 : : (USE_FROM_PTR (curr)))))
3460 : 94284 : curr = op_iter_next_use (&curri);
3461 : 466760 : if (curr == NULL_USE_OPERAND_P)
3462 : 68979 : goto pop;
3463 : : }
3464 : : }
3465 : : while (1);
3466 : 64285 : if (dump_file && (dump_flags & TDF_DETAILS))
3467 : : {
3468 : 3751 : dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3469 : 3751 : unsigned i;
3470 : 3751 : std::pair<ssa_op_iter, use_operand_p> *x;
3471 : 12815 : FOR_EACH_VEC_ELT (path, i, x)
3472 : 9064 : dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3473 : 3751 : dump_printf (MSG_NOTE, "\n");
3474 : : }
3475 : :
3476 : : /* Check whether the reduction path detected is valid. */
3477 : 64285 : bool fail = path.length () == 0;
3478 : 64285 : bool neg = false;
3479 : 64285 : int sign = -1;
3480 : 64285 : *code = ERROR_MARK;
3481 : 140801 : for (unsigned i = 1; i < path.length (); ++i)
3482 : : {
3483 : 79449 : gimple *use_stmt = USE_STMT (path[i].second);
3484 : 79449 : gimple_match_op op;
3485 : 79449 : if (!gimple_extract_op (use_stmt, &op))
3486 : : {
3487 : : fail = true;
3488 : 2933 : break;
3489 : : }
3490 : 78890 : unsigned int opi = op.num_ops;
3491 : 78890 : if (gassign *assign = dyn_cast<gassign *> (use_stmt))
3492 : : {
3493 : : /* The following make sure we can compute the operand index
3494 : : easily plus it mostly disallows chaining via COND_EXPR condition
3495 : : operands. */
3496 : 124466 : for (opi = 0; opi < op.num_ops; ++opi)
3497 : 123519 : if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
3498 : : break;
3499 : : }
3500 : 3506 : else if (gcall *call = dyn_cast<gcall *> (use_stmt))
3501 : : {
3502 : 7032 : for (opi = 0; opi < op.num_ops; ++opi)
3503 : 7032 : if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
3504 : : break;
3505 : : }
3506 : 78890 : if (opi == op.num_ops)
3507 : : {
3508 : : fail = true;
3509 : : break;
3510 : : }
3511 : 77943 : op.code = canonicalize_code (op.code, op.type);
3512 : 77943 : if (op.code == MINUS_EXPR)
3513 : : {
3514 : 3844 : op.code = PLUS_EXPR;
3515 : : /* Track whether we negate the reduction value each iteration. */
3516 : 3844 : if (op.ops[1] == op.ops[opi])
3517 : 32 : neg = ! neg;
3518 : : }
3519 : 74099 : else if (op.code == IFN_COND_SUB)
3520 : : {
3521 : 2 : op.code = IFN_COND_ADD;
3522 : : /* Track whether we negate the reduction value each iteration. */
3523 : 2 : if (op.ops[2] == op.ops[opi])
3524 : 0 : neg = ! neg;
3525 : : }
3526 : : /* For an FMA the reduction code is the PLUS if the addition chain
3527 : : is the reduction. */
3528 : 74097 : else if (op.code == IFN_FMA && opi == 2)
3529 : 28 : op.code = PLUS_EXPR;
3530 : 77943 : if (CONVERT_EXPR_CODE_P (op.code)
3531 : 77943 : && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
3532 : : ;
3533 : 74523 : else if (*code == ERROR_MARK)
3534 : : {
3535 : 62572 : *code = op.code;
3536 : 62572 : sign = TYPE_SIGN (op.type);
3537 : : }
3538 : 11951 : else if (op.code != *code)
3539 : : {
3540 : : fail = true;
3541 : : break;
3542 : : }
3543 : 10731 : else if ((op.code == MIN_EXPR
3544 : 10647 : || op.code == MAX_EXPR)
3545 : 10743 : && sign != TYPE_SIGN (op.type))
3546 : : {
3547 : : fail = true;
3548 : : break;
3549 : : }
3550 : : /* Check there's only a single stmt the op is used on. For the
3551 : : not value-changing tail and the last stmt allow out-of-loop uses,
3552 : : but not when this is the inner loop of a double reduction.
3553 : : ??? We could relax this and handle arbitrary live stmts by
3554 : : forcing a scalar epilogue for example. */
3555 : 76720 : imm_use_iterator imm_iter;
3556 : 76720 : use_operand_p use_p;
3557 : 76720 : gimple *op_use_stmt;
3558 : 76720 : unsigned cnt = 0;
3559 : 80196 : bool cond_fn_p = op.code.is_internal_fn ()
3560 : 3476 : && (conditional_internal_fn_code (internal_fn (op.code))
3561 : 76720 : != ERROR_MARK);
3562 : :
3563 : 255247 : FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
3564 : : {
3565 : : /* In case of a COND_OP (mask, op1, op2, op1) reduction we should
3566 : : have op1 twice (once as definition, once as else) in the same
3567 : : operation. Enforce this. */
3568 : 101807 : if (cond_fn_p && op_use_stmt == use_stmt)
3569 : : {
3570 : 3420 : gcall *call = as_a<gcall *> (use_stmt);
3571 : 3420 : unsigned else_pos
3572 : 3420 : = internal_fn_else_index (internal_fn (op.code));
3573 : 3420 : if (gimple_call_arg (call, else_pos) != op.ops[opi])
3574 : : {
3575 : : fail = true;
3576 : : break;
3577 : : }
3578 : 17100 : for (unsigned int j = 0; j < gimple_call_num_args (call); ++j)
3579 : : {
3580 : 13680 : if (j == else_pos)
3581 : 3420 : continue;
3582 : 10260 : if (gimple_call_arg (call, j) == op.ops[opi])
3583 : 3420 : cnt++;
3584 : : }
3585 : : }
3586 : 98387 : else if (!is_gimple_debug (op_use_stmt)
3587 : 98387 : && ((*code != ERROR_MARK || inner_loop_of_double_reduc)
3588 : 1767 : || flow_bb_inside_loop_p (loop,
3589 : 1767 : gimple_bb (op_use_stmt))))
3590 : 147155 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3591 : 73582 : cnt++;
3592 : 76720 : }
3593 : :
3594 : 76720 : if (cnt != 1)
3595 : : {
3596 : : fail = true;
3597 : : break;
3598 : : }
3599 : : }
3600 : 67517 : return ! fail && ! neg && *code != ERROR_MARK;
3601 : 64285 : }
3602 : :
3603 : : bool
3604 : 21 : check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3605 : : tree loop_arg, enum tree_code code)
3606 : : {
3607 : 21 : auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3608 : 21 : code_helper code_;
3609 : 21 : return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path, false)
3610 : 21 : && code_ == code);
3611 : 21 : }
3612 : :
3613 : :
3614 : :
3615 : : /* Function vect_is_simple_reduction
3616 : :
3617 : : (1) Detect a cross-iteration def-use cycle that represents a simple
3618 : : reduction computation. We look for the following pattern:
3619 : :
3620 : : loop_header:
3621 : : a1 = phi < a0, a2 >
3622 : : a3 = ...
3623 : : a2 = operation (a3, a1)
3624 : :
3625 : : or
3626 : :
3627 : : a3 = ...
3628 : : loop_header:
3629 : : a1 = phi < a0, a2 >
3630 : : a2 = operation (a3, a1)
3631 : :
3632 : : such that:
3633 : : 1. operation is commutative and associative and it is safe to
3634 : : change the order of the computation
3635 : : 2. no uses for a2 in the loop (a2 is used out of the loop)
3636 : : 3. no uses of a1 in the loop besides the reduction operation
3637 : : 4. no uses of a1 outside the loop.
3638 : :
3639 : : Conditions 1,4 are tested here.
3640 : : Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3641 : :
3642 : : (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3643 : : nested cycles.
3644 : :
3645 : : (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3646 : : reductions:
3647 : :
3648 : : a1 = phi < a0, a2 >
3649 : : inner loop (def of a3)
3650 : : a2 = phi < a3 >
3651 : :
3652 : : (4) Detect condition expressions, ie:
3653 : : for (int i = 0; i < N; i++)
3654 : : if (a[i] < val)
3655 : : ret_val = a[i];
3656 : :
3657 : : */
3658 : :
3659 : : static stmt_vec_info
3660 : 85295 : vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3661 : : gphi **double_reduc)
3662 : : {
3663 : 85295 : gphi *phi = as_a <gphi *> (phi_info->stmt);
3664 : 85295 : gimple *phi_use_stmt = NULL;
3665 : 85295 : imm_use_iterator imm_iter;
3666 : 85295 : use_operand_p use_p;
3667 : :
3668 : : /* When double_reduc is NULL we are testing the inner loop of a
3669 : : double reduction. */
3670 : 85295 : bool inner_loop_of_double_reduc = double_reduc == NULL;
3671 : 85295 : if (double_reduc)
3672 : 84313 : *double_reduc = NULL;
3673 : 85295 : STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3674 : :
3675 : 85295 : tree phi_name = PHI_RESULT (phi);
3676 : : /* ??? If there are no uses of the PHI result the inner loop reduction
3677 : : won't be detected as possibly double-reduction by vectorizable_reduction
3678 : : because that tries to walk the PHI arg from the preheader edge which
3679 : : can be constant. See PR60382. */
3680 : 85295 : if (has_zero_uses (phi_name))
3681 : : return NULL;
3682 : 85131 : class loop *loop = (gimple_bb (phi))->loop_father;
3683 : 85131 : unsigned nphi_def_loop_uses = 0;
3684 : 296471 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3685 : : {
3686 : 130013 : gimple *use_stmt = USE_STMT (use_p);
3687 : 130013 : if (is_gimple_debug (use_stmt))
3688 : 31221 : continue;
3689 : :
3690 : 98792 : if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3691 : : {
3692 : 3804 : if (dump_enabled_p ())
3693 : 30 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3694 : : "intermediate value used outside loop.\n");
3695 : :
3696 : 3804 : return NULL;
3697 : : }
3698 : :
3699 : : /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
3700 : : op1 twice (once as definition, once as else) in the same operation.
3701 : : Only count it as one. */
3702 : 94988 : if (use_stmt != phi_use_stmt)
3703 : : {
3704 : 91307 : nphi_def_loop_uses++;
3705 : 91307 : phi_use_stmt = use_stmt;
3706 : : }
3707 : 3804 : }
3708 : :
3709 : 81327 : tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3710 : 81327 : if (TREE_CODE (latch_def) != SSA_NAME)
3711 : : {
3712 : 1231 : if (dump_enabled_p ())
3713 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3714 : : "reduction: not ssa_name: %T\n", latch_def);
3715 : 1231 : return NULL;
3716 : : }
3717 : :
3718 : 80096 : stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3719 : 80096 : if (!def_stmt_info
3720 : 80096 : || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3721 : 134 : return NULL;
3722 : :
3723 : 79962 : bool nested_in_vect_loop
3724 : 79962 : = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3725 : 79962 : unsigned nlatch_def_loop_uses = 0;
3726 : 79962 : auto_vec<gphi *, 3> lcphis;
3727 : 381170 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3728 : : {
3729 : 221246 : gimple *use_stmt = USE_STMT (use_p);
3730 : 221246 : if (is_gimple_debug (use_stmt))
3731 : 61062 : continue;
3732 : 160184 : if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3733 : 88597 : nlatch_def_loop_uses++;
3734 : : else
3735 : : /* We can have more than one loop-closed PHI. */
3736 : 71587 : lcphis.safe_push (as_a <gphi *> (use_stmt));
3737 : 79962 : }
3738 : :
3739 : : /* If we are vectorizing an inner reduction we are executing that
3740 : : in the original order only in case we are not dealing with a
3741 : : double reduction. */
3742 : 79962 : if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3743 : : {
3744 : 2181 : if (dump_enabled_p ())
3745 : 357 : report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3746 : : "detected nested cycle: ");
3747 : 2181 : return def_stmt_info;
3748 : : }
3749 : :
3750 : : /* When the inner loop of a double reduction ends up with more than
3751 : : one loop-closed PHI we have failed to classify alternate such
3752 : : PHIs as double reduction, leading to wrong code. See PR103237. */
3753 : 78751 : if (inner_loop_of_double_reduc && lcphis.length () != 1)
3754 : : {
3755 : 1 : if (dump_enabled_p ())
3756 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3757 : : "unhandle double reduction\n");
3758 : 1 : return NULL;
3759 : : }
3760 : :
3761 : : /* If this isn't a nested cycle or if the nested cycle reduction value
3762 : : is used ouside of the inner loop we cannot handle uses of the reduction
3763 : : value. */
3764 : 77780 : if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3765 : : {
3766 : 12382 : if (dump_enabled_p ())
3767 : 314 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3768 : : "reduction used in loop.\n");
3769 : 12382 : return NULL;
3770 : : }
3771 : :
3772 : : /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3773 : : defined in the inner loop. */
3774 : 65398 : if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3775 : : {
3776 : 1134 : tree op1 = PHI_ARG_DEF (def_stmt, 0);
3777 : 1134 : if (gimple_phi_num_args (def_stmt) != 1
3778 : 1134 : || TREE_CODE (op1) != SSA_NAME)
3779 : : {
3780 : 38 : if (dump_enabled_p ())
3781 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3782 : : "unsupported phi node definition.\n");
3783 : :
3784 : 38 : return NULL;
3785 : : }
3786 : :
3787 : : /* Verify there is an inner cycle composed of the PHI phi_use_stmt
3788 : : and the latch definition op1. */
3789 : 1096 : gimple *def1 = SSA_NAME_DEF_STMT (op1);
3790 : 1096 : if (gimple_bb (def1)
3791 : 1096 : && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3792 : 1096 : && loop->inner
3793 : 1088 : && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3794 : 1088 : && (is_gimple_assign (def1) || is_gimple_call (def1))
3795 : 1079 : && is_a <gphi *> (phi_use_stmt)
3796 : 1068 : && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
3797 : 1068 : && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
3798 : : loop_latch_edge (loop->inner)))
3799 : 2162 : && lcphis.length () == 1)
3800 : : {
3801 : 982 : if (dump_enabled_p ())
3802 : 132 : report_vect_op (MSG_NOTE, def_stmt,
3803 : : "detected double reduction: ");
3804 : :
3805 : 982 : *double_reduc = as_a <gphi *> (phi_use_stmt);
3806 : 982 : return def_stmt_info;
3807 : : }
3808 : :
3809 : 114 : return NULL;
3810 : : }
3811 : :
3812 : : /* Look for the expression computing latch_def from then loop PHI result. */
3813 : 64264 : auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3814 : 64264 : code_helper code;
3815 : 64264 : if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3816 : : path, inner_loop_of_double_reduc))
3817 : : {
3818 : 61032 : STMT_VINFO_REDUC_CODE (phi_info) = code;
3819 : 61032 : if (code == COND_EXPR && !nested_in_vect_loop)
3820 : 4184 : STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3821 : :
3822 : : /* Fill in STMT_VINFO_REDUC_IDX. */
3823 : 61032 : unsigned i;
3824 : 197074 : for (i = path.length () - 1; i >= 1; --i)
3825 : : {
3826 : 75010 : gimple *stmt = USE_STMT (path[i].second);
3827 : 75010 : stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3828 : 75010 : gimple_match_op op;
3829 : 75010 : if (!gimple_extract_op (stmt, &op))
3830 : 0 : gcc_unreachable ();
3831 : 75010 : if (gassign *assign = dyn_cast<gassign *> (stmt))
3832 : 71524 : STMT_VINFO_REDUC_IDX (stmt_info)
3833 : 71524 : = path[i].second->use - gimple_assign_rhs1_ptr (assign);
3834 : : else
3835 : : {
3836 : 3486 : gcall *call = as_a<gcall *> (stmt);
3837 : 3486 : STMT_VINFO_REDUC_IDX (stmt_info)
3838 : 3486 : = path[i].second->use - gimple_call_arg_ptr (call, 0);
3839 : : }
3840 : : }
3841 : 61032 : if (dump_enabled_p ())
3842 : 3695 : dump_printf_loc (MSG_NOTE, vect_location,
3843 : : "reduction: detected reduction\n");
3844 : :
3845 : 61032 : return def_stmt_info;
3846 : : }
3847 : :
3848 : 3232 : if (dump_enabled_p ())
3849 : 80 : dump_printf_loc (MSG_NOTE, vect_location,
3850 : : "reduction: unknown pattern\n");
3851 : :
3852 : : return NULL;
3853 : 144226 : }
3854 : :
3855 : : /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3856 : : PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3857 : : or -1 if not known. */
3858 : :
3859 : : static int
3860 : 347122 : vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
3861 : : {
3862 : 347122 : int assumed_vf = vect_vf_for_cost (loop_vinfo);
3863 : 347122 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
3864 : : {
3865 : 138665 : if (dump_enabled_p ())
3866 : 2800 : dump_printf_loc (MSG_NOTE, vect_location,
3867 : : "cost model: epilogue peel iters set to vf/2 "
3868 : : "because loop iterations are unknown .\n");
3869 : 138665 : return assumed_vf / 2;
3870 : : }
3871 : : else
3872 : : {
3873 : 208457 : int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3874 : 208457 : peel_iters_prologue = MIN (niters, peel_iters_prologue);
3875 : 208457 : int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3876 : : /* If we need to peel for gaps, but no peeling is required, we have to
3877 : : peel VF iterations. */
3878 : 208457 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
3879 : 208457 : peel_iters_epilogue = assumed_vf;
3880 : 208457 : return peel_iters_epilogue;
3881 : : }
3882 : : }
3883 : :
3884 : : /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3885 : : int
3886 : 263901 : vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3887 : : int *peel_iters_epilogue,
3888 : : stmt_vector_for_cost *scalar_cost_vec,
3889 : : stmt_vector_for_cost *prologue_cost_vec,
3890 : : stmt_vector_for_cost *epilogue_cost_vec)
3891 : : {
3892 : 263901 : int retval = 0;
3893 : :
3894 : 263901 : *peel_iters_epilogue
3895 : 263901 : = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
3896 : :
3897 : 263901 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3898 : : {
3899 : : /* If peeled iterations are known but number of scalar loop
3900 : : iterations are unknown, count a taken branch per peeled loop. */
3901 : 88842 : if (peel_iters_prologue > 0)
3902 : 53200 : retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3903 : : vect_prologue);
3904 : 88842 : if (*peel_iters_epilogue > 0)
3905 : 88767 : retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3906 : : vect_epilogue);
3907 : : }
3908 : :
3909 : 263901 : stmt_info_for_cost *si;
3910 : 263901 : int j;
3911 : 263901 : if (peel_iters_prologue)
3912 : 650381 : FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3913 : 539181 : retval += record_stmt_cost (prologue_cost_vec,
3914 : 539181 : si->count * peel_iters_prologue,
3915 : : si->kind, si->stmt_info, si->misalign,
3916 : : vect_prologue);
3917 : 263901 : if (*peel_iters_epilogue)
3918 : 1014384 : FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3919 : 838767 : retval += record_stmt_cost (epilogue_cost_vec,
3920 : 838767 : si->count * *peel_iters_epilogue,
3921 : : si->kind, si->stmt_info, si->misalign,
3922 : : vect_epilogue);
3923 : :
3924 : 263901 : return retval;
3925 : : }
3926 : :
3927 : : /* Function vect_estimate_min_profitable_iters
3928 : :
3929 : : Return the number of iterations required for the vector version of the
3930 : : loop to be profitable relative to the cost of the scalar version of the
3931 : : loop.
3932 : :
3933 : : *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3934 : : of iterations for vectorization. -1 value means loop vectorization
3935 : : is not profitable. This returned value may be used for dynamic
3936 : : profitability check.
3937 : :
3938 : : *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3939 : : for static check against estimated number of iterations. */
3940 : :
3941 : : static void
3942 : 99768 : vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3943 : : int *ret_min_profitable_niters,
3944 : : int *ret_min_profitable_estimate,
3945 : : unsigned *suggested_unroll_factor)
3946 : : {
3947 : 99768 : int min_profitable_iters;
3948 : 99768 : int min_profitable_estimate;
3949 : 99768 : int peel_iters_prologue;
3950 : 99768 : int peel_iters_epilogue;
3951 : 99768 : unsigned vec_inside_cost = 0;
3952 : 99768 : int vec_outside_cost = 0;
3953 : 99768 : unsigned vec_prologue_cost = 0;
3954 : 99768 : unsigned vec_epilogue_cost = 0;
3955 : 99768 : int scalar_single_iter_cost = 0;
3956 : 99768 : int scalar_outside_cost = 0;
3957 : 99768 : int assumed_vf = vect_vf_for_cost (loop_vinfo);
3958 : 99768 : int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3959 : 99768 : vector_costs *target_cost_data = loop_vinfo->vector_costs;
3960 : :
3961 : : /* Cost model disabled. */
3962 : 99768 : if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3963 : : {
3964 : 16342 : if (dump_enabled_p ())
3965 : 10063 : dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3966 : 16342 : *ret_min_profitable_niters = 0;
3967 : 16342 : *ret_min_profitable_estimate = 0;
3968 : 16342 : return;
3969 : : }
3970 : :
3971 : : /* Requires loop versioning tests to handle misalignment. */
3972 : 83426 : if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3973 : : {
3974 : : /* FIXME: Make cost depend on complexity of individual check. */
3975 : 28 : unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3976 : 28 : (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
3977 : 28 : if (dump_enabled_p ())
3978 : 1 : dump_printf (MSG_NOTE,
3979 : : "cost model: Adding cost of checks for loop "
3980 : : "versioning to treat misalignment.\n");
3981 : : }
3982 : :
3983 : : /* Requires loop versioning with alias checks. */
3984 : 83426 : if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3985 : : {
3986 : : /* FIXME: Make cost depend on complexity of individual check. */
3987 : 4044 : unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3988 : 4044 : (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
3989 : 4044 : len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3990 : 0 : if (len)
3991 : : /* Count LEN - 1 ANDs and LEN comparisons. */
3992 : 0 : (void) add_stmt_cost (target_cost_data, len * 2 - 1,
3993 : : scalar_stmt, vect_prologue);
3994 : 4044 : len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3995 : 1090 : if (len)
3996 : : {
3997 : : /* Count LEN - 1 ANDs and LEN comparisons. */
3998 : 1090 : unsigned int nstmts = len * 2 - 1;
3999 : : /* +1 for each bias that needs adding. */
4000 : 2180 : for (unsigned int i = 0; i < len; ++i)
4001 : 1090 : if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4002 : 123 : nstmts += 1;
4003 : 1090 : (void) add_stmt_cost (target_cost_data, nstmts,
4004 : : scalar_stmt, vect_prologue);
4005 : : }
4006 : 4044 : if (dump_enabled_p ())
4007 : 16 : dump_printf (MSG_NOTE,
4008 : : "cost model: Adding cost of checks for loop "
4009 : : "versioning aliasing.\n");
4010 : : }
4011 : :
4012 : : /* Requires loop versioning with niter checks. */
4013 : 83426 : if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4014 : : {
4015 : : /* FIXME: Make cost depend on complexity of individual check. */
4016 : 664 : (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4017 : : NULL, NULL, NULL_TREE, 0, vect_prologue);
4018 : 664 : if (dump_enabled_p ())
4019 : 1 : dump_printf (MSG_NOTE,
4020 : : "cost model: Adding cost of checks for loop "
4021 : : "versioning niters.\n");
4022 : : }
4023 : :
4024 : 83426 : if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4025 : 4722 : (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4026 : : vect_prologue);
4027 : :
4028 : : /* Count statements in scalar loop. Using this as scalar cost for a single
4029 : : iteration for now.
4030 : :
4031 : : TODO: Add outer loop support.
4032 : :
4033 : : TODO: Consider assigning different costs to different scalar
4034 : : statements. */
4035 : :
4036 : 83426 : scalar_single_iter_cost = (loop_vinfo->scalar_costs->total_cost ()
4037 : 83426 : * param_vect_scalar_cost_multiplier) / 100;
4038 : :
4039 : : /* Add additional cost for the peeled instructions in prologue and epilogue
4040 : : loop. (For fully-masked loops there will be no peeling.)
4041 : :
4042 : : FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4043 : : at compile-time - we assume it's vf/2 (the worst would be vf-1).
4044 : :
4045 : : TODO: Build an expression that represents peel_iters for prologue and
4046 : : epilogue to be used in a run-time test. */
4047 : :
4048 : 83426 : bool prologue_need_br_taken_cost = false;
4049 : 83426 : bool prologue_need_br_not_taken_cost = false;
4050 : :
4051 : : /* Calculate peel_iters_prologue. */
4052 : 83426 : if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4053 : : peel_iters_prologue = 0;
4054 : 83426 : else if (npeel < 0)
4055 : : {
4056 : 183 : peel_iters_prologue = assumed_vf / 2;
4057 : 183 : if (dump_enabled_p ())
4058 : 4 : dump_printf (MSG_NOTE, "cost model: "
4059 : : "prologue peel iters set to vf/2.\n");
4060 : :
4061 : : /* If peeled iterations are unknown, count a taken branch and a not taken
4062 : : branch per peeled loop. Even if scalar loop iterations are known,
4063 : : vector iterations are not known since peeled prologue iterations are
4064 : : not known. Hence guards remain the same. */
4065 : : prologue_need_br_taken_cost = true;
4066 : : prologue_need_br_not_taken_cost = true;
4067 : : }
4068 : : else
4069 : : {
4070 : 83243 : peel_iters_prologue = npeel;
4071 : 83243 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4072 : : /* If peeled iterations are known but number of scalar loop
4073 : : iterations are unknown, count a taken branch per peeled loop. */
4074 : 83426 : prologue_need_br_taken_cost = true;
4075 : : }
4076 : :
4077 : 83426 : bool epilogue_need_br_taken_cost = false;
4078 : 83426 : bool epilogue_need_br_not_taken_cost = false;
4079 : :
4080 : : /* Calculate peel_iters_epilogue. */
4081 : 83426 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4082 : : /* We need to peel exactly one iteration for gaps. */
4083 : 22 : peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4084 : 83404 : else if (npeel < 0)
4085 : : {
4086 : : /* If peeling for alignment is unknown, loop bound of main loop
4087 : : becomes unknown. */
4088 : 183 : peel_iters_epilogue = assumed_vf / 2;
4089 : 183 : if (dump_enabled_p ())
4090 : 4 : dump_printf (MSG_NOTE, "cost model: "
4091 : : "epilogue peel iters set to vf/2 because "
4092 : : "peeling for alignment is unknown.\n");
4093 : :
4094 : : /* See the same reason above in peel_iters_prologue calculation. */
4095 : : epilogue_need_br_taken_cost = true;
4096 : : epilogue_need_br_not_taken_cost = true;
4097 : : }
4098 : : else
4099 : : {
4100 : 83221 : peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4101 : 83221 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4102 : : /* If peeled iterations are known but number of scalar loop
4103 : : iterations are unknown, count a taken branch per peeled loop. */
4104 : 83426 : epilogue_need_br_taken_cost = true;
4105 : : }
4106 : :
4107 : 83426 : stmt_info_for_cost *si;
4108 : 83426 : int j;
4109 : : /* Add costs associated with peel_iters_prologue. */
4110 : 83426 : if (peel_iters_prologue)
4111 : 871 : FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4112 : : {
4113 : 679 : (void) add_stmt_cost (target_cost_data,
4114 : 679 : si->count * peel_iters_prologue, si->kind,
4115 : : si->stmt_info, si->node, si->vectype,
4116 : : si->misalign, vect_prologue);
4117 : : }
4118 : :
4119 : : /* Add costs associated with peel_iters_epilogue. */
4120 : 83426 : if (peel_iters_epilogue)
4121 : 283442 : FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4122 : : {
4123 : 224501 : (void) add_stmt_cost (target_cost_data,
4124 : 224501 : si->count * peel_iters_epilogue, si->kind,
4125 : : si->stmt_info, si->node, si->vectype,
4126 : : si->misalign, vect_epilogue);
4127 : : }
4128 : :
4129 : : /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4130 : :
4131 : 83426 : if (prologue_need_br_taken_cost)
4132 : 184 : (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4133 : : vect_prologue);
4134 : :
4135 : 83426 : if (prologue_need_br_not_taken_cost)
4136 : 183 : (void) add_stmt_cost (target_cost_data, 1,
4137 : : cond_branch_not_taken, vect_prologue);
4138 : :
4139 : 83426 : if (epilogue_need_br_taken_cost)
4140 : 49419 : (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4141 : : vect_epilogue);
4142 : :
4143 : 83426 : if (epilogue_need_br_not_taken_cost)
4144 : 183 : (void) add_stmt_cost (target_cost_data, 1,
4145 : : cond_branch_not_taken, vect_epilogue);
4146 : :
4147 : : /* Take care of special costs for rgroup controls of partial vectors. */
4148 : 22 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4149 : 83448 : && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4150 : : == vect_partial_vectors_avx512))
4151 : : {
4152 : : /* Calculate how many masks we need to generate. */
4153 : 22 : unsigned int num_masks = 0;
4154 : 22 : bool need_saturation = false;
4155 : 90 : for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
4156 : 24 : if (rgm.type)
4157 : : {
4158 : 22 : unsigned nvectors = rgm.factor;
4159 : 22 : num_masks += nvectors;
4160 : 22 : if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
4161 : 22 : < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
4162 : 7 : need_saturation = true;
4163 : : }
4164 : :
4165 : : /* ??? The target isn't able to identify the costs below as
4166 : : producing masks so it cannot penaltize cases where we'd run
4167 : : out of mask registers for example. */
4168 : :
4169 : : /* ??? We are also failing to account for smaller vector masks
4170 : : we generate by splitting larger masks in vect_get_loop_mask. */
4171 : :
4172 : : /* In the worst case, we need to generate each mask in the prologue
4173 : : and in the loop body. We need one splat per group and one
4174 : : compare per mask.
4175 : :
4176 : : Sometimes the prologue mask will fold to a constant,
4177 : : so the actual prologue cost might be smaller. However, it's
4178 : : simpler and safer to use the worst-case cost; if this ends up
4179 : : being the tie-breaker between vectorizing or not, then it's
4180 : : probably better not to vectorize. */
4181 : 22 : (void) add_stmt_cost (target_cost_data,
4182 : : num_masks
4183 : 22 : + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4184 : : vector_stmt, NULL, NULL, NULL_TREE, 0,
4185 : : vect_prologue);
4186 : 44 : (void) add_stmt_cost (target_cost_data,
4187 : : num_masks
4188 : 44 : + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4189 : : vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);
4190 : :
4191 : : /* When we need saturation we need it both in the prologue and
4192 : : the epilogue. */
4193 : 22 : if (need_saturation)
4194 : : {
4195 : 7 : (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4196 : : NULL, NULL, NULL_TREE, 0, vect_prologue);
4197 : 7 : (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4198 : : NULL, NULL, NULL_TREE, 0, vect_body);
4199 : : }
4200 : : }
4201 : 0 : else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4202 : 83404 : && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4203 : : == vect_partial_vectors_while_ult))
4204 : : {
4205 : : /* Calculate how many masks we need to generate. */
4206 : : unsigned int num_masks = 0;
4207 : : rgroup_controls *rgm;
4208 : : unsigned int num_vectors_m1;
4209 : 0 : FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
4210 : : num_vectors_m1, rgm)
4211 : 0 : if (rgm->type)
4212 : 0 : num_masks += num_vectors_m1 + 1;
4213 : 0 : gcc_assert (num_masks > 0);
4214 : :
4215 : : /* In the worst case, we need to generate each mask in the prologue
4216 : : and in the loop body. One of the loop body mask instructions
4217 : : replaces the comparison in the scalar loop, and since we don't
4218 : : count the scalar comparison against the scalar body, we shouldn't
4219 : : count that vector instruction against the vector body either.
4220 : :
4221 : : Sometimes we can use unpacks instead of generating prologue
4222 : : masks and sometimes the prologue mask will fold to a constant,
4223 : : so the actual prologue cost might be smaller. However, it's
4224 : : simpler and safer to use the worst-case cost; if this ends up
4225 : : being the tie-breaker between vectorizing or not, then it's
4226 : : probably better not to vectorize. */
4227 : 0 : (void) add_stmt_cost (target_cost_data, num_masks,
4228 : : vector_stmt, NULL, NULL, NULL_TREE, 0,
4229 : : vect_prologue);
4230 : 0 : (void) add_stmt_cost (target_cost_data, num_masks - 1,
4231 : : vector_stmt, NULL, NULL, NULL_TREE, 0,
4232 : : vect_body);
4233 : : }
4234 : 83404 : else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4235 : : {
4236 : : /* Referring to the functions vect_set_loop_condition_partial_vectors
4237 : : and vect_set_loop_controls_directly, we need to generate each
4238 : : length in the prologue and in the loop body if required. Although
4239 : : there are some possible optimizations, we consider the worst case
4240 : : here. */
4241 : :
4242 : 0 : bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4243 : 0 : signed char partial_load_store_bias
4244 : : = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4245 : 0 : bool need_iterate_p
4246 : 0 : = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4247 : 0 : && !vect_known_niters_smaller_than_vf (loop_vinfo));
4248 : :
4249 : : /* Calculate how many statements to be added. */
4250 : 0 : unsigned int prologue_stmts = 0;
4251 : 0 : unsigned int body_stmts = 0;
4252 : :
4253 : 0 : rgroup_controls *rgc;
4254 : 0 : unsigned int num_vectors_m1;
4255 : 0 : FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4256 : 0 : if (rgc->type)
4257 : : {
4258 : : /* May need one SHIFT for nitems_total computation. */
4259 : 0 : unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4260 : 0 : if (nitems != 1 && !niters_known_p)
4261 : 0 : prologue_stmts += 1;
4262 : :
4263 : : /* May need one MAX and one MINUS for wrap around. */
4264 : 0 : if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4265 : 0 : prologue_stmts += 2;
4266 : :
4267 : : /* Need one MAX and one MINUS for each batch limit excepting for
4268 : : the 1st one. */
4269 : 0 : prologue_stmts += num_vectors_m1 * 2;
4270 : :
4271 : 0 : unsigned int num_vectors = num_vectors_m1 + 1;
4272 : :
4273 : : /* Need to set up lengths in prologue, only one MIN required
4274 : : for each since start index is zero. */
4275 : 0 : prologue_stmts += num_vectors;
4276 : :
4277 : : /* If we have a non-zero partial load bias, we need one PLUS
4278 : : to adjust the load length. */
4279 : 0 : if (partial_load_store_bias != 0)
4280 : 0 : body_stmts += 1;
4281 : :
4282 : 0 : unsigned int length_update_cost = 0;
4283 : 0 : if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
4284 : : /* For decrement IV style, Each only need a single SELECT_VL
4285 : : or MIN since beginning to calculate the number of elements
4286 : : need to be processed in current iteration. */
4287 : : length_update_cost = 1;
4288 : : else
4289 : : /* For increment IV stype, Each may need two MINs and one MINUS to
4290 : : update lengths in body for next iteration. */
4291 : 0 : length_update_cost = 3;
4292 : :
4293 : 0 : if (need_iterate_p)
4294 : 0 : body_stmts += length_update_cost * num_vectors;
4295 : : }
4296 : :
4297 : 0 : (void) add_stmt_cost (target_cost_data, prologue_stmts,
4298 : : scalar_stmt, vect_prologue);
4299 : 0 : (void) add_stmt_cost (target_cost_data, body_stmts,
4300 : : scalar_stmt, vect_body);
4301 : : }
4302 : :
4303 : : /* FORNOW: The scalar outside cost is incremented in one of the
4304 : : following ways:
4305 : :
4306 : : 1. The vectorizer checks for alignment and aliasing and generates
4307 : : a condition that allows dynamic vectorization. A cost model
4308 : : check is ANDED with the versioning condition. Hence scalar code
4309 : : path now has the added cost of the versioning check.
4310 : :
4311 : : if (cost > th & versioning_check)
4312 : : jmp to vector code
4313 : :
4314 : : Hence run-time scalar is incremented by not-taken branch cost.
4315 : :
4316 : : 2. The vectorizer then checks if a prologue is required. If the
4317 : : cost model check was not done before during versioning, it has to
4318 : : be done before the prologue check.
4319 : :
4320 : : if (cost <= th)
4321 : : prologue = scalar_iters
4322 : : if (prologue == 0)
4323 : : jmp to vector code
4324 : : else
4325 : : execute prologue
4326 : : if (prologue == num_iters)
4327 : : go to exit
4328 : :
4329 : : Hence the run-time scalar cost is incremented by a taken branch,
4330 : : plus a not-taken branch, plus a taken branch cost.
4331 : :
4332 : : 3. The vectorizer then checks if an epilogue is required. If the
4333 : : cost model check was not done before during prologue check, it
4334 : : has to be done with the epilogue check.
4335 : :
4336 : : if (prologue == 0)
4337 : : jmp to vector code
4338 : : else
4339 : : execute prologue
4340 : : if (prologue == num_iters)
4341 : : go to exit
4342 : : vector code:
4343 : : if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4344 : : jmp to epilogue
4345 : :
4346 : : Hence the run-time scalar cost should be incremented by 2 taken
4347 : : branches.
4348 : :
4349 : : TODO: The back end may reorder the BBS's differently and reverse
4350 : : conditions/branch directions. Change the estimates below to
4351 : : something more reasonable. */
4352 : :
4353 : : /* If the number of iterations is known and we do not do versioning, we can
4354 : : decide whether to vectorize at compile time. Hence the scalar version
4355 : : do not carry cost model guard costs. */
4356 : 33442 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4357 : 116868 : || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4358 : : {
4359 : : /* Cost model check occurs at versioning. */
4360 : 50588 : if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4361 : 4722 : scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4362 : : else
4363 : : {
4364 : : /* Cost model check occurs at prologue generation. */
4365 : 45866 : if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4366 : 38 : scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4367 : 38 : + vect_get_stmt_cost (cond_branch_not_taken);
4368 : : /* Cost model check occurs at epilogue generation. */
4369 : : else
4370 : 45828 : scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4371 : : }
4372 : : }
4373 : :
4374 : : /* Complete the target-specific cost calculations. */
4375 : 83426 : loop_vinfo->vector_costs->finish_cost (loop_vinfo->scalar_costs);
4376 : 83426 : vec_prologue_cost = loop_vinfo->vector_costs->prologue_cost ();
4377 : 83426 : vec_inside_cost = loop_vinfo->vector_costs->body_cost ();
4378 : 83426 : vec_epilogue_cost = loop_vinfo->vector_costs->epilogue_cost ();
4379 : 83426 : if (suggested_unroll_factor)
4380 : 83239 : *suggested_unroll_factor
4381 : 83239 : = loop_vinfo->vector_costs->suggested_unroll_factor ();
4382 : :
4383 : 83239 : if (suggested_unroll_factor && *suggested_unroll_factor > 1
4384 : 233 : && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
4385 : 0 : && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
4386 : : *suggested_unroll_factor,
4387 : : LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
4388 : : {
4389 : 0 : if (dump_enabled_p ())
4390 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4391 : : "can't unroll as unrolled vectorization factor larger"
4392 : : " than maximum vectorization factor: "
4393 : : HOST_WIDE_INT_PRINT_UNSIGNED "\n",
4394 : : LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
4395 : 0 : *suggested_unroll_factor = 1;
4396 : : }
4397 : :
4398 : 83426 : vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4399 : :
4400 : 83426 : if (dump_enabled_p ())
4401 : : {
4402 : 627 : dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4403 : 627 : dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4404 : : vec_inside_cost);
4405 : 627 : dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4406 : : vec_prologue_cost);
4407 : 627 : dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4408 : : vec_epilogue_cost);
4409 : 627 : dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4410 : : scalar_single_iter_cost);
4411 : 627 : dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4412 : : scalar_outside_cost);
4413 : 627 : dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4414 : : vec_outside_cost);
4415 : 627 : dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4416 : : peel_iters_prologue);
4417 : 627 : dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4418 : : peel_iters_epilogue);
4419 : : }
4420 : :
4421 : : /* Calculate number of iterations required to make the vector version
4422 : : profitable, relative to the loop bodies only. The following condition
4423 : : must hold true:
4424 : : SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4425 : : where
4426 : : SIC = scalar iteration cost, VIC = vector iteration cost,
4427 : : VOC = vector outside cost, VF = vectorization factor,
4428 : : NPEEL = prologue iterations + epilogue iterations,
4429 : : SOC = scalar outside cost for run time cost model check. */
4430 : :
4431 : 83426 : int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4432 : 83426 : - vec_inside_cost);
4433 : 83426 : if (saving_per_viter <= 0)
4434 : : {
4435 : 25793 : if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4436 : 0 : warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4437 : : "vectorization did not happen for a simd loop");
4438 : :
4439 : 25793 : if (dump_enabled_p ())
4440 : 22 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4441 : : "cost model: the vector iteration cost = %d "
4442 : : "divided by the scalar iteration cost = %d "
4443 : : "is greater or equal to the vectorization factor = %d"
4444 : : ".\n",
4445 : : vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4446 : 25793 : *ret_min_profitable_niters = -1;
4447 : 25793 : *ret_min_profitable_estimate = -1;
4448 : 25793 : return;
4449 : : }
4450 : :
4451 : : /* ??? The "if" arm is written to handle all cases; see below for what
4452 : : we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4453 : 57633 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4454 : : {
4455 : : /* Rewriting the condition above in terms of the number of
4456 : : vector iterations (vniters) rather than the number of
4457 : : scalar iterations (niters) gives:
4458 : :
4459 : : SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4460 : :
4461 : : <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4462 : :
4463 : : For integer N, X and Y when X > 0:
4464 : :
4465 : : N * X > Y <==> N >= (Y /[floor] X) + 1. */
4466 : 14 : int outside_overhead = (vec_outside_cost
4467 : 14 : - scalar_single_iter_cost * peel_iters_prologue
4468 : 14 : - scalar_single_iter_cost * peel_iters_epilogue
4469 : : - scalar_outside_cost);
4470 : : /* We're only interested in cases that require at least one
4471 : : vector iteration. */
4472 : 14 : int min_vec_niters = 1;
4473 : 14 : if (outside_overhead > 0)
4474 : 11 : min_vec_niters = outside_overhead / saving_per_viter + 1;
4475 : :
4476 : 14 : if (dump_enabled_p ())
4477 : 6 : dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
4478 : : min_vec_niters);
4479 : :
4480 : 14 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4481 : : {
4482 : : /* Now that we know the minimum number of vector iterations,
4483 : : find the minimum niters for which the scalar cost is larger:
4484 : :
4485 : : SIC * niters > VIC * vniters + VOC - SOC
4486 : :
4487 : : We know that the minimum niters is no more than
4488 : : vniters * VF + NPEEL, but it might be (and often is) less
4489 : : than that if a partial vector iteration is cheaper than the
4490 : : equivalent scalar code. */
4491 : 14 : int threshold = (vec_inside_cost * min_vec_niters
4492 : 14 : + vec_outside_cost
4493 : 14 : - scalar_outside_cost);
4494 : 14 : if (threshold <= 0)
4495 : : min_profitable_iters = 1;
4496 : : else
4497 : 14 : min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4498 : : }
4499 : : else
4500 : : /* Convert the number of vector iterations into a number of
4501 : : scalar iterations. */
4502 : 0 : min_profitable_iters = (min_vec_niters * assumed_vf
4503 : 0 : + peel_iters_prologue
4504 : : + peel_iters_epilogue);
4505 : : }
4506 : : else
4507 : : {
4508 : 57619 : min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4509 : 57619 : * assumed_vf
4510 : 57619 : - vec_inside_cost * peel_iters_prologue
4511 : 57619 : - vec_inside_cost * peel_iters_epilogue);
4512 : 57619 : if (min_profitable_iters <= 0)
4513 : : min_profitable_iters = 0;
4514 : : else
4515 : : {
4516 : 48588 : min_profitable_iters /= saving_per_viter;
4517 : :
4518 : 48588 : if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4519 : 48588 : <= (((int) vec_inside_cost * min_profitable_iters)
4520 : 48588 : + (((int) vec_outside_cost - scalar_outside_cost)
4521 : : * assumed_vf)))
4522 : 48588 : min_profitable_iters++;
4523 : : }
4524 : : }
4525 : :
4526 : 57633 : if (dump_enabled_p ())
4527 : 605 : dump_printf (MSG_NOTE,
4528 : : " Calculated minimum iters for profitability: %d\n",
4529 : : min_profitable_iters);
4530 : :
4531 : 57633 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4532 : 57619 : && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4533 : : /* We want the vectorized loop to execute at least once. */
4534 : : min_profitable_iters = assumed_vf + peel_iters_prologue;
4535 : 10580 : else if (min_profitable_iters < peel_iters_prologue)
4536 : : /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4537 : : vectorized loop executes at least once. */
4538 : : min_profitable_iters = peel_iters_prologue;
4539 : :
4540 : 57633 : if (dump_enabled_p ())
4541 : 605 : dump_printf_loc (MSG_NOTE, vect_location,
4542 : : " Runtime profitability threshold = %d\n",
4543 : : min_profitable_iters);
4544 : :
4545 : 57633 : *ret_min_profitable_niters = min_profitable_iters;
4546 : :
4547 : : /* Calculate number of iterations required to make the vector version
4548 : : profitable, relative to the loop bodies only.
4549 : :
4550 : : Non-vectorized variant is SIC * niters and it must win over vector
4551 : : variant on the expected loop trip count. The following condition must hold true:
4552 : : SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
4553 : :
4554 : 57633 : if (vec_outside_cost <= 0)
4555 : : min_profitable_estimate = 0;
4556 : : /* ??? This "else if" arm is written to handle all cases; see below for
4557 : : what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4558 : 52252 : else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4559 : : {
4560 : : /* This is a repeat of the code above, but with + SOC rather
4561 : : than - SOC. */
4562 : 14 : int outside_overhead = (vec_outside_cost
4563 : 14 : - scalar_single_iter_cost * peel_iters_prologue
4564 : 14 : - scalar_single_iter_cost * peel_iters_epilogue
4565 : : + scalar_outside_cost);
4566 : 14 : int min_vec_niters = 1;
4567 : 14 : if (outside_overhead > 0)
4568 : 14 : min_vec_niters = outside_overhead / saving_per_viter + 1;
4569 : :
4570 : 14 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4571 : : {
4572 : 14 : int threshold = (vec_inside_cost * min_vec_niters
4573 : 14 : + vec_outside_cost
4574 : 14 : + scalar_outside_cost);
4575 : 14 : min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4576 : : }
4577 : : else
4578 : : min_profitable_estimate = (min_vec_niters * assumed_vf
4579 : : + peel_iters_prologue
4580 : : + peel_iters_epilogue);
4581 : : }
4582 : : else
4583 : : {
4584 : 52238 : min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4585 : 52238 : * assumed_vf
4586 : 52238 : - vec_inside_cost * peel_iters_prologue
4587 : 52238 : - vec_inside_cost * peel_iters_epilogue)
4588 : 52238 : / ((scalar_single_iter_cost * assumed_vf)
4589 : : - vec_inside_cost);
4590 : : }
4591 : 57633 : min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4592 : 57633 : if (dump_enabled_p ())
4593 : 605 : dump_printf_loc (MSG_NOTE, vect_location,
4594 : : " Static estimate profitability threshold = %d\n",
4595 : : min_profitable_estimate);
4596 : :
4597 : 57633 : *ret_min_profitable_estimate = min_profitable_estimate;
4598 : : }
4599 : :
4600 : : /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4601 : : vector elements (not bits) for a vector with NELT elements. */
4602 : : static void
4603 : 2137 : calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4604 : : vec_perm_builder *sel)
4605 : : {
4606 : : /* The encoding is a single stepped pattern. Any wrap-around is handled
4607 : : by vec_perm_indices. */
4608 : 2137 : sel->new_vector (nelt, 1, 3);
4609 : 8548 : for (unsigned int i = 0; i < 3; i++)
4610 : 6411 : sel->quick_push (i + offset);
4611 : 2137 : }
4612 : :
4613 : : /* Checks whether the target supports whole-vector shifts for vectors of mode
4614 : : MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4615 : : it supports vec_perm_const with masks for all necessary shift amounts. */
4616 : : static bool
4617 : 7502 : have_whole_vector_shift (machine_mode mode)
4618 : : {
4619 : 7502 : if (can_implement_p (vec_shr_optab, mode))
4620 : : return true;
4621 : :
4622 : : /* Variable-length vectors should be handled via the optab. */
4623 : 61 : unsigned int nelt;
4624 : 122 : if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4625 : : return false;
4626 : :
4627 : 61 : vec_perm_builder sel;
4628 : 61 : vec_perm_indices indices;
4629 : 307 : for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4630 : : {
4631 : 246 : calc_vec_perm_mask_for_shift (i, nelt, &sel);
4632 : 246 : indices.new_vector (sel, 2, nelt);
4633 : 246 : if (!can_vec_perm_const_p (mode, mode, indices, false))
4634 : : return false;
4635 : : }
4636 : : return true;
4637 : 61 : }
4638 : :
4639 : : /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
4640 : : multiplication operands have differing signs and (b) we intend
4641 : : to emulate the operation using a series of signed DOT_PROD_EXPRs.
4642 : : See vect_emulate_mixed_dot_prod for the actual sequence used. */
4643 : :
4644 : : static bool
4645 : 2179 : vect_is_emulated_mixed_dot_prod (slp_tree slp_node)
4646 : : {
4647 : 2179 : stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (slp_node);
4648 : 2179 : gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
4649 : 1726 : if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
4650 : : return false;
4651 : :
4652 : 589 : tree rhs1 = gimple_assign_rhs1 (assign);
4653 : 589 : tree rhs2 = gimple_assign_rhs2 (assign);
4654 : 589 : if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
4655 : : return false;
4656 : :
4657 : 435 : return !directly_supported_p (DOT_PROD_EXPR,
4658 : : SLP_TREE_VECTYPE (slp_node),
4659 : 145 : SLP_TREE_VECTYPE
4660 : : (SLP_TREE_CHILDREN (slp_node)[0]),
4661 : 145 : optab_vector_mixed_sign);
4662 : : }
4663 : :
4664 : : /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4665 : : functions. Design better to avoid maintenance issues. */
4666 : :
4667 : : /* Function vect_model_reduction_cost.
4668 : :
4669 : : Models cost for a reduction operation, including the vector ops
4670 : : generated within the strip-mine loop in some cases, the initial
4671 : : definition before the loop, and the epilogue code that must be generated. */
4672 : :
4673 : : static void
4674 : 46141 : vect_model_reduction_cost (loop_vec_info loop_vinfo,
4675 : : slp_tree node, internal_fn reduc_fn,
4676 : : vect_reduction_type reduction_type,
4677 : : int ncopies, stmt_vector_for_cost *cost_vec)
4678 : : {
4679 : 46141 : int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
4680 : 46141 : tree vectype;
4681 : 46141 : machine_mode mode;
4682 : 46141 : class loop *loop = NULL;
4683 : :
4684 : 46141 : if (loop_vinfo)
4685 : 46141 : loop = LOOP_VINFO_LOOP (loop_vinfo);
4686 : :
4687 : : /* Condition reductions generate two reductions in the loop. */
4688 : 46141 : if (reduction_type == COND_REDUCTION)
4689 : 283 : ncopies *= 2;
4690 : :
4691 : 46141 : vectype = SLP_TREE_VECTYPE (node);
4692 : 46141 : mode = TYPE_MODE (vectype);
4693 : 46141 : stmt_vec_info orig_stmt_info
4694 : 46141 : = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
4695 : :
4696 : 46141 : gimple_match_op op;
4697 : 46141 : if (!gimple_extract_op (orig_stmt_info->stmt, &op))
4698 : 0 : gcc_unreachable ();
4699 : :
4700 : 46141 : if (reduction_type == EXTRACT_LAST_REDUCTION)
4701 : : /* No extra instructions are needed in the prologue. The loop body
4702 : : operations are costed in vectorizable_condition. */
4703 : : inside_cost = 0;
4704 : 46141 : else if (reduction_type == FOLD_LEFT_REDUCTION)
4705 : : {
4706 : : /* No extra instructions needed in the prologue. */
4707 : 3927 : prologue_cost = 0;
4708 : :
4709 : 3927 : if (reduc_fn != IFN_LAST)
4710 : : /* Count one reduction-like operation per vector. */
4711 : 0 : inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4712 : : node, 0, vect_body);
4713 : : else
4714 : : {
4715 : : /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4716 : 3927 : unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4717 : 3927 : inside_cost = record_stmt_cost (cost_vec, nelements,
4718 : : vec_to_scalar, node, 0,
4719 : : vect_body);
4720 : 3927 : inside_cost += record_stmt_cost (cost_vec, nelements,
4721 : : scalar_stmt, node, 0,
4722 : : vect_body);
4723 : : }
4724 : : }
4725 : : else
4726 : : {
4727 : : /* Add in the cost of the initial definitions. */
4728 : 42214 : int prologue_stmts;
4729 : 42214 : if (reduction_type == COND_REDUCTION)
4730 : : /* For cond reductions we have four vectors: initial index, step,
4731 : : initial result of the data reduction, initial value of the index
4732 : : reduction. */
4733 : : prologue_stmts = 4;
4734 : : else
4735 : : /* We need the initial reduction value. */
4736 : 41931 : prologue_stmts = 1;
4737 : 42214 : prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4738 : : scalar_to_vec, node, 0,
4739 : : vect_prologue);
4740 : : }
4741 : :
4742 : : /* Determine cost of epilogue code.
4743 : :
4744 : : We have a reduction operator that will reduce the vector in one statement.
4745 : : Also requires scalar extract. */
4746 : :
4747 : 46141 : if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4748 : : {
4749 : 46003 : if (reduc_fn != IFN_LAST)
4750 : : {
4751 : 34756 : if (reduction_type == COND_REDUCTION)
4752 : : {
4753 : : /* An EQ stmt and an COND_EXPR stmt. */
4754 : 8 : epilogue_cost += record_stmt_cost (cost_vec, 2,
4755 : : vector_stmt, node, 0,
4756 : : vect_epilogue);
4757 : : /* Reduction of the max index and a reduction of the found
4758 : : values. */
4759 : 8 : epilogue_cost += record_stmt_cost (cost_vec, 2,
4760 : : vec_to_scalar, node, 0,
4761 : : vect_epilogue);
4762 : : /* A broadcast of the max value. */
4763 : 8 : epilogue_cost += record_stmt_cost (cost_vec, 1,
4764 : : scalar_to_vec, node, 0,
4765 : : vect_epilogue);
4766 : : }
4767 : : else
4768 : : {
4769 : 34748 : epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4770 : : node, 0, vect_epilogue);
4771 : 34748 : epilogue_cost += record_stmt_cost (cost_vec, 1,
4772 : : vec_to_scalar, node, 0,
4773 : : vect_epilogue);
4774 : : }
4775 : : }
4776 : 11247 : else if (reduction_type == COND_REDUCTION)
4777 : : {
4778 : 275 : unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4779 : : /* Extraction of scalar elements. */
4780 : 550 : epilogue_cost += record_stmt_cost (cost_vec,
4781 : 275 : 2 * estimated_nunits,
4782 : : vec_to_scalar, node, 0,
4783 : : vect_epilogue);
4784 : : /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4785 : 275 : epilogue_cost += record_stmt_cost (cost_vec,
4786 : 275 : 2 * estimated_nunits - 3,
4787 : : scalar_stmt, node, 0,
4788 : : vect_epilogue);
4789 : : }
4790 : 10972 : else if (reduction_type == EXTRACT_LAST_REDUCTION
4791 : 10972 : || reduction_type == FOLD_LEFT_REDUCTION)
4792 : : /* No extra instructions need in the epilogue. */
4793 : : ;
4794 : : else
4795 : : {
4796 : 7045 : int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4797 : 7045 : tree bitsize = TYPE_SIZE (op.type);
4798 : 7045 : int element_bitsize = tree_to_uhwi (bitsize);
4799 : 7045 : int nelements = vec_size_in_bits / element_bitsize;
4800 : :
4801 : 7045 : if (op.code == COND_EXPR)
4802 : 28 : op.code = MAX_EXPR;
4803 : :
4804 : : /* We have a whole vector shift available. */
4805 : 968 : if (VECTOR_MODE_P (mode)
4806 : 7045 : && directly_supported_p (op.code, vectype)
4807 : 12707 : && have_whole_vector_shift (mode))
4808 : : {
4809 : : /* Final reduction via vector shifts and the reduction operator.
4810 : : Also requires scalar extract. */
4811 : 16986 : epilogue_cost += record_stmt_cost (cost_vec,
4812 : 11324 : exact_log2 (nelements) * 2,
4813 : : vector_stmt, node, 0,
4814 : : vect_epilogue);
4815 : 5662 : epilogue_cost += record_stmt_cost (cost_vec, 1,
4816 : : vec_to_scalar, node, 0,
4817 : : vect_epilogue);
4818 : : }
4819 : : else
4820 : : /* Use extracts and reduction op for final reduction. For N
4821 : : elements, we have N extracts and N-1 reduction ops. */
4822 : 1383 : epilogue_cost += record_stmt_cost (cost_vec,
4823 : 1383 : nelements + nelements - 1,
4824 : : vector_stmt, node, 0,
4825 : : vect_epilogue);
4826 : : }
4827 : : }
4828 : :
4829 : 46141 : if (dump_enabled_p ())
4830 : 2690 : dump_printf (MSG_NOTE,
4831 : : "vect_model_reduction_cost: inside_cost = %d, "
4832 : : "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4833 : : prologue_cost, epilogue_cost);
4834 : 46141 : }
4835 : :
4836 : : /* SEQ is a sequence of instructions that initialize the reduction
4837 : : described by REDUC_INFO. Emit them in the appropriate place. */
4838 : :
4839 : : static void
4840 : 442 : vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
4841 : : vect_reduc_info reduc_info, gimple *seq)
4842 : : {
4843 : 442 : if (VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info))
4844 : : {
4845 : : /* When reusing an accumulator from the main loop, we only need
4846 : : initialization instructions if the main loop can be skipped.
4847 : : In that case, emit the initialization instructions at the end
4848 : : of the guard block that does the skip. */
4849 : 25 : edge skip_edge = loop_vinfo->skip_main_loop_edge;
4850 : 25 : gcc_assert (skip_edge);
4851 : 25 : gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
4852 : 25 : gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
4853 : : }
4854 : : else
4855 : : {
4856 : : /* The normal case: emit the initialization instructions on the
4857 : : preheader edge. */
4858 : 417 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4859 : 417 : gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
4860 : : }
4861 : 442 : }
4862 : :
4863 : : /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
4864 : : which performs a reduction involving GROUP_SIZE scalar statements.
4865 : : NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
4866 : : is nonnull, introducing extra elements of that value will not change the
4867 : : result. */
4868 : :
4869 : : static void
4870 : 21561 : get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
4871 : : vect_reduc_info reduc_info,
4872 : : tree vector_type,
4873 : : vec<tree> *vec_oprnds,
4874 : : unsigned int number_of_vectors,
4875 : : unsigned int group_size, tree neutral_op)
4876 : : {
4877 : 21561 : vec<tree> &initial_values = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info);
4878 : 21561 : unsigned HOST_WIDE_INT nunits;
4879 : 21561 : unsigned j, number_of_places_left_in_vector;
4880 : 21561 : unsigned int i;
4881 : :
4882 : 43122 : gcc_assert (group_size == initial_values.length () || neutral_op);
4883 : :
4884 : : /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4885 : : created vectors. It is greater than 1 if unrolling is performed.
4886 : :
4887 : : For example, we have two scalar operands, s1 and s2 (e.g., group of
4888 : : strided accesses of size two), while NUNITS is four (i.e., four scalars
4889 : : of this type can be packed in a vector). The output vector will contain
4890 : : two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4891 : : will be 2).
4892 : :
4893 : : If GROUP_SIZE > NUNITS, the scalars will be split into several
4894 : : vectors containing the operands.
4895 : :
4896 : : For example, NUNITS is four as before, and the group size is 8
4897 : : (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4898 : : {s5, s6, s7, s8}. */
4899 : :
4900 : 21561 : if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4901 : : nunits = group_size;
4902 : :
4903 : 21561 : tree vector_elt_type = TREE_TYPE (vector_type);
4904 : 21561 : number_of_places_left_in_vector = nunits;
4905 : 21561 : bool constant_p = true;
4906 : 21561 : tree_vector_builder elts (vector_type, nunits, 1);
4907 : 21561 : elts.quick_grow (nunits);
4908 : 21561 : gimple_seq ctor_seq = NULL;
4909 : 21561 : if (neutral_op
4910 : 43031 : && !useless_type_conversion_p (vector_elt_type,
4911 : 21470 : TREE_TYPE (neutral_op)))
4912 : 1 : neutral_op = gimple_convert (&ctor_seq, vector_elt_type, neutral_op);
4913 : 206715 : for (j = 0; j < nunits * number_of_vectors; ++j)
4914 : : {
4915 : 185154 : tree op;
4916 : 185154 : i = j % group_size;
4917 : :
4918 : : /* Get the def before the loop. In reduction chain we have only
4919 : : one initial value. Else we have as many as PHIs in the group. */
4920 : 185154 : if (i >= initial_values.length () || (j > i && neutral_op))
4921 : : op = neutral_op;
4922 : : else
4923 : : {
4924 : 44780 : if (!useless_type_conversion_p (vector_elt_type,
4925 : 22390 : TREE_TYPE (initial_values[i])))
4926 : : {
4927 : 140 : if (VECTOR_BOOLEAN_TYPE_P (vector_type))
4928 : 236 : initial_values[i] = gimple_build (&ctor_seq, COND_EXPR,
4929 : : vector_elt_type,
4930 : 118 : initial_values[i],
4931 : : build_all_ones_cst
4932 : : (vector_elt_type),
4933 : : build_zero_cst
4934 : : (vector_elt_type));
4935 : : else
4936 : 44 : initial_values[i] = gimple_convert (&ctor_seq,
4937 : : vector_elt_type,
4938 : 22 : initial_values[i]);
4939 : : }
4940 : 22390 : op = initial_values[i];
4941 : : }
4942 : :
4943 : : /* Create 'vect_ = {op0,op1,...,opn}'. */
4944 : 185154 : number_of_places_left_in_vector--;
4945 : 185154 : elts[nunits - number_of_places_left_in_vector - 1] = op;
4946 : 185154 : if (!CONSTANT_CLASS_P (op))
4947 : 2347 : constant_p = false;
4948 : :
4949 : 185154 : if (number_of_places_left_in_vector == 0)
4950 : : {
4951 : 22961 : tree init;
4952 : 45922 : if (constant_p && !neutral_op
4953 : 45864 : ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4954 : 22961 : : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4955 : : /* Build the vector directly from ELTS. */
4956 : 22961 : init = gimple_build_vector (&ctor_seq, &elts);
4957 : 0 : else if (neutral_op)
4958 : : {
4959 : : /* Build a vector of the neutral value and shift the
4960 : : other elements into place. */
4961 : 0 : init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4962 : : neutral_op);
4963 : 0 : int k = nunits;
4964 : 0 : while (k > 0 && operand_equal_p (elts[k - 1], neutral_op))
4965 : : k -= 1;
4966 : 0 : while (k > 0)
4967 : : {
4968 : 0 : k -= 1;
4969 : 0 : init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4970 : 0 : vector_type, init, elts[k]);
4971 : : }
4972 : : }
4973 : : else
4974 : : {
4975 : : /* First time round, duplicate ELTS to fill the
4976 : : required number of vectors. */
4977 : 0 : duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
4978 : : elts, number_of_vectors, *vec_oprnds);
4979 : 0 : break;
4980 : : }
4981 : 22961 : vec_oprnds->quick_push (init);
4982 : :
4983 : 22961 : number_of_places_left_in_vector = nunits;
4984 : 22961 : elts.new_vector (vector_type, nunits, 1);
4985 : 22961 : elts.quick_grow (nunits);
4986 : 22961 : constant_p = true;
4987 : : }
4988 : : }
4989 : 21561 : if (ctor_seq != NULL)
4990 : 442 : vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
4991 : 21561 : }
4992 : :
4993 : : vect_reduc_info
4994 : 131230 : info_for_reduction (loop_vec_info loop_vinfo, slp_tree node)
4995 : : {
4996 : 131230 : if (node->cycle_info.id == -1)
4997 : : return NULL;
4998 : 129440 : return loop_vinfo->reduc_infos[node->cycle_info.id];
4999 : : }
5000 : :
5001 : : /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5002 : : REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
5003 : : return false. */
5004 : :
5005 : : static bool
5006 : 21202 : vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5007 : : vect_reduc_info reduc_info, tree vectype)
5008 : : {
5009 : 21202 : loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5010 : 21202 : if (!main_loop_vinfo)
5011 : : return false;
5012 : :
5013 : 4824 : if (VECT_REDUC_INFO_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5014 : : return false;
5015 : :
5016 : 4807 : unsigned int num_phis = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info).length ();
5017 : 4807 : auto_vec<tree, 16> main_loop_results (num_phis);
5018 : 4807 : auto_vec<tree, 16> initial_values (num_phis);
5019 : 4807 : if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5020 : : {
5021 : : /* The epilogue loop can be entered either from the main loop or
5022 : : from an earlier guard block. */
5023 : 4592 : edge skip_edge = loop_vinfo->skip_main_loop_edge;
5024 : 18392 : for (tree incoming_value : VECT_REDUC_INFO_INITIAL_VALUES (reduc_info))
5025 : : {
5026 : : /* Look for:
5027 : :
5028 : : INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5029 : : INITIAL_VALUE(guard block)>. */
5030 : 4616 : gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5031 : :
5032 : 4616 : gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5033 : 4616 : gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5034 : :
5035 : 4616 : tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5036 : 4616 : tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5037 : :
5038 : 4616 : main_loop_results.quick_push (from_main_loop);
5039 : 4616 : initial_values.quick_push (from_skip);
5040 : : }
5041 : : }
5042 : : else
5043 : : /* The main loop dominates the epilogue loop. */
5044 : 215 : main_loop_results.splice (VECT_REDUC_INFO_INITIAL_VALUES (reduc_info));
5045 : :
5046 : : /* See if the main loop has the kind of accumulator we need. */
5047 : 4807 : vect_reusable_accumulator *accumulator
5048 : 4807 : = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5049 : 4807 : if (!accumulator
5050 : 9598 : || num_phis != VECT_REDUC_INFO_SCALAR_RESULTS (accumulator->reduc_info).length ()
5051 : 14401 : || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5052 : : VECT_REDUC_INFO_SCALAR_RESULTS (accumulator->reduc_info).begin ()))
5053 : : return false;
5054 : :
5055 : : /* Handle the case where we can reduce wider vectors to narrower ones. */
5056 : 4797 : tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5057 : 4797 : unsigned HOST_WIDE_INT m;
5058 : 4797 : if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5059 : 4797 : TYPE_VECTOR_SUBPARTS (vectype), &m))
5060 : 0 : return false;
5061 : : /* Check the intermediate vector types and operations are available. */
5062 : 4797 : tree prev_vectype = old_vectype;
5063 : 4797 : poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5064 : 13868 : while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5065 : : {
5066 : 4796 : intermediate_nunits = exact_div (intermediate_nunits, 2);
5067 : 4796 : tree intermediate_vectype = get_related_vectype_for_scalar_type
5068 : 4796 : (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5069 : 4796 : if (!intermediate_vectype
5070 : 4796 : || !directly_supported_p (VECT_REDUC_INFO_CODE (reduc_info),
5071 : : intermediate_vectype)
5072 : 9072 : || !can_vec_extract (TYPE_MODE (prev_vectype),
5073 : 4276 : TYPE_MODE (intermediate_vectype)))
5074 : : return false;
5075 : : prev_vectype = intermediate_vectype;
5076 : : }
5077 : :
5078 : : /* Non-SLP reductions might apply an adjustment after the reduction
5079 : : operation, in order to simplify the initialization of the accumulator.
5080 : : If the epilogue loop carries on from where the main loop left off,
5081 : : it should apply the same adjustment to the final reduction result.
5082 : :
5083 : : If the epilogue loop can also be entered directly (rather than via
5084 : : the main loop), we need to be able to handle that case in the same way,
5085 : : with the same adjustment. (In principle we could add a PHI node
5086 : : to select the correct adjustment, but in practice that shouldn't be
5087 : : necessary.) */
5088 : 4275 : tree main_adjustment
5089 : 4275 : = VECT_REDUC_INFO_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5090 : 4275 : if (loop_vinfo->main_loop_edge && main_adjustment)
5091 : : {
5092 : 3636 : gcc_assert (num_phis == 1);
5093 : 3636 : tree initial_value = initial_values[0];
5094 : : /* Check that we can use INITIAL_VALUE as the adjustment and
5095 : : initialize the accumulator with a neutral value instead. */
5096 : 3636 : if (!operand_equal_p (initial_value, main_adjustment))
5097 : 106 : return false;
5098 : 3530 : code_helper code = VECT_REDUC_INFO_CODE (reduc_info);
5099 : 3530 : initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5100 : : code, initial_value);
5101 : : }
5102 : 4169 : VECT_REDUC_INFO_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5103 : 4169 : VECT_REDUC_INFO_INITIAL_VALUES (reduc_info).truncate (0);
5104 : 4169 : VECT_REDUC_INFO_INITIAL_VALUES (reduc_info).splice (initial_values);
5105 : 4169 : VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info) = accumulator;
5106 : 4169 : return true;
5107 : 4807 : }
5108 : :
5109 : : /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5110 : : CODE emitting stmts before GSI. Returns a vector def of VECTYPE. */
5111 : :
5112 : : static tree
5113 : 4210 : vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5114 : : gimple_seq *seq)
5115 : : {
5116 : 4210 : unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5117 : 4210 : unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5118 : 4210 : tree stype = TREE_TYPE (vectype);
5119 : 4210 : tree new_temp = vec_def;
5120 : 8419 : while (nunits > nunits1)
5121 : : {
5122 : 4209 : nunits /= 2;
5123 : 4209 : tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5124 : 4209 : stype, nunits);
5125 : 4209 : unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5126 : :
5127 : : /* The target has to make sure we support lowpart/highpart
5128 : : extraction, either via direct vector extract or through
5129 : : an integer mode punning. */
5130 : 4209 : tree dst1, dst2;
5131 : 4209 : gimple *epilog_stmt;
5132 : 4209 : if (convert_optab_handler (vec_extract_optab,
5133 : 4209 : TYPE_MODE (TREE_TYPE (new_temp)),
5134 : 4209 : TYPE_MODE (vectype1))
5135 : : != CODE_FOR_nothing)
5136 : : {
5137 : : /* Extract sub-vectors directly once vec_extract becomes
5138 : : a conversion optab. */
5139 : 2688 : dst1 = make_ssa_name (vectype1);
5140 : 2688 : epilog_stmt
5141 : 5376 : = gimple_build_assign (dst1, BIT_FIELD_REF,
5142 : : build3 (BIT_FIELD_REF, vectype1,
5143 : 2688 : new_temp, TYPE_SIZE (vectype1),
5144 : : bitsize_int (0)));
5145 : 2688 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5146 : 2688 : dst2 = make_ssa_name (vectype1);
5147 : 2688 : epilog_stmt
5148 : 2688 : = gimple_build_assign (dst2, BIT_FIELD_REF,
5149 : : build3 (BIT_FIELD_REF, vectype1,
5150 : 2688 : new_temp, TYPE_SIZE (vectype1),
5151 : 2688 : bitsize_int (bitsize)));
5152 : 2688 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5153 : : }
5154 : : else
5155 : : {
5156 : : /* Extract via punning to appropriately sized integer mode
5157 : : vector. */
5158 : 1521 : tree eltype = build_nonstandard_integer_type (bitsize, 1);
5159 : 1521 : tree etype = build_vector_type (eltype, 2);
5160 : 3042 : gcc_assert (convert_optab_handler (vec_extract_optab,
5161 : : TYPE_MODE (etype),
5162 : : TYPE_MODE (eltype))
5163 : : != CODE_FOR_nothing);
5164 : 1521 : tree tem = make_ssa_name (etype);
5165 : 1521 : epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5166 : : build1 (VIEW_CONVERT_EXPR,
5167 : : etype, new_temp));
5168 : 1521 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5169 : 1521 : new_temp = tem;
5170 : 1521 : tem = make_ssa_name (eltype);
5171 : 1521 : epilog_stmt
5172 : 3042 : = gimple_build_assign (tem, BIT_FIELD_REF,
5173 : : build3 (BIT_FIELD_REF, eltype,
5174 : 1521 : new_temp, TYPE_SIZE (eltype),
5175 : : bitsize_int (0)));
5176 : 1521 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5177 : 1521 : dst1 = make_ssa_name (vectype1);
5178 : 1521 : epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5179 : : build1 (VIEW_CONVERT_EXPR,
5180 : : vectype1, tem));
5181 : 1521 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5182 : 1521 : tem = make_ssa_name (eltype);
5183 : 1521 : epilog_stmt
5184 : 1521 : = gimple_build_assign (tem, BIT_FIELD_REF,
5185 : : build3 (BIT_FIELD_REF, eltype,
5186 : 1521 : new_temp, TYPE_SIZE (eltype),
5187 : 1521 : bitsize_int (bitsize)));
5188 : 1521 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5189 : 1521 : dst2 = make_ssa_name (vectype1);
5190 : 1521 : epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5191 : : build1 (VIEW_CONVERT_EXPR,
5192 : : vectype1, tem));
5193 : 1521 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5194 : : }
5195 : :
5196 : 4209 : new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5197 : : }
5198 : 4210 : if (!useless_type_conversion_p (vectype, TREE_TYPE (new_temp)))
5199 : : {
5200 : 66 : tree dst3 = make_ssa_name (vectype);
5201 : 66 : gimple *epilog_stmt = gimple_build_assign (dst3, VIEW_CONVERT_EXPR,
5202 : : build1 (VIEW_CONVERT_EXPR,
5203 : : vectype, new_temp));
5204 : 66 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5205 : 66 : new_temp = dst3;
5206 : : }
5207 : :
5208 : 4210 : return new_temp;
5209 : : }
5210 : :
5211 : : /* Function vect_create_epilog_for_reduction
5212 : :
5213 : : Create code at the loop-epilog to finalize the result of a reduction
5214 : : computation.
5215 : :
5216 : : STMT_INFO is the scalar reduction stmt that is being vectorized.
5217 : : SLP_NODE is an SLP node containing a group of reduction statements. The
5218 : : first one in this group is STMT_INFO.
5219 : : SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5220 : : REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5221 : : (counting from 0)
5222 : : LOOP_EXIT is the edge to update in the merge block. In the case of a single
5223 : : exit this edge is always the main loop exit.
5224 : :
5225 : : This function:
5226 : : 1. Completes the reduction def-use cycles.
5227 : : 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5228 : : by calling the function specified by REDUC_FN if available, or by
5229 : : other means (whole-vector shifts or a scalar loop).
5230 : : The function also creates a new phi node at the loop exit to preserve
5231 : : loop-closed form, as illustrated below.
5232 : :
5233 : : The flow at the entry to this function:
5234 : :
5235 : : loop:
5236 : : vec_def = phi <vec_init, null> # REDUCTION_PHI
5237 : : VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5238 : : s_loop = scalar_stmt # (scalar) STMT_INFO
5239 : : loop_exit:
5240 : : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5241 : : use <s_out0>
5242 : : use <s_out0>
5243 : :
5244 : : The above is transformed by this function into:
5245 : :
5246 : : loop:
5247 : : vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5248 : : VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5249 : : s_loop = scalar_stmt # (scalar) STMT_INFO
5250 : : loop_exit:
5251 : : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5252 : : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5253 : : v_out2 = reduce <v_out1>
5254 : : s_out3 = extract_field <v_out2, 0>
5255 : : s_out4 = adjust_result <s_out3>
5256 : : use <s_out4>
5257 : : use <s_out4>
5258 : : */
5259 : :
5260 : : static void
5261 : 21904 : vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5262 : : stmt_vec_info stmt_info,
5263 : : slp_tree slp_node,
5264 : : slp_instance slp_node_instance,
5265 : : edge loop_exit)
5266 : : {
5267 : 21904 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
5268 : 21904 : code_helper code = VECT_REDUC_INFO_CODE (reduc_info);
5269 : 21904 : internal_fn reduc_fn = VECT_REDUC_INFO_FN (reduc_info);
5270 : 21904 : tree vectype;
5271 : 21904 : machine_mode mode;
5272 : 21904 : basic_block exit_bb;
5273 : 21904 : gimple *new_phi = NULL, *phi = NULL;
5274 : 21904 : gimple_stmt_iterator exit_gsi;
5275 : 21904 : tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5276 : 21904 : gimple *epilog_stmt = NULL;
5277 : 21904 : gimple *exit_phi;
5278 : 21904 : tree def;
5279 : 21904 : tree orig_name, scalar_result;
5280 : 21904 : imm_use_iterator imm_iter;
5281 : 21904 : use_operand_p use_p;
5282 : 21904 : gimple *use_stmt;
5283 : 21904 : auto_vec<tree> reduc_inputs;
5284 : 21904 : int j, i;
5285 : 21904 : vec<tree> &scalar_results = VECT_REDUC_INFO_SCALAR_RESULTS (reduc_info);
5286 : 21904 : unsigned int k;
5287 : : /* SLP reduction without reduction chain, e.g.,
5288 : : # a1 = phi <a2, a0>
5289 : : # b1 = phi <b2, b0>
5290 : : a2 = operation (a1)
5291 : : b2 = operation (b1) */
5292 : 21904 : const bool slp_reduc = !reduc_info->is_reduc_chain;
5293 : 21904 : tree induction_index = NULL_TREE;
5294 : :
5295 : 21904 : unsigned int group_size = SLP_TREE_LANES (slp_node);
5296 : :
5297 : 21904 : bool double_reduc = false;
5298 : 21904 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5299 : 21904 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5300 : : {
5301 : 0 : double_reduc = true;
5302 : 0 : gcc_assert (slp_reduc);
5303 : : }
5304 : :
5305 : 21904 : vectype = VECT_REDUC_INFO_VECTYPE (reduc_info);
5306 : 21904 : gcc_assert (vectype);
5307 : 21904 : mode = TYPE_MODE (vectype);
5308 : :
5309 : 21904 : tree induc_val = NULL_TREE;
5310 : 21904 : tree adjustment_def = NULL;
5311 : : /* Optimize: for induction condition reduction, if we can't use zero
5312 : : for induc_val, use initial_def. */
5313 : 21904 : if (VECT_REDUC_INFO_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5314 : 66 : induc_val = VECT_REDUC_INFO_INDUC_COND_INITIAL_VAL (reduc_info);
5315 : 21838 : else if (double_reduc)
5316 : : ;
5317 : : else
5318 : 21838 : adjustment_def = VECT_REDUC_INFO_EPILOGUE_ADJUSTMENT (reduc_info);
5319 : :
5320 : 21904 : stmt_vec_info single_live_out_stmt[] = { stmt_info };
5321 : 21904 : array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5322 : 21904 : if (slp_reduc)
5323 : : /* All statements produce live-out values. */
5324 : 43418 : live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5325 : :
5326 : 21904 : unsigned vec_num
5327 : 21904 : = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length ();
5328 : :
5329 : : /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5330 : : which is updated with the current index of the loop for every match of
5331 : : the original loop's cond_expr (VEC_STMT). This results in a vector
5332 : : containing the last time the condition passed for that vector lane.
5333 : : The first match will be a 1 to allow 0 to be used for non-matching
5334 : : indexes. If there are no matches at all then the vector will be all
5335 : : zeroes.
5336 : :
5337 : : PR92772: This algorithm is broken for architectures that support
5338 : : masked vectors, but do not provide fold_extract_last. */
5339 : 21904 : if (VECT_REDUC_INFO_TYPE (reduc_info) == COND_REDUCTION)
5340 : : {
5341 : 71 : gcc_assert (!double_reduc);
5342 : 71 : auto_vec<std::pair<tree, bool>, 2> ccompares;
5343 : 71 : slp_tree cond_node = slp_node_instance->root;
5344 : 159 : while (cond_node != slp_node_instance->reduc_phis)
5345 : : {
5346 : 88 : stmt_vec_info cond_info = SLP_TREE_REPRESENTATIVE (cond_node);
5347 : 88 : if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5348 : : {
5349 : 80 : gimple *vec_stmt
5350 : 80 : = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (cond_node)[0]);
5351 : 80 : gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5352 : 80 : ccompares.safe_push
5353 : 80 : (std::make_pair (gimple_assign_rhs1 (vec_stmt),
5354 : 80 : SLP_TREE_REDUC_IDX (cond_node) == 2));
5355 : : }
5356 : 88 : int slp_reduc_idx = SLP_TREE_REDUC_IDX (cond_node);
5357 : 88 : cond_node = SLP_TREE_CHILDREN (cond_node)[slp_reduc_idx];
5358 : : }
5359 : 71 : gcc_assert (ccompares.length () != 0);
5360 : :
5361 : 71 : tree indx_before_incr, indx_after_incr;
5362 : 71 : poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5363 : 71 : int scalar_precision
5364 : 71 : = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5365 : 71 : tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5366 : 71 : tree cr_index_vector_type = get_related_vectype_for_scalar_type
5367 : 71 : (TYPE_MODE (vectype), cr_index_scalar_type,
5368 : : TYPE_VECTOR_SUBPARTS (vectype));
5369 : :
5370 : : /* First we create a simple vector induction variable which starts
5371 : : with the values {1,2,3,...} (SERIES_VECT) and increments by the
5372 : : vector size (STEP). */
5373 : :
5374 : : /* Create a {1,2,3,...} vector. */
5375 : 71 : tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5376 : :
5377 : : /* Create a vector of the step value. */
5378 : 71 : tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5379 : 71 : tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5380 : :
5381 : : /* Create an induction variable. */
5382 : 71 : gimple_stmt_iterator incr_gsi;
5383 : 71 : bool insert_after;
5384 : 71 : vect_iv_increment_position (LOOP_VINFO_IV_EXIT (loop_vinfo),
5385 : : &incr_gsi, &insert_after);
5386 : 71 : create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
5387 : : insert_after, &indx_before_incr, &indx_after_incr);
5388 : :
5389 : : /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5390 : : filled with zeros (VEC_ZERO). */
5391 : :
5392 : : /* Create a vector of 0s. */
5393 : 71 : tree zero = build_zero_cst (cr_index_scalar_type);
5394 : 71 : tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5395 : :
5396 : : /* Create a vector phi node. */
5397 : 71 : tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5398 : 71 : new_phi = create_phi_node (new_phi_tree, loop->header);
5399 : 71 : add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5400 : : loop_preheader_edge (loop), UNKNOWN_LOCATION);
5401 : :
5402 : : /* Now take the condition from the loops original cond_exprs
5403 : : and produce a new cond_exprs (INDEX_COND_EXPR) which for
5404 : : every match uses values from the induction variable
5405 : : (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5406 : : (NEW_PHI_TREE).
5407 : : Finally, we update the phi (NEW_PHI_TREE) to take the value of
5408 : : the new cond_expr (INDEX_COND_EXPR). */
5409 : 71 : gimple_seq stmts = NULL;
5410 : 222 : for (int i = ccompares.length () - 1; i != -1; --i)
5411 : : {
5412 : 80 : tree ccompare = ccompares[i].first;
5413 : 80 : if (ccompares[i].second)
5414 : 69 : new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5415 : : cr_index_vector_type,
5416 : : ccompare,
5417 : : indx_before_incr, new_phi_tree);
5418 : : else
5419 : 11 : new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5420 : : cr_index_vector_type,
5421 : : ccompare,
5422 : : new_phi_tree, indx_before_incr);
5423 : : }
5424 : 71 : gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5425 : :
5426 : : /* Update the phi with the vec cond. */
5427 : 71 : induction_index = new_phi_tree;
5428 : 71 : add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5429 : : loop_latch_edge (loop), UNKNOWN_LOCATION);
5430 : 71 : }
5431 : :
5432 : : /* 2. Create epilog code.
5433 : : The reduction epilog code operates across the elements of the vector
5434 : : of partial results computed by the vectorized loop.
5435 : : The reduction epilog code consists of:
5436 : :
5437 : : step 1: compute the scalar result in a vector (v_out2)
5438 : : step 2: extract the scalar result (s_out3) from the vector (v_out2)
5439 : : step 3: adjust the scalar result (s_out3) if needed.
5440 : :
5441 : : Step 1 can be accomplished using one the following three schemes:
5442 : : (scheme 1) using reduc_fn, if available.
5443 : : (scheme 2) using whole-vector shifts, if available.
5444 : : (scheme 3) using a scalar loop. In this case steps 1+2 above are
5445 : : combined.
5446 : :
5447 : : The overall epilog code looks like this:
5448 : :
5449 : : s_out0 = phi <s_loop> # original EXIT_PHI
5450 : : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5451 : : v_out2 = reduce <v_out1> # step 1
5452 : : s_out3 = extract_field <v_out2, 0> # step 2
5453 : : s_out4 = adjust_result <s_out3> # step 3
5454 : :
5455 : : (step 3 is optional, and steps 1 and 2 may be combined).
5456 : : Lastly, the uses of s_out0 are replaced by s_out4. */
5457 : :
5458 : :
5459 : : /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5460 : : v_out1 = phi <VECT_DEF>
5461 : : Store them in NEW_PHIS. */
5462 : : /* We need to reduce values in all exits. */
5463 : 21904 : exit_bb = loop_exit->dest;
5464 : 21904 : exit_gsi = gsi_after_labels (exit_bb);
5465 : 21904 : reduc_inputs.create (vec_num);
5466 : 45218 : for (unsigned i = 0; i < vec_num; i++)
5467 : : {
5468 : 23314 : gimple_seq stmts = NULL;
5469 : 23314 : def = vect_get_slp_vect_def (slp_node, i);
5470 : 23314 : tree new_def = copy_ssa_name (def);
5471 : 23314 : phi = create_phi_node (new_def, exit_bb);
5472 : 23314 : if (LOOP_VINFO_IV_EXIT (loop_vinfo) == loop_exit)
5473 : 23287 : SET_PHI_ARG_DEF (phi, loop_exit->dest_idx, def);
5474 : : else
5475 : : {
5476 : 57 : for (unsigned k = 0; k < gimple_phi_num_args (phi); k++)
5477 : 30 : SET_PHI_ARG_DEF (phi, k, def);
5478 : : }
5479 : 23314 : new_def = gimple_convert (&stmts, vectype, new_def);
5480 : 23314 : reduc_inputs.quick_push (new_def);
5481 : 23314 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5482 : : }
5483 : :
5484 : : /* 2.2 Get the original scalar reduction variable as defined in the loop.
5485 : : In case STMT is a "pattern-stmt" (i.e. - it represents a reduction
5486 : : pattern), the scalar-def is taken from the original stmt that the
5487 : : pattern-stmt (STMT) replaces. */
5488 : :
5489 : 22724 : tree scalar_dest = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
5490 : 21904 : tree scalar_type = TREE_TYPE (scalar_dest);
5491 : 21904 : scalar_results.truncate (0);
5492 : 21904 : scalar_results.reserve_exact (group_size);
5493 : 21904 : new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5494 : :
5495 : : /* True if we should implement SLP_REDUC using native reduction operations
5496 : : instead of scalar operations. */
5497 : 21904 : const bool direct_slp_reduc
5498 : 21904 : = (reduc_fn != IFN_LAST
5499 : 21904 : && slp_reduc
5500 : 21904 : && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5501 : :
5502 : : /* If signed overflow is undefined we might need to perform reduction
5503 : : computations in an unsigned type. */
5504 : 21904 : tree compute_vectype = vectype;
5505 : 21904 : if (ANY_INTEGRAL_TYPE_P (vectype)
5506 : 14901 : && TYPE_OVERFLOW_UNDEFINED (vectype)
5507 : 5422 : && code.is_tree_code ()
5508 : 27326 : && arith_code_with_undefined_signed_overflow ((tree_code) code))
5509 : 3973 : compute_vectype = unsigned_type_for (vectype);
5510 : :
5511 : : /* In case of reduction chain, e.g.,
5512 : : # a1 = phi <a3, a0>
5513 : : a2 = operation (a1)
5514 : : a3 = operation (a2),
5515 : :
5516 : : we may end up with more than one vector result. Here we reduce them
5517 : : to one vector.
5518 : :
5519 : : The same is true for a SLP reduction, e.g.,
5520 : : # a1 = phi <a2, a0>
5521 : : # b1 = phi <b2, b0>
5522 : : a2 = operation (a1)
5523 : : b2 = operation (a2),
5524 : :
5525 : : where we can end up with more than one vector as well. We can
5526 : : easily accumulate vectors when the number of vector elements is
5527 : : a multiple of the SLP group size.
5528 : :
5529 : : The same is true if we couldn't use a single defuse cycle. */
5530 : 21904 : if ((!slp_reduc
5531 : : || direct_slp_reduc
5532 : : || (slp_reduc
5533 : 21904 : && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size)))
5534 : 43808 : && reduc_inputs.length () > 1)
5535 : : {
5536 : 539 : gimple_seq stmts = NULL;
5537 : 539 : tree single_input = reduc_inputs[0];
5538 : 539 : if (compute_vectype != vectype)
5539 : 154 : single_input = gimple_build (&stmts, VIEW_CONVERT_EXPR,
5540 : : compute_vectype, single_input);
5541 : 1843 : for (k = 1; k < reduc_inputs.length (); k++)
5542 : : {
5543 : 1304 : tree input = gimple_build (&stmts, VIEW_CONVERT_EXPR,
5544 : 1304 : compute_vectype, reduc_inputs[k]);
5545 : 1304 : single_input = gimple_build (&stmts, code, compute_vectype,
5546 : : single_input, input);
5547 : : }
5548 : 539 : if (compute_vectype != vectype)
5549 : 154 : single_input = gimple_build (&stmts, VIEW_CONVERT_EXPR,
5550 : : vectype, single_input);
5551 : 539 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5552 : :
5553 : 539 : reduc_inputs.truncate (0);
5554 : 539 : reduc_inputs.safe_push (single_input);
5555 : : }
5556 : :
5557 : 21904 : tree orig_reduc_input = reduc_inputs[0];
5558 : :
5559 : : /* If this loop is an epilogue loop that can be skipped after the
5560 : : main loop, we can only share a reduction operation between the
5561 : : main loop and the epilogue if we put it at the target of the
5562 : : skip edge.
5563 : :
5564 : : We can still reuse accumulators if this check fails. Doing so has
5565 : : the minor(?) benefit of making the epilogue loop's scalar result
5566 : : independent of the main loop's scalar result. */
5567 : 21904 : bool unify_with_main_loop_p = false;
5568 : 21904 : if (VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info)
5569 : 4169 : && loop_vinfo->skip_this_loop_edge
5570 : 3937 : && single_succ_p (exit_bb)
5571 : 21925 : && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
5572 : : {
5573 : 21 : unify_with_main_loop_p = true;
5574 : :
5575 : 21 : basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
5576 : 21 : reduc_inputs[0] = make_ssa_name (vectype);
5577 : 21 : gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
5578 : 21 : add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
5579 : : UNKNOWN_LOCATION);
5580 : 21 : add_phi_arg (new_phi,
5581 : 21 : VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info)->reduc_input,
5582 : : loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
5583 : 21 : exit_gsi = gsi_after_labels (reduc_block);
5584 : : }
5585 : :
5586 : : /* Shouldn't be used beyond this point. */
5587 : 21904 : exit_bb = nullptr;
5588 : :
5589 : : /* If we are operating on a mask vector and do not support direct mask
5590 : : reduction, work on a bool data vector instead of a mask vector. */
5591 : 21904 : if (VECTOR_BOOLEAN_TYPE_P (vectype)
5592 : 221 : && VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info)
5593 : 22096 : && vectype != VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info))
5594 : : {
5595 : 192 : compute_vectype = vectype = VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info);
5596 : 192 : gimple_seq stmts = NULL;
5597 : 392 : for (unsigned i = 0; i < reduc_inputs.length (); ++i)
5598 : 400 : reduc_inputs[i] = gimple_build (&stmts, VEC_COND_EXPR, vectype,
5599 : 200 : reduc_inputs[i],
5600 : : build_one_cst (vectype),
5601 : : build_zero_cst (vectype));
5602 : 192 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5603 : : }
5604 : :
5605 : 21904 : if (VECT_REDUC_INFO_TYPE (reduc_info) == COND_REDUCTION
5606 : 71 : && reduc_fn != IFN_LAST)
5607 : : {
5608 : : /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
5609 : : various data values where the condition matched and another vector
5610 : : (INDUCTION_INDEX) containing all the indexes of those matches. We
5611 : : need to extract the last matching index (which will be the index with
5612 : : highest value) and use this to index into the data vector.
5613 : : For the case where there were no matches, the data vector will contain
5614 : : all default values and the index vector will be all zeros. */
5615 : :
5616 : : /* Get various versions of the type of the vector of indexes. */
5617 : 4 : tree index_vec_type = TREE_TYPE (induction_index);
5618 : 4 : gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5619 : 4 : tree index_scalar_type = TREE_TYPE (index_vec_type);
5620 : 4 : tree index_vec_cmp_type = truth_type_for (index_vec_type);
5621 : :
5622 : : /* Get an unsigned integer version of the type of the data vector. */
5623 : 4 : int scalar_precision
5624 : 4 : = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5625 : 4 : tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5626 : 4 : tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
5627 : : vectype);
5628 : :
5629 : : /* First we need to create a vector (ZERO_VEC) of zeros and another
5630 : : vector (MAX_INDEX_VEC) filled with the last matching index, which we
5631 : : can create using a MAX reduction and then expanding.
5632 : : In the case where the loop never made any matches, the max index will
5633 : : be zero. */
5634 : :
5635 : : /* Vector of {0, 0, 0,...}. */
5636 : 4 : tree zero_vec = build_zero_cst (vectype);
5637 : :
5638 : : /* Find maximum value from the vector of found indexes. */
5639 : 4 : tree max_index = make_ssa_name (index_scalar_type);
5640 : 4 : gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5641 : : 1, induction_index);
5642 : 4 : gimple_call_set_lhs (max_index_stmt, max_index);
5643 : 4 : gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5644 : :
5645 : : /* Vector of {max_index, max_index, max_index,...}. */
5646 : 4 : tree max_index_vec = make_ssa_name (index_vec_type);
5647 : 4 : tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5648 : : max_index);
5649 : 4 : gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5650 : : max_index_vec_rhs);
5651 : 4 : gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5652 : :
5653 : : /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5654 : : with the vector (INDUCTION_INDEX) of found indexes, choosing values
5655 : : from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
5656 : : otherwise. Only one value should match, resulting in a vector
5657 : : (VEC_COND) with one data value and the rest zeros.
5658 : : In the case where the loop never made any matches, every index will
5659 : : match, resulting in a vector with all data values (which will all be
5660 : : the default value). */
5661 : :
5662 : : /* Compare the max index vector to the vector of found indexes to find
5663 : : the position of the max value. */
5664 : 4 : tree vec_compare = make_ssa_name (index_vec_cmp_type);
5665 : 4 : gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5666 : : induction_index,
5667 : : max_index_vec);
5668 : 4 : gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5669 : :
5670 : : /* Use the compare to choose either values from the data vector or
5671 : : zero. */
5672 : 4 : tree vec_cond = make_ssa_name (vectype);
5673 : 4 : gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5674 : : vec_compare,
5675 : 4 : reduc_inputs[0],
5676 : : zero_vec);
5677 : 4 : gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5678 : :
5679 : : /* Finally we need to extract the data value from the vector (VEC_COND)
5680 : : into a scalar (MATCHED_DATA_REDUC). Logically we want to do a OR
5681 : : reduction, but because this doesn't exist, we can use a MAX reduction
5682 : : instead. The data value might be signed or a float so we need to cast
5683 : : it first.
5684 : : In the case where the loop never made any matches, the data values are
5685 : : all identical, and so will reduce down correctly. */
5686 : :
5687 : : /* Make the matched data values unsigned. */
5688 : 4 : tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5689 : 4 : tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5690 : : vec_cond);
5691 : 4 : gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5692 : : VIEW_CONVERT_EXPR,
5693 : : vec_cond_cast_rhs);
5694 : 4 : gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5695 : :
5696 : : /* Reduce down to a scalar value. */
5697 : 4 : tree data_reduc = make_ssa_name (scalar_type_unsigned);
5698 : 4 : gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5699 : : 1, vec_cond_cast);
5700 : 4 : gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5701 : 4 : gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5702 : :
5703 : : /* Convert the reduced value back to the result type and set as the
5704 : : result. */
5705 : 4 : gimple_seq stmts = NULL;
5706 : 4 : new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5707 : : data_reduc);
5708 : 4 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5709 : 4 : scalar_results.safe_push (new_temp);
5710 : 4 : }
5711 : 21900 : else if (VECT_REDUC_INFO_TYPE (reduc_info) == COND_REDUCTION
5712 : 67 : && reduc_fn == IFN_LAST)
5713 : : {
5714 : : /* Condition reduction without supported IFN_REDUC_MAX. Generate
5715 : : idx = 0;
5716 : : idx_val = induction_index[0];
5717 : : val = data_reduc[0];
5718 : : for (idx = 0, val = init, i = 0; i < nelts; ++i)
5719 : : if (induction_index[i] > idx_val)
5720 : : val = data_reduc[i], idx_val = induction_index[i];
5721 : : return val; */
5722 : :
5723 : 67 : tree data_eltype = TREE_TYPE (vectype);
5724 : 67 : tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5725 : 67 : unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5726 : 67 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5727 : : /* Enforced by vectorizable_reduction, which ensures we have target
5728 : : support before allowing a conditional reduction on variable-length
5729 : : vectors. */
5730 : 67 : unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5731 : 67 : tree idx_val = NULL_TREE, val = NULL_TREE;
5732 : 447 : for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5733 : : {
5734 : 380 : tree old_idx_val = idx_val;
5735 : 380 : tree old_val = val;
5736 : 380 : idx_val = make_ssa_name (idx_eltype);
5737 : 380 : epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5738 : : build3 (BIT_FIELD_REF, idx_eltype,
5739 : : induction_index,
5740 : 380 : bitsize_int (el_size),
5741 : 380 : bitsize_int (off)));
5742 : 380 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5743 : 380 : val = make_ssa_name (data_eltype);
5744 : 760 : epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5745 : : build3 (BIT_FIELD_REF,
5746 : : data_eltype,
5747 : 380 : reduc_inputs[0],
5748 : 380 : bitsize_int (el_size),
5749 : 380 : bitsize_int (off)));
5750 : 380 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5751 : 380 : if (off != 0)
5752 : : {
5753 : 313 : tree new_idx_val = idx_val;
5754 : 313 : if (off != v_size - el_size)
5755 : : {
5756 : 246 : new_idx_val = make_ssa_name (idx_eltype);
5757 : 246 : epilog_stmt = gimple_build_assign (new_idx_val,
5758 : : MAX_EXPR, idx_val,
5759 : : old_idx_val);
5760 : 246 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5761 : : }
5762 : 313 : tree cond = make_ssa_name (boolean_type_node);
5763 : 313 : epilog_stmt = gimple_build_assign (cond, GT_EXPR,
5764 : : idx_val, old_idx_val);
5765 : 313 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5766 : 313 : tree new_val = make_ssa_name (data_eltype);
5767 : 313 : epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
5768 : : cond, val, old_val);
5769 : 313 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5770 : 313 : idx_val = new_idx_val;
5771 : 313 : val = new_val;
5772 : : }
5773 : : }
5774 : : /* Convert the reduced value back to the result type and set as the
5775 : : result. */
5776 : 67 : gimple_seq stmts = NULL;
5777 : 67 : val = gimple_convert (&stmts, scalar_type, val);
5778 : 67 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5779 : 67 : scalar_results.safe_push (val);
5780 : 67 : }
5781 : :
5782 : : /* 2.3 Create the reduction code, using one of the three schemes described
5783 : : above. In SLP we simply need to extract all the elements from the
5784 : : vector (without reducing them), so we use scalar shifts. */
5785 : 21833 : else if (reduc_fn != IFN_LAST && (!slp_reduc || group_size == 1))
5786 : : {
5787 : 19993 : tree tmp;
5788 : 19993 : tree vec_elem_type;
5789 : :
5790 : : /* Case 1: Create:
5791 : : v_out2 = reduc_expr <v_out1> */
5792 : :
5793 : 19993 : if (dump_enabled_p ())
5794 : 1412 : dump_printf_loc (MSG_NOTE, vect_location,
5795 : : "Reduce using direct vector reduction.\n");
5796 : :
5797 : 19993 : gimple_seq stmts = NULL;
5798 : 19993 : vec_elem_type = TREE_TYPE (vectype);
5799 : 19993 : new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5800 : 19993 : vec_elem_type, reduc_inputs[0]);
5801 : 19993 : new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5802 : 19993 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5803 : :
5804 : 19993 : if ((VECT_REDUC_INFO_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5805 : 66 : && induc_val)
5806 : : {
5807 : : /* Earlier we set the initial value to be a vector if induc_val
5808 : : values. Check the result and if it is induc_val then replace
5809 : : with the original initial value, unless induc_val is
5810 : : the same as initial_def already. */
5811 : 63 : tree zcompare = make_ssa_name (boolean_type_node);
5812 : 63 : epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
5813 : : new_temp, induc_val);
5814 : 63 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5815 : 63 : tree initial_def = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info)[0];
5816 : 63 : tmp = make_ssa_name (new_scalar_dest);
5817 : 63 : epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5818 : : initial_def, new_temp);
5819 : 63 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5820 : 63 : new_temp = tmp;
5821 : : }
5822 : :
5823 : 19993 : scalar_results.safe_push (new_temp);
5824 : 19993 : }
5825 : 1654 : else if (direct_slp_reduc)
5826 : : {
5827 : : /* Here we create one vector for each of the GROUP_SIZE results,
5828 : : with the elements for other SLP statements replaced with the
5829 : : neutral value. We can then do a normal reduction on each vector. */
5830 : :
5831 : : /* Enforced by vectorizable_reduction. */
5832 : : gcc_assert (reduc_inputs.length () == 1);
5833 : : gcc_assert (pow2p_hwi (group_size));
5834 : :
5835 : : gimple_seq seq = NULL;
5836 : :
5837 : : /* Build a vector {0, 1, 2, ...}, with the same number of elements
5838 : : and the same element size as VECTYPE. */
5839 : : tree index = build_index_vector (vectype, 0, 1);
5840 : : tree index_type = TREE_TYPE (index);
5841 : : tree index_elt_type = TREE_TYPE (index_type);
5842 : : tree mask_type = truth_type_for (index_type);
5843 : :
5844 : : /* Create a vector that, for each element, identifies which of
5845 : : the results should use it. */
5846 : : tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5847 : : index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5848 : : build_vector_from_val (index_type, index_mask));
5849 : :
5850 : : /* Get a neutral vector value. This is simply a splat of the neutral
5851 : : scalar value if we have one, otherwise the initial scalar value
5852 : : is itself a neutral value. */
5853 : : tree vector_identity = NULL_TREE;
5854 : : tree neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
5855 : : NULL_TREE, false);
5856 : : if (neutral_op)
5857 : : vector_identity = gimple_build_vector_from_val (&seq, vectype,
5858 : : neutral_op);
5859 : : for (unsigned int i = 0; i < group_size; ++i)
5860 : : {
5861 : : /* If there's no univeral neutral value, we can use the
5862 : : initial scalar value from the original PHI. This is used
5863 : : for MIN and MAX reduction, for example. */
5864 : : if (!neutral_op)
5865 : : {
5866 : : tree scalar_value
5867 : : = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info)[i];
5868 : : scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5869 : : scalar_value);
5870 : : vector_identity = gimple_build_vector_from_val (&seq, vectype,
5871 : : scalar_value);
5872 : : }
5873 : :
5874 : : /* Calculate the equivalent of:
5875 : :
5876 : : sel[j] = (index[j] == i);
5877 : :
5878 : : which selects the elements of REDUC_INPUTS[0] that should
5879 : : be included in the result. */
5880 : : tree compare_val = build_int_cst (index_elt_type, i);
5881 : : compare_val = build_vector_from_val (index_type, compare_val);
5882 : : tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5883 : : index, compare_val);
5884 : :
5885 : : /* Calculate the equivalent of:
5886 : :
5887 : : vec = seq ? reduc_inputs[0] : vector_identity;
5888 : :
5889 : : VEC is now suitable for a full vector reduction. */
5890 : : tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5891 : : sel, reduc_inputs[0], vector_identity);
5892 : :
5893 : : /* Do the reduction and convert it to the appropriate type. */
5894 : : tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5895 : : TREE_TYPE (vectype), vec);
5896 : : scalar = gimple_convert (&seq, scalar_type, scalar);
5897 : : scalar_results.safe_push (scalar);
5898 : : }
5899 : : gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5900 : : }
5901 : : else
5902 : : {
5903 : 1654 : bool reduce_with_shift;
5904 : 1654 : tree vec_temp;
5905 : :
5906 : 1654 : gcc_assert (slp_reduc || reduc_inputs.length () == 1);
5907 : :
5908 : : /* See if the target wants to do the final (shift) reduction
5909 : : in a vector mode of smaller size and first reduce upper/lower
5910 : : halves against each other. */
5911 : 1840 : enum machine_mode mode1 = mode;
5912 : 1840 : tree stype = TREE_TYPE (vectype);
5913 : 1840 : if (compute_vectype != vectype)
5914 : : {
5915 : 457 : stype = unsigned_type_for (stype);
5916 : 457 : gimple_seq stmts = NULL;
5917 : 940 : for (unsigned i = 0; i < reduc_inputs.length (); ++i)
5918 : : {
5919 : 483 : tree new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR,
5920 : 483 : compute_vectype, reduc_inputs[i]);
5921 : 483 : reduc_inputs[i] = new_temp;
5922 : : }
5923 : 457 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5924 : : }
5925 : 1840 : unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5926 : 1840 : unsigned nunits1 = nunits;
5927 : 1840 : if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5928 : 1840 : && reduc_inputs.length () == 1)
5929 : : {
5930 : 43 : nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5931 : : /* For SLP reductions we have to make sure lanes match up, but
5932 : : since we're doing individual element final reduction reducing
5933 : : vector width here is even more important.
5934 : : ??? We can also separate lanes with permutes, for the common
5935 : : case of power-of-two group-size odd/even extracts would work. */
5936 : 43 : if (slp_reduc && nunits != nunits1)
5937 : : {
5938 : 43 : nunits1 = least_common_multiple (nunits1, group_size);
5939 : 86 : gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5940 : : }
5941 : : }
5942 : 1797 : else if (!slp_reduc
5943 : 1797 : && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5944 : 0 : nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5945 : :
5946 : 1840 : tree vectype1 = vectype;
5947 : 1840 : if (mode1 != mode)
5948 : : {
5949 : 44 : vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5950 : 44 : stype, nunits1);
5951 : : /* First reduce the vector to the desired vector size we should
5952 : : do shift reduction on by combining upper and lower halves. */
5953 : 44 : gimple_seq stmts = NULL;
5954 : 44 : new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
5955 : : code, &stmts);
5956 : 44 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5957 : 44 : reduc_inputs[0] = new_temp;
5958 : : }
5959 : :
5960 : 1840 : reduce_with_shift = have_whole_vector_shift (mode1);
5961 : 733 : if (!VECTOR_MODE_P (mode1)
5962 : 2571 : || !directly_supported_p (code, vectype1))
5963 : : reduce_with_shift = false;
5964 : :
5965 : 1823 : if (reduce_with_shift && (!slp_reduc || group_size == 1))
5966 : : {
5967 : 1600 : tree bitsize = TYPE_SIZE (TREE_TYPE (vectype1));
5968 : 1600 : int element_bitsize = tree_to_uhwi (bitsize);
5969 : : /* Enforced by vectorizable_reduction, which disallows SLP reductions
5970 : : for variable-length vectors and also requires direct target support
5971 : : for loop reductions. */
5972 : 1600 : int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5973 : 1600 : int nelements = vec_size_in_bits / element_bitsize;
5974 : 1600 : vec_perm_builder sel;
5975 : 1600 : vec_perm_indices indices;
5976 : :
5977 : 1600 : int elt_offset;
5978 : :
5979 : 1600 : tree zero_vec = build_zero_cst (vectype1);
5980 : : /* Case 2: Create:
5981 : : for (offset = nelements/2; offset >= 1; offset/=2)
5982 : : {
5983 : : Create: va' = vec_shift <va, offset>
5984 : : Create: va = vop <va, va'>
5985 : : } */
5986 : :
5987 : 1600 : if (dump_enabled_p ())
5988 : 348 : dump_printf_loc (MSG_NOTE, vect_location,
5989 : : "Reduce using vector shifts\n");
5990 : :
5991 : 1600 : gimple_seq stmts = NULL;
5992 : 1600 : new_temp = gimple_convert (&stmts, vectype1, reduc_inputs[0]);
5993 : 1600 : for (elt_offset = nelements / 2;
5994 : 3491 : elt_offset >= 1;
5995 : 1891 : elt_offset /= 2)
5996 : : {
5997 : 1891 : calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5998 : 1891 : indices.new_vector (sel, 2, nelements);
5999 : 1891 : tree mask = vect_gen_perm_mask_any (vectype1, indices);
6000 : 1891 : new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6001 : : new_temp, zero_vec, mask);
6002 : 1891 : new_temp = gimple_build (&stmts, code,
6003 : : vectype1, new_name, new_temp);
6004 : : }
6005 : :
6006 : : /* 2.4 Extract the final scalar result. Create:
6007 : : s_out3 = extract_field <v_out2, bitpos> */
6008 : :
6009 : 1600 : if (dump_enabled_p ())
6010 : 348 : dump_printf_loc (MSG_NOTE, vect_location,
6011 : : "extract scalar result\n");
6012 : :
6013 : 1600 : new_temp = gimple_build (&stmts, BIT_FIELD_REF, TREE_TYPE (vectype1),
6014 : 1600 : new_temp, bitsize, bitsize_zero_node);
6015 : 1600 : new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6016 : 1600 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6017 : 1600 : scalar_results.safe_push (new_temp);
6018 : 1600 : }
6019 : : else
6020 : : {
6021 : : /* Case 3: Create:
6022 : : s = extract_field <v_out2, 0>
6023 : : for (offset = element_size;
6024 : : offset < vector_size;
6025 : : offset += element_size;)
6026 : : {
6027 : : Create: s' = extract_field <v_out2, offset>
6028 : : Create: s = op <s, s'> // For non SLP cases
6029 : : } */
6030 : :
6031 : 240 : if (dump_enabled_p ())
6032 : 143 : dump_printf_loc (MSG_NOTE, vect_location,
6033 : : "Reduce using scalar code.\n");
6034 : :
6035 : 240 : tree compute_type = TREE_TYPE (vectype1);
6036 : 240 : unsigned vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6037 : 240 : unsigned element_bitsize = vector_element_bits (vectype1);
6038 : 240 : tree bitsize = bitsize_int (element_bitsize);
6039 : 240 : gimple_seq stmts = NULL;
6040 : 586 : FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6041 : : {
6042 : 346 : unsigned bit_offset;
6043 : 692 : new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6044 : 346 : vec_temp, bitsize, bitsize_zero_node);
6045 : :
6046 : : /* In SLP we don't need to apply reduction operation, so we just
6047 : : collect s' values in SCALAR_RESULTS. */
6048 : 346 : if (slp_reduc)
6049 : 336 : scalar_results.safe_push (new_temp);
6050 : :
6051 : 712 : for (bit_offset = element_bitsize;
6052 : 1058 : bit_offset < vec_size_in_bits;
6053 : 712 : bit_offset += element_bitsize)
6054 : : {
6055 : 712 : tree bitpos = bitsize_int (bit_offset);
6056 : 712 : new_name = gimple_build (&stmts, BIT_FIELD_REF,
6057 : : compute_type, vec_temp,
6058 : : bitsize, bitpos);
6059 : 712 : if (slp_reduc)
6060 : : {
6061 : : /* In SLP we don't need to apply reduction operation, so
6062 : : we just collect s' values in SCALAR_RESULTS. */
6063 : 702 : new_temp = new_name;
6064 : 702 : scalar_results.safe_push (new_name);
6065 : : }
6066 : : else
6067 : 10 : new_temp = gimple_build (&stmts, code, compute_type,
6068 : : new_name, new_temp);
6069 : : }
6070 : : }
6071 : :
6072 : : /* The only case where we need to reduce scalar results in a SLP
6073 : : reduction, is unrolling. If the size of SCALAR_RESULTS is
6074 : : greater than GROUP_SIZE, we reduce them combining elements modulo
6075 : : GROUP_SIZE. */
6076 : 240 : if (slp_reduc)
6077 : : {
6078 : 230 : tree res, first_res, new_res;
6079 : :
6080 : : /* Reduce multiple scalar results in case of SLP unrolling. */
6081 : 585 : for (j = group_size; scalar_results.iterate (j, &res);
6082 : : j++)
6083 : : {
6084 : 355 : first_res = scalar_results[j % group_size];
6085 : 355 : new_res = gimple_build (&stmts, code, compute_type,
6086 : : first_res, res);
6087 : 355 : scalar_results[j % group_size] = new_res;
6088 : : }
6089 : 230 : scalar_results.truncate (group_size);
6090 : 1143 : for (k = 0; k < group_size; k++)
6091 : 1366 : scalar_results[k] = gimple_convert (&stmts, scalar_type,
6092 : 683 : scalar_results[k]);
6093 : : }
6094 : : else
6095 : : {
6096 : : /* Reduction chain - we have one scalar to keep in
6097 : : SCALAR_RESULTS. */
6098 : 10 : new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6099 : 10 : scalar_results.safe_push (new_temp);
6100 : : }
6101 : :
6102 : 240 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6103 : : }
6104 : :
6105 : 1840 : if ((VECT_REDUC_INFO_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6106 : 0 : && induc_val)
6107 : : {
6108 : : /* Earlier we set the initial value to be a vector if induc_val
6109 : : values. Check the result and if it is induc_val then replace
6110 : : with the original initial value, unless induc_val is
6111 : : the same as initial_def already. */
6112 : 0 : tree zcompare = make_ssa_name (boolean_type_node);
6113 : 0 : epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6114 : 0 : scalar_results[0], induc_val);
6115 : 0 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6116 : 0 : tree initial_def = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info)[0];
6117 : 0 : tree tmp = make_ssa_name (new_scalar_dest);
6118 : 0 : epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6119 : 0 : initial_def, scalar_results[0]);
6120 : 0 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6121 : 0 : scalar_results[0] = tmp;
6122 : : }
6123 : : }
6124 : :
6125 : : /* 2.5 Adjust the final result by the initial value of the reduction
6126 : : variable. (When such adjustment is not needed, then
6127 : : 'adjustment_def' is zero). For example, if code is PLUS we create:
6128 : : new_temp = loop_exit_def + adjustment_def */
6129 : :
6130 : 21904 : if (adjustment_def)
6131 : : {
6132 : 15823 : gcc_assert (!slp_reduc || group_size == 1);
6133 : 15823 : gimple_seq stmts = NULL;
6134 : 15823 : if (double_reduc)
6135 : : {
6136 : 0 : gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6137 : 0 : adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6138 : 0 : new_temp = gimple_build (&stmts, code, vectype,
6139 : 0 : reduc_inputs[0], adjustment_def);
6140 : : }
6141 : : else
6142 : : {
6143 : 15823 : new_temp = scalar_results[0];
6144 : 15823 : gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6145 : 15823 : adjustment_def = gimple_convert (&stmts, TREE_TYPE (compute_vectype),
6146 : : adjustment_def);
6147 : 15823 : new_temp = gimple_convert (&stmts, TREE_TYPE (compute_vectype),
6148 : : new_temp);
6149 : 15823 : new_temp = gimple_build (&stmts, code, TREE_TYPE (compute_vectype),
6150 : : new_temp, adjustment_def);
6151 : 15823 : new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6152 : : }
6153 : :
6154 : 15823 : epilog_stmt = gimple_seq_last_stmt (stmts);
6155 : 15823 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6156 : 15823 : scalar_results[0] = new_temp;
6157 : : }
6158 : :
6159 : : /* Record this operation if it could be reused by the epilogue loop. */
6160 : 21904 : if (VECT_REDUC_INFO_TYPE (reduc_info) == TREE_CODE_REDUCTION
6161 : 21904 : && reduc_inputs.length () == 1)
6162 : 21717 : loop_vinfo->reusable_accumulators.put (scalar_results[0],
6163 : : { orig_reduc_input, reduc_info });
6164 : :
6165 : : /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6166 : : phis with new adjusted scalar results, i.e., replace use <s_out0>
6167 : : with use <s_out4>.
6168 : :
6169 : : Transform:
6170 : : loop_exit:
6171 : : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6172 : : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6173 : : v_out2 = reduce <v_out1>
6174 : : s_out3 = extract_field <v_out2, 0>
6175 : : s_out4 = adjust_result <s_out3>
6176 : : use <s_out0>
6177 : : use <s_out0>
6178 : :
6179 : : into:
6180 : :
6181 : : loop_exit:
6182 : : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6183 : : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6184 : : v_out2 = reduce <v_out1>
6185 : : s_out3 = extract_field <v_out2, 0>
6186 : : s_out4 = adjust_result <s_out3>
6187 : : use <s_out4>
6188 : : use <s_out4> */
6189 : :
6190 : 43808 : gcc_assert (live_out_stmts.size () == scalar_results.length ());
6191 : 21904 : auto_vec<gimple *> phis;
6192 : 44261 : for (k = 0; k < live_out_stmts.size (); k++)
6193 : : {
6194 : 22357 : stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6195 : 22357 : tree scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6196 : :
6197 : : /* Find the loop-closed-use at the loop exit of the original scalar
6198 : : result. (The reduction result is expected to have two immediate uses,
6199 : : one at the latch block, and one at the loop exit). Note with
6200 : : early break we can have two exit blocks, so pick the correct PHI. */
6201 : 113692 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6202 : 68978 : if (!is_gimple_debug (USE_STMT (use_p))
6203 : 68978 : && !flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6204 : : {
6205 : 22352 : gcc_assert (is_a <gphi *> (USE_STMT (use_p)));
6206 : 22352 : if (gimple_bb (USE_STMT (use_p)) == loop_exit->dest)
6207 : 22344 : phis.safe_push (USE_STMT (use_p));
6208 : 22357 : }
6209 : :
6210 : 44701 : FOR_EACH_VEC_ELT (phis, i, exit_phi)
6211 : : {
6212 : : /* Replace the uses: */
6213 : 22344 : orig_name = PHI_RESULT (exit_phi);
6214 : :
6215 : : /* Look for a single use at the target of the skip edge. */
6216 : 22344 : if (unify_with_main_loop_p)
6217 : : {
6218 : 38 : use_operand_p use_p;
6219 : 38 : gimple *user;
6220 : 38 : if (!single_imm_use (orig_name, &use_p, &user))
6221 : 0 : gcc_unreachable ();
6222 : 38 : orig_name = gimple_get_lhs (user);
6223 : : }
6224 : :
6225 : 22344 : scalar_result = scalar_results[k];
6226 : 82935 : FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6227 : : {
6228 : 114785 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6229 : 38269 : SET_USE (use_p, scalar_result);
6230 : 38247 : update_stmt (use_stmt);
6231 : 22344 : }
6232 : : }
6233 : :
6234 : 22357 : phis.truncate (0);
6235 : : }
6236 : 21904 : }
6237 : :
6238 : : /* Return a vector of type VECTYPE that is equal to the vector select
6239 : : operation "MASK ? VEC : IDENTITY". Insert the select statements
6240 : : before GSI. */
6241 : :
6242 : : static tree
6243 : 9 : merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6244 : : tree vec, tree identity)
6245 : : {
6246 : 9 : tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6247 : 9 : gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6248 : : mask, vec, identity);
6249 : 9 : gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6250 : 9 : return cond;
6251 : : }
6252 : :
6253 : : /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6254 : : order, starting with LHS. Insert the extraction statements before GSI and
6255 : : associate the new scalar SSA names with variable SCALAR_DEST.
6256 : : If MASK is nonzero mask the input and then operate on it unconditionally.
6257 : : Return the SSA name for the result. */
6258 : :
6259 : : static tree
6260 : 1043 : vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6261 : : tree_code code, tree lhs, tree vector_rhs,
6262 : : tree mask)
6263 : : {
6264 : 1043 : tree vectype = TREE_TYPE (vector_rhs);
6265 : 1043 : tree scalar_type = TREE_TYPE (vectype);
6266 : 1043 : tree bitsize = TYPE_SIZE (scalar_type);
6267 : 1043 : unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6268 : 1043 : unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6269 : :
6270 : : /* Re-create a VEC_COND_EXPR to mask the input here in order to be able
6271 : : to perform an unconditional element-wise reduction of it. */
6272 : 1043 : if (mask)
6273 : : {
6274 : 45 : tree masked_vector_rhs = make_temp_ssa_name (vectype, NULL,
6275 : : "masked_vector_rhs");
6276 : 45 : tree neutral_op = neutral_op_for_reduction (scalar_type, code, NULL_TREE,
6277 : : false);
6278 : 45 : tree vector_identity = build_vector_from_val (vectype, neutral_op);
6279 : 45 : gassign *select = gimple_build_assign (masked_vector_rhs, VEC_COND_EXPR,
6280 : : mask, vector_rhs, vector_identity);
6281 : 45 : gsi_insert_before (gsi, select, GSI_SAME_STMT);
6282 : 45 : vector_rhs = masked_vector_rhs;
6283 : : }
6284 : :
6285 : 1043 : for (unsigned HOST_WIDE_INT bit_offset = 0;
6286 : 4647 : bit_offset < vec_size_in_bits;
6287 : 3604 : bit_offset += element_bitsize)
6288 : : {
6289 : 3604 : tree bitpos = bitsize_int (bit_offset);
6290 : 3604 : tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6291 : : bitsize, bitpos);
6292 : :
6293 : 3604 : gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6294 : 3604 : rhs = make_ssa_name (scalar_dest, stmt);
6295 : 3604 : gimple_assign_set_lhs (stmt, rhs);
6296 : 3604 : gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6297 : : /* Fold the vector extract, combining it with a previous reversal
6298 : : like seen in PR90579. */
6299 : 3604 : auto gsi2 = gsi_for_stmt (stmt);
6300 : 3604 : if (fold_stmt (&gsi2, follow_all_ssa_edges))
6301 : 356 : update_stmt (gsi_stmt (gsi2));
6302 : :
6303 : 3604 : stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6304 : 3604 : tree new_name = make_ssa_name (scalar_dest, stmt);
6305 : 3604 : gimple_assign_set_lhs (stmt, new_name);
6306 : 3604 : gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6307 : 3604 : lhs = new_name;
6308 : : }
6309 : 1043 : return lhs;
6310 : : }
6311 : :
6312 : : /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6313 : : type of the vector input. */
6314 : :
6315 : : static internal_fn
6316 : 2520 : get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6317 : : {
6318 : 2520 : internal_fn mask_reduc_fn;
6319 : 2520 : internal_fn mask_len_reduc_fn;
6320 : :
6321 : 2520 : switch (reduc_fn)
6322 : : {
6323 : 0 : case IFN_FOLD_LEFT_PLUS:
6324 : 0 : mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6325 : 0 : mask_len_reduc_fn = IFN_MASK_LEN_FOLD_LEFT_PLUS;
6326 : 0 : break;
6327 : :
6328 : : default:
6329 : : return IFN_LAST;
6330 : : }
6331 : :
6332 : 0 : if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6333 : : OPTIMIZE_FOR_SPEED))
6334 : : return mask_reduc_fn;
6335 : 0 : if (direct_internal_fn_supported_p (mask_len_reduc_fn, vectype_in,
6336 : : OPTIMIZE_FOR_SPEED))
6337 : : return mask_len_reduc_fn;
6338 : : return IFN_LAST;
6339 : : }
6340 : :
6341 : : /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
6342 : : statement that sets the live-out value. REDUC_DEF_STMT is the phi
6343 : : statement. CODE is the operation performed by STMT_INFO and OPS are
6344 : : its scalar operands. REDUC_INDEX is the index of the operand in
6345 : : OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6346 : : implements in-order reduction, or IFN_LAST if we should open-code it.
6347 : : VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6348 : : that should be used to control the operation in a fully-masked loop. */
6349 : :
6350 : : static bool
6351 : 830 : vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6352 : : stmt_vec_info stmt_info,
6353 : : gimple_stmt_iterator *gsi,
6354 : : slp_tree slp_node,
6355 : : code_helper code, internal_fn reduc_fn,
6356 : : int num_ops, tree vectype_in,
6357 : : int reduc_index, vec_loop_masks *masks,
6358 : : vec_loop_lens *lens)
6359 : : {
6360 : 830 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6361 : 830 : tree vectype_out = SLP_TREE_VECTYPE (slp_node);
6362 : 830 : internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6363 : :
6364 : 830 : gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6365 : :
6366 : 830 : bool is_cond_op = false;
6367 : 830 : if (!code.is_tree_code ())
6368 : : {
6369 : 15 : code = conditional_internal_fn_code (internal_fn (code));
6370 : 15 : gcc_assert (code != ERROR_MARK);
6371 : : is_cond_op = true;
6372 : : }
6373 : :
6374 : 830 : gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
6375 : :
6376 : 830 : gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6377 : : TYPE_VECTOR_SUBPARTS (vectype_in)));
6378 : :
6379 : : /* ??? We should, when transforming the cycle PHI, record the existing
6380 : : scalar def as vector def so looking up the vector def works. This
6381 : : would also allow generalizing this for reduction paths of length > 1
6382 : : and/or SLP reductions. */
6383 : 830 : slp_tree reduc_node = SLP_TREE_CHILDREN (slp_node)[reduc_index];
6384 : 830 : stmt_vec_info reduc_var_def = SLP_TREE_SCALAR_STMTS (reduc_node)[0];
6385 : 830 : tree reduc_var = gimple_get_lhs (STMT_VINFO_STMT (reduc_var_def));
6386 : :
6387 : : /* The operands either come from a binary operation or an IFN_COND operation.
6388 : : The former is a gimple assign with binary rhs and the latter is a
6389 : : gimple call with four arguments. */
6390 : 830 : gcc_assert (num_ops == 2 || num_ops == 4);
6391 : :
6392 : 830 : auto_vec<tree> vec_oprnds0, vec_opmask;
6393 : 830 : vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[(is_cond_op ? 2 : 0)
6394 : 830 : + (1 - reduc_index)],
6395 : : &vec_oprnds0);
6396 : : /* For an IFN_COND_OP we also need the vector mask operand. */
6397 : 830 : if (is_cond_op)
6398 : 15 : vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[0], &vec_opmask);
6399 : :
6400 : : /* The transform below relies on preserving the original scalar PHI
6401 : : and its latch def which we replace. So work backwards from there. */
6402 : 830 : tree scalar_dest
6403 : 830 : = gimple_phi_arg_def_from_edge (as_a <gphi *> (STMT_VINFO_STMT
6404 : : (reduc_var_def)),
6405 : 830 : loop_latch_edge (loop));
6406 : 830 : stmt_vec_info scalar_dest_def_info
6407 : 830 : = vect_stmt_to_vectorize (loop_vinfo->lookup_def (scalar_dest));
6408 : 830 : tree scalar_type = TREE_TYPE (scalar_dest);
6409 : :
6410 : 830 : int vec_num = vec_oprnds0.length ();
6411 : 830 : tree vec_elem_type = TREE_TYPE (vectype_out);
6412 : 830 : gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6413 : :
6414 : 830 : tree vector_identity = NULL_TREE;
6415 : 830 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6416 : : {
6417 : 2 : vector_identity = build_zero_cst (vectype_out);
6418 : 2 : if (!HONOR_SIGNED_ZEROS (vectype_out))
6419 : : ;
6420 : : else
6421 : : {
6422 : 2 : gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));
6423 : 2 : vector_identity = const_unop (NEGATE_EXPR, vectype_out,
6424 : : vector_identity);
6425 : : }
6426 : : }
6427 : :
6428 : 830 : tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6429 : 830 : int i;
6430 : 830 : tree def0;
6431 : 1873 : FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6432 : : {
6433 : 1043 : gimple *new_stmt;
6434 : 1043 : tree mask = NULL_TREE;
6435 : 1043 : tree len = NULL_TREE;
6436 : 1043 : tree bias = NULL_TREE;
6437 : 1043 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6438 : : {
6439 : 9 : tree loop_mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
6440 : : vec_num, vectype_in, i);
6441 : 9 : if (is_cond_op)
6442 : 9 : mask = prepare_vec_mask (loop_vinfo, TREE_TYPE (loop_mask),
6443 : 9 : loop_mask, vec_opmask[i], gsi);
6444 : : else
6445 : : mask = loop_mask;
6446 : : }
6447 : 1034 : else if (is_cond_op)
6448 : 36 : mask = vec_opmask[i];
6449 : 1043 : if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
6450 : : {
6451 : 0 : len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
6452 : : i, 1);
6453 : 0 : signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
6454 : 0 : bias = build_int_cst (intQI_type_node, biasval);
6455 : 0 : if (!is_cond_op)
6456 : 0 : mask = build_minus_one_cst (truth_type_for (vectype_in));
6457 : : }
6458 : :
6459 : : /* Handle MINUS by adding the negative. */
6460 : 1043 : if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6461 : : {
6462 : 0 : tree negated = make_ssa_name (vectype_out);
6463 : 0 : new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6464 : 0 : gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6465 : 0 : def0 = negated;
6466 : : }
6467 : :
6468 : 9 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
6469 : 1052 : && mask && mask_reduc_fn == IFN_LAST)
6470 : 9 : def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6471 : : vector_identity);
6472 : :
6473 : : /* On the first iteration the input is simply the scalar phi
6474 : : result, and for subsequent iterations it is the output of
6475 : : the preceding operation. */
6476 : 1043 : if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6477 : : {
6478 : 0 : if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
6479 : 0 : new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
6480 : : def0, mask, len, bias);
6481 : 0 : else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
6482 : 0 : new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6483 : : def0, mask);
6484 : : else
6485 : 0 : new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6486 : : def0);
6487 : : /* For chained SLP reductions the output of the previous reduction
6488 : : operation serves as the input of the next. For the final statement
6489 : : the output cannot be a temporary - we reuse the original
6490 : : scalar destination of the last statement. */
6491 : 0 : if (i != vec_num - 1)
6492 : : {
6493 : 0 : gimple_set_lhs (new_stmt, scalar_dest_var);
6494 : 0 : reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6495 : 0 : gimple_set_lhs (new_stmt, reduc_var);
6496 : : }
6497 : : }
6498 : : else
6499 : : {
6500 : 1043 : reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
6501 : : tree_code (code), reduc_var, def0,
6502 : : mask);
6503 : 1043 : new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6504 : : /* Remove the statement, so that we can use the same code paths
6505 : : as for statements that we've just created. */
6506 : 1043 : gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6507 : 1043 : gsi_remove (&tmp_gsi, true);
6508 : : }
6509 : :
6510 : 1043 : if (i == vec_num - 1)
6511 : : {
6512 : 830 : gimple_set_lhs (new_stmt, scalar_dest);
6513 : 830 : vect_finish_replace_stmt (loop_vinfo,
6514 : : scalar_dest_def_info,
6515 : : new_stmt);
6516 : : }
6517 : : else
6518 : 213 : vect_finish_stmt_generation (loop_vinfo,
6519 : : scalar_dest_def_info,
6520 : : new_stmt, gsi);
6521 : :
6522 : 1043 : slp_node->push_vec_def (new_stmt);
6523 : : }
6524 : :
6525 : 830 : return true;
6526 : 830 : }
6527 : :
6528 : : /* Function is_nonwrapping_integer_induction.
6529 : :
6530 : : Check if STMT_VINO (which is part of loop LOOP) both increments and
6531 : : does not cause overflow. */
6532 : :
6533 : : static bool
6534 : 411 : is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6535 : : {
6536 : 411 : gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6537 : 411 : tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6538 : 411 : tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6539 : 411 : tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6540 : 411 : widest_int ni, max_loop_value, lhs_max;
6541 : 411 : wi::overflow_type overflow = wi::OVF_NONE;
6542 : :
6543 : : /* Make sure the loop is integer based. */
6544 : 411 : if (TREE_CODE (base) != INTEGER_CST
6545 : 112 : || TREE_CODE (step) != INTEGER_CST)
6546 : : return false;
6547 : :
6548 : : /* Check that the max size of the loop will not wrap. */
6549 : :
6550 : 112 : if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6551 : : return true;
6552 : :
6553 : 8 : if (! max_stmt_executions (loop, &ni))
6554 : : return false;
6555 : :
6556 : 8 : max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6557 : 8 : &overflow);
6558 : 8 : if (overflow)
6559 : : return false;
6560 : :
6561 : 8 : max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6562 : 16 : TYPE_SIGN (lhs_type), &overflow);
6563 : 8 : if (overflow)
6564 : : return false;
6565 : :
6566 : 8 : return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6567 : 8 : <= TYPE_PRECISION (lhs_type));
6568 : 411 : }
6569 : :
6570 : : /* Check if masking can be supported by inserting a conditional expression.
6571 : : CODE is the code for the operation. COND_FN is the conditional internal
6572 : : function, if it exists. VECTYPE_IN is the type of the vector input. */
6573 : : static bool
6574 : 5264 : use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
6575 : : tree vectype_in)
6576 : : {
6577 : 5264 : if (cond_fn != IFN_LAST
6578 : 5264 : && direct_internal_fn_supported_p (cond_fn, vectype_in,
6579 : : OPTIMIZE_FOR_SPEED))
6580 : : return false;
6581 : :
6582 : 3770 : if (code.is_tree_code ())
6583 : 3768 : switch (tree_code (code))
6584 : : {
6585 : : case DOT_PROD_EXPR:
6586 : : case SAD_EXPR:
6587 : : return true;
6588 : :
6589 : : default:
6590 : : break;
6591 : : }
6592 : : return false;
6593 : : }
6594 : :
6595 : : /* Insert a conditional expression to enable masked vectorization. CODE is the
6596 : : code for the operation. VOP is the array of operands. MASK is the loop
6597 : : mask. GSI is a statement iterator used to place the new conditional
6598 : : expression. */
6599 : : static void
6600 : 4 : build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
6601 : : gimple_stmt_iterator *gsi)
6602 : : {
6603 : 4 : switch (tree_code (code))
6604 : : {
6605 : 4 : case DOT_PROD_EXPR:
6606 : 4 : {
6607 : 4 : tree vectype = TREE_TYPE (vop[1]);
6608 : 4 : tree zero = build_zero_cst (vectype);
6609 : 4 : tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6610 : 4 : gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6611 : : mask, vop[1], zero);
6612 : 4 : gsi_insert_before (gsi, select, GSI_SAME_STMT);
6613 : 4 : vop[1] = masked_op1;
6614 : 4 : break;
6615 : : }
6616 : :
6617 : 0 : case SAD_EXPR:
6618 : 0 : {
6619 : 0 : tree vectype = TREE_TYPE (vop[1]);
6620 : 0 : tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6621 : 0 : gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6622 : : mask, vop[1], vop[0]);
6623 : 0 : gsi_insert_before (gsi, select, GSI_SAME_STMT);
6624 : 0 : vop[1] = masked_op1;
6625 : 0 : break;
6626 : : }
6627 : :
6628 : 0 : default:
6629 : 0 : gcc_unreachable ();
6630 : : }
6631 : 4 : }
6632 : :
6633 : : /* Given an operation with CODE in loop reduction path whose reduction PHI is
6634 : : specified by REDUC_INFO, the operation has TYPE of scalar result, and its
6635 : : input vectype is represented by VECTYPE_IN. The vectype of vectorized result
6636 : : may be different from VECTYPE_IN, either in base type or vectype lanes,
6637 : : lane-reducing operation is the case. This function check if it is possible,
6638 : : and how to perform partial vectorization on the operation in the context
6639 : : of LOOP_VINFO. */
6640 : :
6641 : : static void
6642 : 3392 : vect_reduction_update_partial_vector_usage (loop_vec_info loop_vinfo,
6643 : : vect_reduc_info reduc_info,
6644 : : slp_tree slp_node,
6645 : : code_helper code, tree type,
6646 : : tree vectype_in)
6647 : : {
6648 : 3392 : enum vect_reduction_type reduc_type = VECT_REDUC_INFO_TYPE (reduc_info);
6649 : 3392 : internal_fn reduc_fn = VECT_REDUC_INFO_FN (reduc_info);
6650 : 3392 : internal_fn cond_fn
6651 : 920 : = ((code.is_internal_fn ()
6652 : 920 : && internal_fn_mask_index ((internal_fn)code) != -1)
6653 : 3392 : ? (internal_fn)code : get_conditional_internal_fn (code, type));
6654 : :
6655 : 3392 : if (reduc_type != FOLD_LEFT_REDUCTION
6656 : 2717 : && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
6657 : 6066 : && (cond_fn == IFN_LAST
6658 : 2674 : || !direct_internal_fn_supported_p (cond_fn, vectype_in,
6659 : : OPTIMIZE_FOR_SPEED)))
6660 : : {
6661 : 1702 : if (dump_enabled_p ())
6662 : 97 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6663 : : "can't operate on partial vectors because"
6664 : : " no conditional operation is available.\n");
6665 : 1702 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
6666 : : }
6667 : 1690 : else if (reduc_type == FOLD_LEFT_REDUCTION
6668 : 1690 : && reduc_fn == IFN_LAST
6669 : 1690 : && !expand_vec_cond_expr_p (vectype_in, truth_type_for (vectype_in)))
6670 : : {
6671 : 0 : if (dump_enabled_p ())
6672 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6673 : : "can't operate on partial vectors because"
6674 : : " no conditional operation is available.\n");
6675 : 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
6676 : : }
6677 : 1690 : else if (reduc_type == FOLD_LEFT_REDUCTION
6678 : 675 : && internal_fn_mask_index (reduc_fn) == -1
6679 : 675 : && FLOAT_TYPE_P (vectype_in)
6680 : 2360 : && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
6681 : : {
6682 : 0 : if (dump_enabled_p ())
6683 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6684 : : "can't operate on partial vectors because"
6685 : : " signed zeros cannot be preserved.\n");
6686 : 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
6687 : : }
6688 : : else
6689 : : {
6690 : 1690 : internal_fn mask_reduc_fn
6691 : 1690 : = get_masked_reduction_fn (reduc_fn, vectype_in);
6692 : 1690 : vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6693 : 1690 : vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
6694 : 1690 : unsigned nvectors = vect_get_num_copies (loop_vinfo, slp_node);
6695 : :
6696 : 1690 : if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
6697 : 0 : vect_record_loop_len (loop_vinfo, lens, nvectors, vectype_in, 1);
6698 : : else
6699 : 1690 : vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype_in, NULL);
6700 : : }
6701 : 3392 : }
6702 : :
6703 : : /* Check if STMT_INFO is a lane-reducing operation that can be vectorized in
6704 : : the context of LOOP_VINFO, and vector cost will be recorded in COST_VEC,
6705 : : and the analysis is for slp if SLP_NODE is not NULL.
6706 : :
6707 : : For a lane-reducing operation, the loop reduction path that it lies in,
6708 : : may contain normal operation, or other lane-reducing operation of different
6709 : : input type size, an example as:
6710 : :
6711 : : int sum = 0;
6712 : : for (i)
6713 : : {
6714 : : ...
6715 : : sum += d0[i] * d1[i]; // dot-prod <vector(16) char>
6716 : : sum += w[i]; // widen-sum <vector(16) char>
6717 : : sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
6718 : : sum += n[i]; // normal <vector(4) int>
6719 : : ...
6720 : : }
6721 : :
6722 : : Vectorization factor is essentially determined by operation whose input
6723 : : vectype has the most lanes ("vector(16) char" in the example), while we
6724 : : need to choose input vectype with the least lanes ("vector(4) int" in the
6725 : : example) to determine effective number of vector reduction PHIs. */
6726 : :
6727 : : bool
6728 : 306377 : vectorizable_lane_reducing (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
6729 : : slp_tree slp_node, stmt_vector_for_cost *cost_vec)
6730 : : {
6731 : 306377 : gimple *stmt = stmt_info->stmt;
6732 : :
6733 : 306377 : if (!lane_reducing_stmt_p (stmt))
6734 : : return false;
6735 : :
6736 : 462 : tree type = TREE_TYPE (gimple_assign_lhs (stmt));
6737 : :
6738 : 462 : if (!INTEGRAL_TYPE_P (type))
6739 : : return false;
6740 : :
6741 : : /* Do not try to vectorize bit-precision reductions. */
6742 : 462 : if (!type_has_mode_precision_p (type))
6743 : : return false;
6744 : :
6745 : 462 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
6746 : :
6747 : : /* TODO: Support lane-reducing operation that does not directly participate
6748 : : in loop reduction. */
6749 : 462 : if (!reduc_info)
6750 : : return false;
6751 : :
6752 : : /* Lane-reducing pattern inside any inner loop of LOOP_VINFO is not
6753 : : recoginized. */
6754 : 462 : gcc_assert (!nested_in_vect_loop_p (LOOP_VINFO_LOOP (loop_vinfo), stmt_info));
6755 : 462 : gcc_assert (VECT_REDUC_INFO_TYPE (reduc_info) == TREE_CODE_REDUCTION);
6756 : :
6757 : 1848 : for (int i = 0; i < (int) gimple_num_ops (stmt) - 1; i++)
6758 : : {
6759 : 1386 : slp_tree slp_op;
6760 : 1386 : tree op;
6761 : 1386 : tree vectype;
6762 : 1386 : enum vect_def_type dt;
6763 : :
6764 : 1386 : if (!vect_is_simple_use (loop_vinfo, slp_node, i, &op,
6765 : : &slp_op, &dt, &vectype))
6766 : : {
6767 : 0 : if (dump_enabled_p ())
6768 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6769 : : "use not simple.\n");
6770 : 0 : return false;
6771 : : }
6772 : :
6773 : 1386 : if (!vectype)
6774 : : {
6775 : 6 : vectype = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op),
6776 : : slp_op);
6777 : 6 : if (!vectype)
6778 : : return false;
6779 : : }
6780 : :
6781 : 1386 : if (!vect_maybe_update_slp_op_vectype (slp_op, vectype))
6782 : : {
6783 : 0 : if (dump_enabled_p ())
6784 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6785 : : "incompatible vector types for invariants\n");
6786 : 0 : return false;
6787 : : }
6788 : :
6789 : 1386 : if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6790 : 462 : continue;
6791 : :
6792 : : /* There should be at most one cycle def in the stmt. */
6793 : 924 : if (VECTORIZABLE_CYCLE_DEF (dt))
6794 : : return false;
6795 : : }
6796 : :
6797 : 462 : slp_tree node_in = SLP_TREE_CHILDREN (slp_node)[0];
6798 : 462 : tree vectype_in = SLP_TREE_VECTYPE (node_in);
6799 : 462 : gcc_assert (vectype_in);
6800 : :
6801 : : /* Compute number of effective vector statements for costing. */
6802 : 462 : unsigned int ncopies_for_cost = vect_get_num_copies (loop_vinfo, node_in);
6803 : 462 : gcc_assert (ncopies_for_cost >= 1);
6804 : :
6805 : 462 : if (vect_is_emulated_mixed_dot_prod (slp_node))
6806 : : {
6807 : : /* We need extra two invariants: one that contains the minimum signed
6808 : : value and one that contains half of its negative. */
6809 : 11 : int prologue_stmts = 2;
6810 : 11 : unsigned cost = record_stmt_cost (cost_vec, prologue_stmts,
6811 : : scalar_to_vec, slp_node, 0,
6812 : : vect_prologue);
6813 : 11 : if (dump_enabled_p ())
6814 : 0 : dump_printf (MSG_NOTE, "vectorizable_lane_reducing: "
6815 : : "extra prologue_cost = %d .\n", cost);
6816 : :
6817 : : /* Three dot-products and a subtraction. */
6818 : 11 : ncopies_for_cost *= 4;
6819 : : }
6820 : :
6821 : 462 : record_stmt_cost (cost_vec, (int) ncopies_for_cost, vector_stmt, slp_node,
6822 : : 0, vect_body);
6823 : :
6824 : 462 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
6825 : : {
6826 : 43 : enum tree_code code = gimple_assign_rhs_code (stmt);
6827 : 43 : vect_reduction_update_partial_vector_usage (loop_vinfo, reduc_info,
6828 : 43 : node_in, code, type,
6829 : : vectype_in);
6830 : : }
6831 : :
6832 : : /* Transform via vect_transform_reduction. */
6833 : 462 : SLP_TREE_TYPE (slp_node) = reduc_vec_info_type;
6834 : 462 : return true;
6835 : : }
6836 : :
6837 : : /* Function vectorizable_reduction.
6838 : :
6839 : : Check if STMT_INFO performs a reduction operation that can be vectorized.
6840 : : If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6841 : : stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6842 : : Return true if STMT_INFO is vectorizable in this way.
6843 : :
6844 : : This function also handles reduction idioms (patterns) that have been
6845 : : recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6846 : : may be of this form:
6847 : : X = pattern_expr (arg0, arg1, ..., X)
6848 : : and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6849 : : sequence that had been detected and replaced by the pattern-stmt
6850 : : (STMT_INFO).
6851 : :
6852 : : This function also handles reduction of condition expressions, for example:
6853 : : for (int i = 0; i < N; i++)
6854 : : if (a[i] < value)
6855 : : last = a[i];
6856 : : This is handled by vectorising the loop and creating an additional vector
6857 : : containing the loop indexes for which "a[i] < value" was true. In the
6858 : : function epilogue this is reduced to a single max value and then used to
6859 : : index into the vector of results.
6860 : :
6861 : : In some cases of reduction patterns, the type of the reduction variable X is
6862 : : different than the type of the other arguments of STMT_INFO.
6863 : : In such cases, the vectype that is used when transforming STMT_INFO into
6864 : : a vector stmt is different than the vectype that is used to determine the
6865 : : vectorization factor, because it consists of a different number of elements
6866 : : than the actual number of elements that are being operated upon in parallel.
6867 : :
6868 : : For example, consider an accumulation of shorts into an int accumulator.
6869 : : On some targets it's possible to vectorize this pattern operating on 8
6870 : : shorts at a time (hence, the vectype for purposes of determining the
6871 : : vectorization factor should be V8HI); on the other hand, the vectype that
6872 : : is used to create the vector form is actually V4SI (the type of the result).
6873 : :
6874 : : Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6875 : : indicates what is the actual level of parallelism (V8HI in the example), so
6876 : : that the right vectorization factor would be derived. This vectype
6877 : : corresponds to the type of arguments to the reduction stmt, and should *NOT*
6878 : : be used to create the vectorized stmt. The right vectype for the vectorized
6879 : : stmt is obtained from the type of the result X:
6880 : : get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6881 : :
6882 : : This means that, contrary to "regular" reductions (or "regular" stmts in
6883 : : general), the following equation:
6884 : : STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6885 : : does *NOT* necessarily hold for reduction patterns. */
6886 : :
6887 : : bool
6888 : 305915 : vectorizable_reduction (loop_vec_info loop_vinfo,
6889 : : stmt_vec_info stmt_info, slp_tree slp_node,
6890 : : slp_instance slp_node_instance,
6891 : : stmt_vector_for_cost *cost_vec)
6892 : : {
6893 : 305915 : tree vectype_in = NULL_TREE;
6894 : 305915 : enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6895 : 305915 : stmt_vec_info cond_stmt_vinfo = NULL;
6896 : 305915 : int i;
6897 : 305915 : int ncopies;
6898 : 305915 : bool single_defuse_cycle = false;
6899 : 305915 : tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6900 : 305915 : tree cond_reduc_val = NULL_TREE;
6901 : :
6902 : : /* Make sure it was already recognized as a reduction computation. */
6903 : 305915 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6904 : : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6905 : 305915 : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6906 : : return false;
6907 : :
6908 : : /* The reduction meta. */
6909 : 56972 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
6910 : :
6911 : 56972 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6912 : : {
6913 : 1339 : gcc_assert (is_a <gphi *> (stmt_info->stmt));
6914 : : /* We eventually need to set a vector type on invariant arguments. */
6915 : : unsigned j;
6916 : : slp_tree child;
6917 : 4017 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
6918 : 2678 : if (!vect_maybe_update_slp_op_vectype (child,
6919 : : SLP_TREE_VECTYPE (slp_node)))
6920 : : {
6921 : 0 : if (dump_enabled_p ())
6922 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6923 : : "incompatible vector types for "
6924 : : "invariants\n");
6925 : 0 : return false;
6926 : : }
6927 : : /* Analysis for double-reduction is done on the outer
6928 : : loop PHI, nested cycles have no further restrictions. */
6929 : 1339 : SLP_TREE_TYPE (slp_node) = cycle_phi_info_type;
6930 : 1339 : return true;
6931 : : }
6932 : :
6933 : 55633 : if (!is_a <gphi *> (stmt_info->stmt))
6934 : : {
6935 : 7014 : gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def);
6936 : 7014 : SLP_TREE_TYPE (slp_node) = reduc_vec_info_type;
6937 : 7014 : return true;
6938 : : }
6939 : :
6940 : 48619 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6941 : 48619 : stmt_vec_info phi_info = stmt_info;
6942 : 48619 : bool double_reduc = false;
6943 : 48619 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
6944 : : {
6945 : : /* We arrive here for both the inner loop LC PHI and the
6946 : : outer loop PHI. The latter is what we want to analyze the
6947 : : reduction with. The LC PHI is handled by vectorizable_lc_phi. */
6948 : 266 : if (gimple_bb (stmt_info->stmt) != loop->header)
6949 : 0 : return false;
6950 : :
6951 : : /* Set loop and phi_info to the inner loop. */
6952 : 266 : use_operand_p use_p;
6953 : 266 : gimple *use_stmt;
6954 : 266 : bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6955 : : &use_p, &use_stmt);
6956 : 266 : gcc_assert (res);
6957 : 266 : phi_info = loop_vinfo->lookup_stmt (use_stmt);
6958 : 266 : loop = loop->inner;
6959 : 266 : double_reduc = true;
6960 : : }
6961 : :
6962 : 48619 : const bool reduc_chain = reduc_info->is_reduc_chain;
6963 : 48619 : slp_node_instance->reduc_phis = slp_node;
6964 : : /* ??? We're leaving slp_node to point to the PHIs, we only
6965 : : need it to get at the number of vector stmts which wasn't
6966 : : yet initialized for the instance root. */
6967 : :
6968 : : /* PHIs should not participate in patterns. */
6969 : 48619 : gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6970 : 48619 : gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6971 : :
6972 : : /* Verify following REDUC_IDX from the latch def leads us back to the PHI
6973 : : and compute the reduction chain length. Discover the real
6974 : : reduction operation stmt on the way (slp_for_stmt_info). */
6975 : 48619 : unsigned reduc_chain_length = 0;
6976 : 48619 : stmt_info = NULL;
6977 : 48619 : slp_tree slp_for_stmt_info = NULL;
6978 : 48619 : slp_tree vdef_slp = slp_node_instance->root;
6979 : 107008 : while (vdef_slp != slp_node)
6980 : : {
6981 : 59141 : int reduc_idx = SLP_TREE_REDUC_IDX (vdef_slp);
6982 : 59141 : if (reduc_idx == -1)
6983 : : {
6984 : 744 : if (dump_enabled_p ())
6985 : 7 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6986 : : "reduction chain broken by patterns.\n");
6987 : 752 : return false;
6988 : : }
6989 : 58397 : stmt_vec_info vdef = SLP_TREE_REPRESENTATIVE (vdef_slp);
6990 : 58397 : if (is_a <gphi *> (vdef->stmt))
6991 : : {
6992 : 532 : vdef_slp = SLP_TREE_CHILDREN (vdef_slp)[reduc_idx];
6993 : : /* Do not count PHIs towards the chain length. */
6994 : 532 : continue;
6995 : : }
6996 : 57865 : gimple_match_op op;
6997 : 57865 : if (!gimple_extract_op (vdef->stmt, &op))
6998 : : {
6999 : 0 : if (dump_enabled_p ())
7000 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7001 : : "reduction chain includes unsupported"
7002 : : " statement type.\n");
7003 : 0 : return false;
7004 : : }
7005 : 57865 : if (CONVERT_EXPR_CODE_P (op.code))
7006 : : {
7007 : 3288 : if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
7008 : : {
7009 : 8 : if (dump_enabled_p ())
7010 : 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7011 : : "conversion in the reduction chain.\n");
7012 : 8 : return false;
7013 : : }
7014 : 3280 : vdef_slp = SLP_TREE_CHILDREN (vdef_slp)[0];
7015 : : }
7016 : : else
7017 : : {
7018 : : /* First non-conversion stmt. */
7019 : 54577 : if (!slp_for_stmt_info)
7020 : 47867 : slp_for_stmt_info = vdef_slp;
7021 : :
7022 : 54577 : if (lane_reducing_op_p (op.code))
7023 : : {
7024 : : /* The last operand of lane-reducing operation is for
7025 : : reduction. */
7026 : 462 : gcc_assert (reduc_idx > 0 && reduc_idx == (int) op.num_ops - 1);
7027 : :
7028 : 462 : slp_tree op_node = SLP_TREE_CHILDREN (vdef_slp)[0];
7029 : 462 : tree vectype_op = SLP_TREE_VECTYPE (op_node);
7030 : 462 : tree type_op = TREE_TYPE (op.ops[0]);
7031 : 462 : if (!vectype_op)
7032 : : {
7033 : 9 : vectype_op = get_vectype_for_scalar_type (loop_vinfo,
7034 : : type_op);
7035 : 9 : if (!vectype_op
7036 : 9 : || !vect_maybe_update_slp_op_vectype (op_node,
7037 : : vectype_op))
7038 : 0 : return false;
7039 : : }
7040 : :
7041 : : /* To accommodate lane-reducing operations of mixed input
7042 : : vectypes, choose input vectype with the least lanes for the
7043 : : reduction PHI statement, which would result in the most
7044 : : ncopies for vectorized reduction results. */
7045 : 462 : if (!vectype_in
7046 : 462 : || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7047 : 46 : < GET_MODE_SIZE (SCALAR_TYPE_MODE (type_op))))
7048 : 439 : vectype_in = vectype_op;
7049 : : }
7050 : 54115 : else if (!vectype_in)
7051 : 47428 : vectype_in = SLP_TREE_VECTYPE (slp_node);
7052 : 54577 : vdef_slp = SLP_TREE_CHILDREN (vdef_slp)[reduc_idx];
7053 : : }
7054 : 57857 : reduc_chain_length++;
7055 : : }
7056 : 47867 : stmt_info = SLP_TREE_REPRESENTATIVE (slp_for_stmt_info);
7057 : :
7058 : : /* PHIs should not participate in patterns. */
7059 : 47867 : gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7060 : :
7061 : : /* 1. Is vectorizable reduction? */
7062 : : /* Not supportable if the reduction variable is used in the loop, unless
7063 : : it's a reduction chain. */
7064 : 47867 : if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
7065 : 0 : && !reduc_chain)
7066 : : return false;
7067 : :
7068 : : /* Reductions that are not used even in an enclosing outer-loop,
7069 : : are expected to be "live" (used out of the loop). */
7070 : 47867 : if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
7071 : 0 : && !STMT_VINFO_LIVE_P (stmt_info))
7072 : : return false;
7073 : :
7074 : : /* 2. Has this been recognized as a reduction pattern?
7075 : :
7076 : : Check if STMT represents a pattern that has been recognized
7077 : : in earlier analysis stages. For stmts that represent a pattern,
7078 : : the STMT_VINFO_RELATED_STMT field records the last stmt in
7079 : : the original sequence that constitutes the pattern. */
7080 : :
7081 : 47867 : stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7082 : 47867 : if (orig_stmt_info)
7083 : : {
7084 : 3253 : gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
7085 : 3253 : gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
7086 : : }
7087 : :
7088 : : /* 3. Check the operands of the operation. The first operands are defined
7089 : : inside the loop body. The last operand is the reduction variable,
7090 : : which is defined by the loop-header-phi. */
7091 : :
7092 : 47867 : tree vectype_out = SLP_TREE_VECTYPE (slp_for_stmt_info);
7093 : 47867 : VECT_REDUC_INFO_VECTYPE (reduc_info) = vectype_out;
7094 : :
7095 : 47867 : gimple_match_op op;
7096 : 47867 : if (!gimple_extract_op (stmt_info->stmt, &op))
7097 : 0 : gcc_unreachable ();
7098 : 47867 : bool lane_reducing = lane_reducing_op_p (op.code);
7099 : :
7100 : 47867 : if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7101 : 15140 : && !SCALAR_FLOAT_TYPE_P (op.type))
7102 : : return false;
7103 : :
7104 : : /* Do not try to vectorize bit-precision reductions. */
7105 : 47867 : if (!type_has_mode_precision_p (op.type)
7106 : 1471 : && op.code != BIT_AND_EXPR
7107 : 1411 : && op.code != BIT_IOR_EXPR
7108 : 48302 : && op.code != BIT_XOR_EXPR)
7109 : : return false;
7110 : :
7111 : : /* Lane-reducing ops also never can be used in a SLP reduction group
7112 : : since we'll mix lanes belonging to different reductions. But it's
7113 : : OK to use them in a reduction chain or when the reduction group
7114 : : has just one element. */
7115 : 47557 : if (lane_reducing
7116 : 47557 : && !reduc_chain
7117 : 412 : && SLP_TREE_LANES (slp_node) > 1)
7118 : : {
7119 : 0 : if (dump_enabled_p ())
7120 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7121 : : "lane-reducing reduction in reduction group.\n");
7122 : 0 : return false;
7123 : : }
7124 : :
7125 : : /* All uses but the last are expected to be defined in the loop.
7126 : : The last use is the reduction variable. In case of nested cycle this
7127 : : assumption is not true: we use reduc_index to record the index of the
7128 : : reduction variable. */
7129 : 47557 : slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7130 : 47557 : tree *vectype_op = XALLOCAVEC (tree, op.num_ops);
7131 : 47557 : gcc_assert (op.code != COND_EXPR || !COMPARISON_CLASS_P (op.ops[0]));
7132 : 151125 : for (i = 0; i < (int) op.num_ops; i++)
7133 : : {
7134 : : /* The condition of COND_EXPR is checked in vectorizable_condition(). */
7135 : 103568 : if (i == 0 && op.code == COND_EXPR)
7136 : 51960 : continue;
7137 : :
7138 : 102769 : stmt_vec_info def_stmt_info;
7139 : 102769 : enum vect_def_type dt;
7140 : 102769 : if (!vect_is_simple_use (loop_vinfo, slp_for_stmt_info,
7141 : : i, &op.ops[i], &slp_op[i], &dt,
7142 : 102769 : &vectype_op[i], &def_stmt_info))
7143 : : {
7144 : 0 : if (dump_enabled_p ())
7145 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7146 : : "use not simple.\n");
7147 : 0 : return false;
7148 : : }
7149 : :
7150 : : /* Skip reduction operands, and for an IFN_COND_OP we might hit the
7151 : : reduction operand twice (once as definition, once as else). */
7152 : 102769 : if (SLP_TREE_CHILDREN (slp_for_stmt_info)[i]
7153 : 205538 : == SLP_TREE_CHILDREN
7154 : 102769 : (slp_for_stmt_info)[SLP_TREE_REDUC_IDX (slp_for_stmt_info)])
7155 : 51161 : continue;
7156 : :
7157 : : /* There should be only one cycle def in the stmt, the one
7158 : : leading to reduc_def. */
7159 : 51608 : if (SLP_TREE_CHILDREN (slp_for_stmt_info)[i]->cycle_info.id != -1)
7160 : : return false;
7161 : :
7162 : 51608 : if (!vectype_op[i])
7163 : 4431 : vectype_op[i]
7164 : 4431 : = get_vectype_for_scalar_type (loop_vinfo,
7165 : 4431 : TREE_TYPE (op.ops[i]), slp_op[i]);
7166 : :
7167 : : /* Record how the non-reduction-def value of COND_EXPR is defined.
7168 : : ??? For a chain of multiple CONDs we'd have to match them up all. */
7169 : 51608 : if (op.code == COND_EXPR && reduc_chain_length == 1)
7170 : : {
7171 : 772 : if (dt == vect_constant_def)
7172 : : {
7173 : 95 : cond_reduc_dt = dt;
7174 : 95 : cond_reduc_val = op.ops[i];
7175 : : }
7176 : 677 : else if (dt == vect_induction_def
7177 : 411 : && def_stmt_info
7178 : 1088 : && is_nonwrapping_integer_induction (def_stmt_info, loop))
7179 : : {
7180 : 112 : cond_reduc_dt = dt;
7181 : 112 : cond_stmt_vinfo = def_stmt_info;
7182 : : }
7183 : : }
7184 : : }
7185 : :
7186 : 47557 : enum vect_reduction_type reduction_type = VECT_REDUC_INFO_TYPE (reduc_info);
7187 : : /* If we have a condition reduction, see if we can simplify it further. */
7188 : 47557 : if (reduction_type == COND_REDUCTION)
7189 : : {
7190 : 787 : if (SLP_TREE_LANES (slp_node) != 1)
7191 : : return false;
7192 : :
7193 : : /* When the condition uses the reduction value in the condition, fail. */
7194 : 763 : if (SLP_TREE_REDUC_IDX (slp_node) == 0)
7195 : : {
7196 : 0 : if (dump_enabled_p ())
7197 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7198 : : "condition depends on previous iteration\n");
7199 : 0 : return false;
7200 : : }
7201 : :
7202 : 763 : if (reduc_chain_length == 1
7203 : 763 : && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in,
7204 : : OPTIMIZE_FOR_SPEED)
7205 : 736 : || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
7206 : : vectype_in,
7207 : : OPTIMIZE_FOR_SPEED)))
7208 : : {
7209 : 0 : if (dump_enabled_p ())
7210 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7211 : : "optimizing condition reduction with"
7212 : : " FOLD_EXTRACT_LAST.\n");
7213 : 0 : VECT_REDUC_INFO_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7214 : : }
7215 : 763 : else if (cond_reduc_dt == vect_induction_def)
7216 : : {
7217 : 112 : tree base
7218 : : = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7219 : 112 : tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7220 : :
7221 : 112 : gcc_assert (TREE_CODE (base) == INTEGER_CST
7222 : : && TREE_CODE (step) == INTEGER_CST);
7223 : 112 : cond_reduc_val = NULL_TREE;
7224 : 112 : enum tree_code cond_reduc_op_code = ERROR_MARK;
7225 : 112 : tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7226 : 112 : if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7227 : : ;
7228 : : /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
7229 : : above base; punt if base is the minimum value of the type for
7230 : : MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
7231 : 100 : else if (tree_int_cst_sgn (step) == -1)
7232 : : {
7233 : 20 : cond_reduc_op_code = MIN_EXPR;
7234 : 20 : if (tree_int_cst_sgn (base) == -1)
7235 : 0 : cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7236 : 20 : else if (tree_int_cst_lt (base,
7237 : 20 : TYPE_MAX_VALUE (TREE_TYPE (base))))
7238 : 20 : cond_reduc_val
7239 : 20 : = int_const_binop (PLUS_EXPR, base, integer_one_node);
7240 : : }
7241 : : else
7242 : : {
7243 : 80 : cond_reduc_op_code = MAX_EXPR;
7244 : 80 : if (tree_int_cst_sgn (base) == 1)
7245 : 0 : cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7246 : 80 : else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7247 : : base))
7248 : 80 : cond_reduc_val
7249 : 80 : = int_const_binop (MINUS_EXPR, base, integer_one_node);
7250 : : }
7251 : 100 : if (cond_reduc_val)
7252 : : {
7253 : 100 : if (dump_enabled_p ())
7254 : 64 : dump_printf_loc (MSG_NOTE, vect_location,
7255 : : "condition expression based on "
7256 : : "integer induction.\n");
7257 : 100 : VECT_REDUC_INFO_CODE (reduc_info) = cond_reduc_op_code;
7258 : 100 : VECT_REDUC_INFO_INDUC_COND_INITIAL_VAL (reduc_info)
7259 : 100 : = cond_reduc_val;
7260 : 100 : VECT_REDUC_INFO_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7261 : : }
7262 : : }
7263 : 651 : else if (cond_reduc_dt == vect_constant_def)
7264 : : {
7265 : 85 : enum vect_def_type cond_initial_dt;
7266 : 85 : tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7267 : 85 : vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7268 : 85 : if (cond_initial_dt == vect_constant_def
7269 : 107 : && types_compatible_p (TREE_TYPE (cond_initial_val),
7270 : 22 : TREE_TYPE (cond_reduc_val)))
7271 : : {
7272 : 22 : tree e = fold_binary (LE_EXPR, boolean_type_node,
7273 : : cond_initial_val, cond_reduc_val);
7274 : 22 : if (e && (integer_onep (e) || integer_zerop (e)))
7275 : : {
7276 : 22 : if (dump_enabled_p ())
7277 : 16 : dump_printf_loc (MSG_NOTE, vect_location,
7278 : : "condition expression based on "
7279 : : "compile time constant.\n");
7280 : : /* Record reduction code at analysis stage. */
7281 : 22 : VECT_REDUC_INFO_CODE (reduc_info)
7282 : 22 : = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7283 : 22 : VECT_REDUC_INFO_TYPE (reduc_info) = CONST_COND_REDUCTION;
7284 : : }
7285 : : }
7286 : : }
7287 : : }
7288 : :
7289 : 47533 : if (STMT_VINFO_LIVE_P (phi_info))
7290 : : return false;
7291 : :
7292 : 47533 : ncopies = vect_get_num_copies (loop_vinfo, slp_node);
7293 : :
7294 : 47533 : gcc_assert (ncopies >= 1);
7295 : :
7296 : 47533 : poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7297 : :
7298 : : /* 4.2. Check support for the epilog operation.
7299 : :
7300 : : If STMT represents a reduction pattern, then the type of the
7301 : : reduction variable may be different than the type of the rest
7302 : : of the arguments. For example, consider the case of accumulation
7303 : : of shorts into an int accumulator; The original code:
7304 : : S1: int_a = (int) short_a;
7305 : : orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
7306 : :
7307 : : was replaced with:
7308 : : STMT: int_acc = widen_sum <short_a, int_acc>
7309 : :
7310 : : This means that:
7311 : : 1. The tree-code that is used to create the vector operation in the
7312 : : epilog code (that reduces the partial results) is not the
7313 : : tree-code of STMT, but is rather the tree-code of the original
7314 : : stmt from the pattern that STMT is replacing. I.e, in the example
7315 : : above we want to use 'widen_sum' in the loop, but 'plus' in the
7316 : : epilog.
7317 : : 2. The type (mode) we use to check available target support
7318 : : for the vector operation to be created in the *epilog*, is
7319 : : determined by the type of the reduction variable (in the example
7320 : : above we'd check this: optab_handler (plus_optab, vect_int_mode])).
7321 : : However the type (mode) we use to check available target support
7322 : : for the vector operation to be created *inside the loop*, is
7323 : : determined by the type of the other arguments to STMT (in the
7324 : : example we'd check this: optab_handler (widen_sum_optab,
7325 : : vect_short_mode)).
7326 : :
7327 : : This is contrary to "regular" reductions, in which the types of all
7328 : : the arguments are the same as the type of the reduction variable.
7329 : : For "regular" reductions we can therefore use the same vector type
7330 : : (and also the same tree-code) when generating the epilog code and
7331 : : when generating the code inside the loop. */
7332 : :
7333 : 47533 : code_helper orig_code = VECT_REDUC_INFO_CODE (reduc_info);
7334 : :
7335 : : /* If conversion might have created a conditional operation like
7336 : : IFN_COND_ADD already. Use the internal code for the following checks. */
7337 : 47533 : if (orig_code.is_internal_fn ())
7338 : : {
7339 : 3660 : tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
7340 : 3660 : orig_code = new_code != ERROR_MARK ? new_code : orig_code;
7341 : : }
7342 : :
7343 : 47533 : VECT_REDUC_INFO_CODE (reduc_info) = orig_code;
7344 : :
7345 : 47533 : reduction_type = VECT_REDUC_INFO_TYPE (reduc_info);
7346 : 47533 : if (reduction_type == TREE_CODE_REDUCTION)
7347 : : {
7348 : : /* Check whether it's ok to change the order of the computation.
7349 : : Generally, when vectorizing a reduction we change the order of the
7350 : : computation. This may change the behavior of the program in some
7351 : : cases, so we need to check that this is ok. One exception is when
7352 : : vectorizing an outer-loop: the inner-loop is executed sequentially,
7353 : : and therefore vectorizing reductions in the inner-loop during
7354 : : outer-loop vectorization is safe. Likewise when we are vectorizing
7355 : : a series of reductions using SLP and the VF is one the reductions
7356 : : are performed in scalar order. */
7357 : 46770 : if (!reduc_chain
7358 : 46770 : && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7359 : : ;
7360 : 46632 : else if (needs_fold_left_reduction_p (op.type, orig_code))
7361 : : {
7362 : : /* When vectorizing a reduction chain w/o SLP the reduction PHI
7363 : : is not directy used in stmt. */
7364 : 4793 : if (reduc_chain_length != 1)
7365 : : {
7366 : 67 : if (dump_enabled_p ())
7367 : 12 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7368 : : "in-order reduction chain without SLP.\n");
7369 : 67 : return false;
7370 : : }
7371 : : /* Code generation doesn't support function calls other
7372 : : than .COND_*. */
7373 : 4726 : if (!op.code.is_tree_code ()
7374 : 4840 : && !(op.code.is_internal_fn ()
7375 : 57 : && conditional_internal_fn_code (internal_fn (op.code))
7376 : : != ERROR_MARK))
7377 : : {
7378 : 10 : if (dump_enabled_p ())
7379 : 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7380 : : "in-order reduction chain operation not "
7381 : : "supported.\n");
7382 : 10 : return false;
7383 : : }
7384 : 4716 : VECT_REDUC_INFO_TYPE (reduc_info)
7385 : 4716 : = reduction_type = FOLD_LEFT_REDUCTION;
7386 : : }
7387 : 41839 : else if (!commutative_binary_op_p (orig_code, op.type)
7388 : 41839 : || !associative_binary_op_p (orig_code, op.type))
7389 : : {
7390 : 152 : if (dump_enabled_p ())
7391 : 20 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7392 : : "reduction: not commutative/associative\n");
7393 : 152 : return false;
7394 : : }
7395 : : }
7396 : :
7397 : 4716 : if ((reduction_type == COND_REDUCTION
7398 : : || reduction_type == INTEGER_INDUC_COND_REDUCTION
7399 : : || reduction_type == CONST_COND_REDUCTION
7400 : 42588 : || reduction_type == EXTRACT_LAST_REDUCTION)
7401 : : && 1
7402 : 763 : && ncopies > 1)
7403 : : {
7404 : 276 : if (dump_enabled_p ())
7405 : 60 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7406 : : "multiple types in condition reduction.\n");
7407 : 276 : return false;
7408 : : }
7409 : :
7410 : : /* See if we can convert a mask vector to a corresponding bool data vector
7411 : : to perform the epilogue reduction. */
7412 : 47028 : tree alt_vectype_out = NULL_TREE;
7413 : 47028 : if (VECTOR_BOOLEAN_TYPE_P (vectype_out))
7414 : : {
7415 : 938 : alt_vectype_out
7416 : 1876 : = get_related_vectype_for_scalar_type (loop_vinfo->vector_mode,
7417 : 938 : TREE_TYPE (vectype_out),
7418 : : TYPE_VECTOR_SUBPARTS
7419 : : (vectype_out));
7420 : 938 : if (!alt_vectype_out
7421 : 938 : || maybe_ne (TYPE_VECTOR_SUBPARTS (alt_vectype_out),
7422 : 1861 : TYPE_VECTOR_SUBPARTS (vectype_out))
7423 : 1876 : || !expand_vec_cond_expr_p (alt_vectype_out, vectype_out))
7424 : 15 : alt_vectype_out = NULL_TREE;
7425 : : }
7426 : :
7427 : 47028 : internal_fn reduc_fn = IFN_LAST;
7428 : 47028 : if (reduction_type == TREE_CODE_REDUCTION
7429 : 47028 : || reduction_type == FOLD_LEFT_REDUCTION
7430 : : || reduction_type == INTEGER_INDUC_COND_REDUCTION
7431 : 487 : || reduction_type == CONST_COND_REDUCTION)
7432 : : {
7433 : 41939 : if (reduction_type == FOLD_LEFT_REDUCTION
7434 : 50581 : ? fold_left_reduction_fn (orig_code, &reduc_fn)
7435 : 41939 : : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7436 : : {
7437 : 45979 : internal_fn sbool_fn = IFN_LAST;
7438 : 45979 : if (reduc_fn == IFN_LAST)
7439 : : ;
7440 : 44161 : else if ((!VECTOR_BOOLEAN_TYPE_P (vectype_out)
7441 : 938 : || (GET_MODE_CLASS (TYPE_MODE (vectype_out))
7442 : : == MODE_VECTOR_BOOL))
7443 : 87384 : && direct_internal_fn_supported_p (reduc_fn, vectype_out,
7444 : : OPTIMIZE_FOR_SPEED))
7445 : : ;
7446 : 10098 : else if (VECTOR_BOOLEAN_TYPE_P (vectype_out)
7447 : 938 : && sbool_reduction_fn_for_fn (reduc_fn, &sbool_fn)
7448 : 11036 : && direct_internal_fn_supported_p (sbool_fn, vectype_out,
7449 : : OPTIMIZE_FOR_SPEED))
7450 : 59 : reduc_fn = sbool_fn;
7451 : 10039 : else if (reduction_type != FOLD_LEFT_REDUCTION
7452 : 10039 : && alt_vectype_out
7453 : 10039 : && direct_internal_fn_supported_p (reduc_fn, alt_vectype_out,
7454 : : OPTIMIZE_FOR_SPEED))
7455 : 714 : VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info) = alt_vectype_out;
7456 : : else
7457 : : {
7458 : 9325 : if (dump_enabled_p ())
7459 : 800 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7460 : : "reduc op not supported by target.\n");
7461 : :
7462 : 9325 : reduc_fn = IFN_LAST;
7463 : : }
7464 : : }
7465 : : else
7466 : : {
7467 : 676 : if (dump_enabled_p ())
7468 : 48 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7469 : : "no reduc code for scalar code.\n");
7470 : :
7471 : 676 : return false;
7472 : : }
7473 : 45979 : if (reduc_fn == IFN_LAST
7474 : 45979 : && VECTOR_BOOLEAN_TYPE_P (vectype_out))
7475 : : {
7476 : 165 : if (!alt_vectype_out)
7477 : : {
7478 : 8 : if (dump_enabled_p ())
7479 : 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7480 : : "cannot turn mask into bool data vector for "
7481 : : "reduction epilogue.\n");
7482 : 8 : return false;
7483 : : }
7484 : 157 : VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info) = alt_vectype_out;
7485 : : }
7486 : : }
7487 : 373 : else if (reduction_type == COND_REDUCTION)
7488 : : {
7489 : 373 : int scalar_precision
7490 : 373 : = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
7491 : 373 : cr_index_scalar_type = make_unsigned_type (scalar_precision);
7492 : 373 : cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7493 : : vectype_out);
7494 : :
7495 : 373 : if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7496 : : OPTIMIZE_FOR_SPEED))
7497 : 8 : reduc_fn = IFN_REDUC_MAX;
7498 : : }
7499 : 46344 : VECT_REDUC_INFO_FN (reduc_info) = reduc_fn;
7500 : :
7501 : 46344 : if (reduction_type != EXTRACT_LAST_REDUCTION
7502 : : && reduc_fn == IFN_LAST
7503 : : && !nunits_out.is_constant ())
7504 : : {
7505 : : if (dump_enabled_p ())
7506 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7507 : : "missing target support for reduction on"
7508 : : " variable-length vectors.\n");
7509 : : return false;
7510 : : }
7511 : :
7512 : : /* For SLP reductions, see if there is a neutral value we can use. */
7513 : 46344 : tree neutral_op = NULL_TREE;
7514 : 46344 : tree initial_value = NULL_TREE;
7515 : 46344 : if (reduc_chain)
7516 : 1366 : initial_value = vect_phi_initial_value (reduc_def_phi);
7517 : 46344 : neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7518 : : orig_code, initial_value);
7519 : :
7520 : 46344 : if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7521 : : {
7522 : : /* We can't support in-order reductions of code such as this:
7523 : :
7524 : : for (int i = 0; i < n1; ++i)
7525 : : for (int j = 0; j < n2; ++j)
7526 : : l += a[j];
7527 : :
7528 : : since GCC effectively transforms the loop when vectorizing:
7529 : :
7530 : : for (int i = 0; i < n1 / VF; ++i)
7531 : : for (int j = 0; j < n2; ++j)
7532 : : for (int k = 0; k < VF; ++k)
7533 : : l += a[j];
7534 : :
7535 : : which is a reassociation of the original operation. */
7536 : 56 : if (dump_enabled_p ())
7537 : 20 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7538 : : "in-order double reduction not supported.\n");
7539 : :
7540 : 56 : return false;
7541 : : }
7542 : :
7543 : 46288 : if (reduction_type == FOLD_LEFT_REDUCTION
7544 : 3984 : && SLP_TREE_LANES (slp_node) > 1
7545 : 119 : && !reduc_chain)
7546 : : {
7547 : : /* We cannot use in-order reductions in this case because there is
7548 : : an implicit reassociation of the operations involved. */
7549 : 57 : if (dump_enabled_p ())
7550 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7551 : : "in-order unchained SLP reductions not supported.\n");
7552 : 57 : return false;
7553 : : }
7554 : :
7555 : : /* For double reductions, and for SLP reductions with a neutral value,
7556 : : we construct a variable-length initial vector by loading a vector
7557 : : full of the neutral value and then shift-and-inserting the start
7558 : : values into the low-numbered elements. */
7559 : 46231 : if ((double_reduc || neutral_op)
7560 : : && !nunits_out.is_constant ()
7561 : : && (SLP_TREE_LANES (slp_node) != 1 && !reduc_chain)
7562 : : && (!neutral_op
7563 : : || !operand_equal_p (neutral_op,
7564 : : vect_phi_initial_value (reduc_def_phi)))
7565 : : && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7566 : : vectype_out, OPTIMIZE_FOR_SPEED))
7567 : : {
7568 : : if (dump_enabled_p ())
7569 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7570 : : "reduction on variable-length vectors requires"
7571 : : " target support for a vector-shift-and-insert"
7572 : : " operation.\n");
7573 : : return false;
7574 : : }
7575 : :
7576 : : /* Check extra constraints for variable-length unchained SLP reductions. */
7577 : 46231 : if (!reduc_chain
7578 : : && !nunits_out.is_constant ())
7579 : : {
7580 : : /* We checked above that we could build the initial vector when
7581 : : there's a neutral element value. Check here for the case in
7582 : : which each SLP statement has its own initial value and in which
7583 : : that value needs to be repeated for every instance of the
7584 : : statement within the initial vector. */
7585 : : unsigned int group_size = SLP_TREE_LANES (slp_node);
7586 : : if (!neutral_op
7587 : : && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
7588 : : TREE_TYPE (vectype_out)))
7589 : : {
7590 : : if (dump_enabled_p ())
7591 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7592 : : "unsupported form of SLP reduction for"
7593 : : " variable-length vectors: cannot build"
7594 : : " initial vector.\n");
7595 : : return false;
7596 : : }
7597 : : /* The epilogue code relies on the number of elements being a multiple
7598 : : of the group size. The duplicate-and-interleave approach to setting
7599 : : up the initial vector does too. */
7600 : : if (!multiple_p (nunits_out, group_size))
7601 : : {
7602 : : if (dump_enabled_p ())
7603 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7604 : : "unsupported form of SLP reduction for"
7605 : : " variable-length vectors: the vector size"
7606 : : " is not a multiple of the number of results.\n");
7607 : : return false;
7608 : : }
7609 : : }
7610 : :
7611 : 46231 : if (reduction_type == COND_REDUCTION)
7612 : : {
7613 : 373 : widest_int ni;
7614 : :
7615 : 373 : if (! max_loop_iterations (loop, &ni))
7616 : : {
7617 : 0 : if (dump_enabled_p ())
7618 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
7619 : : "loop count not known, cannot create cond "
7620 : : "reduction.\n");
7621 : 0 : return false;
7622 : : }
7623 : : /* Convert backedges to iterations. */
7624 : 373 : ni += 1;
7625 : :
7626 : : /* The additional index will be the same type as the condition. Check
7627 : : that the loop can fit into this less one (because we'll use up the
7628 : : zero slot for when there are no matches). */
7629 : 373 : tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7630 : 373 : if (wi::geu_p (ni, wi::to_widest (max_index)))
7631 : : {
7632 : 90 : if (dump_enabled_p ())
7633 : 54 : dump_printf_loc (MSG_NOTE, vect_location,
7634 : : "loop size is greater than data size.\n");
7635 : 90 : return false;
7636 : : }
7637 : 373 : }
7638 : :
7639 : : /* In case the vectorization factor (VF) is bigger than the number
7640 : : of elements that we can fit in a vectype (nunits), we have to generate
7641 : : more than one vector stmt - i.e - we need to "unroll" the
7642 : : vector stmt by a factor VF/nunits. For more details see documentation
7643 : : in vectorizable_operation. */
7644 : :
7645 : : /* If the reduction is used in an outer loop we need to generate
7646 : : VF intermediate results, like so (e.g. for ncopies=2):
7647 : : r0 = phi (init, r0)
7648 : : r1 = phi (init, r1)
7649 : : r0 = x0 + r0;
7650 : : r1 = x1 + r1;
7651 : : (i.e. we generate VF results in 2 registers).
7652 : : In this case we have a separate def-use cycle for each copy, and therefore
7653 : : for each copy we get the vector def for the reduction variable from the
7654 : : respective phi node created for this copy.
7655 : :
7656 : : Otherwise (the reduction is unused in the loop nest), we can combine
7657 : : together intermediate results, like so (e.g. for ncopies=2):
7658 : : r = phi (init, r)
7659 : : r = x0 + r;
7660 : : r = x1 + r;
7661 : : (i.e. we generate VF/2 results in a single register).
7662 : : In this case for each copy we get the vector def for the reduction variable
7663 : : from the vectorized reduction operation generated in the previous iteration.
7664 : :
7665 : : This only works when we see both the reduction PHI and its only consumer
7666 : : in vectorizable_reduction and there are no intermediate stmts
7667 : : participating. When unrolling we want each unrolled iteration to have its
7668 : : own reduction accumulator since one of the main goals of unrolling a
7669 : : reduction is to reduce the aggregate loop-carried latency. */
7670 : 46141 : if (ncopies > 1
7671 : 46141 : && !reduc_chain
7672 : 5383 : && SLP_TREE_LANES (slp_node) == 1
7673 : 5237 : && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7674 : 5218 : && reduc_chain_length == 1
7675 : 4917 : && loop_vinfo->suggested_unroll_factor == 1)
7676 : 46141 : single_defuse_cycle = true;
7677 : :
7678 : 46141 : if (single_defuse_cycle && !lane_reducing)
7679 : : {
7680 : 4342 : gcc_assert (op.code != COND_EXPR);
7681 : :
7682 : : /* 4. check support for the operation in the loop
7683 : :
7684 : : This isn't necessary for the lane reduction codes, since they
7685 : : can only be produced by pattern matching, and it's up to the
7686 : : pattern matcher to test for support. The main reason for
7687 : : specifically skipping this step is to avoid rechecking whether
7688 : : mixed-sign dot-products can be implemented using signed
7689 : : dot-products. */
7690 : 4342 : machine_mode vec_mode = TYPE_MODE (vectype_in);
7691 : 4342 : if (!directly_supported_p (op.code, vectype_in, optab_vector))
7692 : : {
7693 : 701 : if (dump_enabled_p ())
7694 : 10 : dump_printf (MSG_NOTE, "op not supported by target.\n");
7695 : 1402 : if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7696 : 701 : || !vect_can_vectorize_without_simd_p (op.code))
7697 : : single_defuse_cycle = false;
7698 : : else
7699 : 5 : if (dump_enabled_p ())
7700 : 0 : dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7701 : : }
7702 : :
7703 : 4342 : if (vect_emulated_vector_p (vectype_in)
7704 : 4342 : && !vect_can_vectorize_without_simd_p (op.code))
7705 : : {
7706 : 0 : if (dump_enabled_p ())
7707 : 0 : dump_printf (MSG_NOTE, "using word mode not possible.\n");
7708 : 0 : return false;
7709 : : }
7710 : : }
7711 : 46141 : if (dump_enabled_p () && single_defuse_cycle)
7712 : 636 : dump_printf_loc (MSG_NOTE, vect_location,
7713 : : "using single def-use cycle for reduction by reducing "
7714 : : "multiple vectors to one in the loop body\n");
7715 : 46141 : VECT_REDUC_INFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7716 : :
7717 : : /* For lane-reducing operation, the below processing related to single
7718 : : defuse-cycle will be done in its own vectorizable function. One more
7719 : : thing to note is that the operation must not be involved in fold-left
7720 : : reduction. */
7721 : 46141 : single_defuse_cycle &= !lane_reducing;
7722 : :
7723 : 46141 : if (single_defuse_cycle || reduction_type == FOLD_LEFT_REDUCTION)
7724 : 24416 : for (i = 0; i < (int) op.num_ops; i++)
7725 : 16900 : if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
7726 : : {
7727 : 0 : if (dump_enabled_p ())
7728 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7729 : : "incompatible vector types for invariants\n");
7730 : 0 : return false;
7731 : : }
7732 : :
7733 : 46141 : vect_model_reduction_cost (loop_vinfo, slp_for_stmt_info, reduc_fn,
7734 : : reduction_type, ncopies, cost_vec);
7735 : : /* Cost the reduction op inside the loop if transformed via
7736 : : vect_transform_reduction for non-lane-reducing operation. Otherwise
7737 : : this is costed by the separate vectorizable_* routines. */
7738 : 46141 : if (single_defuse_cycle)
7739 : 3646 : record_stmt_cost (cost_vec, ncopies, vector_stmt,
7740 : : slp_for_stmt_info, 0, vect_body);
7741 : :
7742 : 46141 : if (dump_enabled_p ()
7743 : 46141 : && reduction_type == FOLD_LEFT_REDUCTION)
7744 : 212 : dump_printf_loc (MSG_NOTE, vect_location,
7745 : : "using an in-order (fold-left) reduction.\n");
7746 : 46141 : SLP_TREE_TYPE (slp_node) = cycle_phi_info_type;
7747 : :
7748 : : /* All but single defuse-cycle optimized and fold-left reductions go
7749 : : through their own vectorizable_* routines. */
7750 : 46141 : stmt_vec_info tem
7751 : 46141 : = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (slp_node_instance));
7752 : 46141 : if (!single_defuse_cycle && reduction_type != FOLD_LEFT_REDUCTION)
7753 : 38625 : STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7754 : : else
7755 : : {
7756 : 7516 : STMT_VINFO_DEF_TYPE (tem) = vect_reduction_def;
7757 : 7516 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7758 : 3349 : vect_reduction_update_partial_vector_usage (loop_vinfo, reduc_info,
7759 : : slp_node, op.code, op.type,
7760 : : vectype_in);
7761 : : }
7762 : : return true;
7763 : : }
7764 : :
7765 : : /* STMT_INFO is a dot-product reduction whose multiplication operands
7766 : : have different signs. Emit a sequence to emulate the operation
7767 : : using a series of signed DOT_PROD_EXPRs and return the last
7768 : : statement generated. VEC_DEST is the result of the vector operation
7769 : : and VOP lists its inputs. */
7770 : :
7771 : : static gassign *
7772 : 4 : vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
7773 : : gimple_stmt_iterator *gsi, tree vec_dest,
7774 : : tree vop[3])
7775 : : {
7776 : 4 : tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
7777 : 4 : tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
7778 : 4 : tree narrow_elttype = TREE_TYPE (narrow_vectype);
7779 : 4 : gimple *new_stmt;
7780 : :
7781 : : /* Make VOP[0] the unsigned operand VOP[1] the signed operand. */
7782 : 4 : if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
7783 : 0 : std::swap (vop[0], vop[1]);
7784 : :
7785 : : /* Convert all inputs to signed types. */
7786 : 16 : for (int i = 0; i < 3; ++i)
7787 : 12 : if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
7788 : : {
7789 : 4 : tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
7790 : 4 : new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
7791 : 4 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7792 : 4 : vop[i] = tmp;
7793 : : }
7794 : :
7795 : : /* In the comments below we assume 8-bit inputs for simplicity,
7796 : : but the approach works for any full integer type. */
7797 : :
7798 : : /* Create a vector of -128. */
7799 : 4 : tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
7800 : 4 : tree min_narrow = build_vector_from_val (narrow_vectype,
7801 : : min_narrow_elttype);
7802 : :
7803 : : /* Create a vector of 64. */
7804 : 4 : auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
7805 : 4 : tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
7806 : 4 : half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
7807 : :
7808 : : /* Emit: SUB_RES = VOP[0] - 128. */
7809 : 4 : tree sub_res = make_ssa_name (narrow_vectype);
7810 : 4 : new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
7811 : 4 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7812 : :
7813 : : /* Emit:
7814 : :
7815 : : STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
7816 : : STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
7817 : : STAGE3 = DOT_PROD_EXPR <SUB_RES, -128, STAGE2>;
7818 : :
7819 : : on the basis that x * y == (x - 128) * y + 64 * y + 64 * y
7820 : : Doing the two 64 * y steps first allows more time to compute x. */
7821 : 4 : tree stage1 = make_ssa_name (wide_vectype);
7822 : 4 : new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
7823 : : vop[1], half_narrow, vop[2]);
7824 : 4 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7825 : :
7826 : 4 : tree stage2 = make_ssa_name (wide_vectype);
7827 : 4 : new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
7828 : : vop[1], half_narrow, stage1);
7829 : 4 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7830 : :
7831 : 4 : tree stage3 = make_ssa_name (wide_vectype);
7832 : 4 : new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
7833 : : sub_res, vop[1], stage2);
7834 : 4 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7835 : :
7836 : : /* Convert STAGE3 to the reduction type. */
7837 : 4 : return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
7838 : 4 : }
7839 : :
7840 : : /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7841 : : value. */
7842 : :
7843 : : bool
7844 : 2547 : vect_transform_reduction (loop_vec_info loop_vinfo,
7845 : : stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
7846 : : slp_tree slp_node)
7847 : : {
7848 : 2547 : tree vectype_out = SLP_TREE_VECTYPE (slp_node);
7849 : 2547 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7850 : 2547 : unsigned vec_num;
7851 : :
7852 : 2547 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
7853 : :
7854 : 2547 : if (nested_in_vect_loop_p (loop, stmt_info))
7855 : : {
7856 : 0 : loop = loop->inner;
7857 : 0 : gcc_assert (VECT_REDUC_INFO_DEF_TYPE (reduc_info)
7858 : : == vect_double_reduction_def);
7859 : : }
7860 : :
7861 : 2547 : gimple_match_op op;
7862 : 2547 : if (!gimple_extract_op (stmt_info->stmt, &op))
7863 : 0 : gcc_unreachable ();
7864 : :
7865 : : /* All uses but the last are expected to be defined in the loop.
7866 : : The last use is the reduction variable. In case of nested cycle this
7867 : : assumption is not true: we use reduc_index to record the index of the
7868 : : reduction variable. */
7869 : 2547 : int reduc_index = SLP_TREE_REDUC_IDX (slp_node);
7870 : 2547 : tree vectype_in = SLP_TREE_VECTYPE (slp_node);
7871 : 2547 : if (lane_reducing_op_p (op.code))
7872 : 255 : vectype_in = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (slp_node)[0]);
7873 : :
7874 : 2547 : vec_num = vect_get_num_copies (loop_vinfo, SLP_TREE_CHILDREN (slp_node)[0]);
7875 : :
7876 : 2547 : code_helper code = canonicalize_code (op.code, op.type);
7877 : 2547 : internal_fn cond_fn
7878 : 468 : = ((code.is_internal_fn ()
7879 : 468 : && internal_fn_mask_index ((internal_fn)code) != -1)
7880 : 2547 : ? (internal_fn)code : get_conditional_internal_fn (code, op.type));
7881 : :
7882 : 2547 : vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7883 : 2547 : vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
7884 : 2547 : bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7885 : :
7886 : : /* Transform. */
7887 : 2547 : tree new_temp = NULL_TREE;
7888 : 17829 : auto_vec<tree> vec_oprnds[3];
7889 : :
7890 : 2547 : if (dump_enabled_p ())
7891 : 726 : dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7892 : :
7893 : : /* A binary COND_OP reduction must have the same definition and else
7894 : : value. */
7895 : 3015 : bool cond_fn_p = code.is_internal_fn ()
7896 : 468 : && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
7897 : 468 : if (cond_fn_p)
7898 : : {
7899 : 468 : gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
7900 : : || code == IFN_COND_MUL || code == IFN_COND_AND
7901 : : || code == IFN_COND_IOR || code == IFN_COND_XOR
7902 : : || code == IFN_COND_MIN || code == IFN_COND_MAX);
7903 : 468 : gcc_assert (op.num_ops == 4
7904 : : && (op.ops[reduc_index]
7905 : : == op.ops[internal_fn_else_index ((internal_fn) code)]));
7906 : : }
7907 : :
7908 : 2547 : bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7909 : :
7910 : 2547 : vect_reduction_type reduction_type = VECT_REDUC_INFO_TYPE (reduc_info);
7911 : 2547 : if (reduction_type == FOLD_LEFT_REDUCTION)
7912 : : {
7913 : 830 : internal_fn reduc_fn = VECT_REDUC_INFO_FN (reduc_info);
7914 : 830 : gcc_assert (code.is_tree_code () || cond_fn_p);
7915 : 830 : return vectorize_fold_left_reduction
7916 : 830 : (loop_vinfo, stmt_info, gsi, slp_node,
7917 : 830 : code, reduc_fn, op.num_ops, vectype_in,
7918 : 830 : reduc_index, masks, lens);
7919 : : }
7920 : :
7921 : 1717 : bool single_defuse_cycle = VECT_REDUC_INFO_FORCE_SINGLE_CYCLE (reduc_info);
7922 : 1717 : bool lane_reducing = lane_reducing_op_p (code);
7923 : 1462 : gcc_assert (single_defuse_cycle || lane_reducing);
7924 : :
7925 : 1717 : if (lane_reducing)
7926 : : {
7927 : : /* The last operand of lane-reducing op is for reduction. */
7928 : 255 : gcc_assert (reduc_index == (int) op.num_ops - 1);
7929 : : }
7930 : :
7931 : : /* Create the destination vector */
7932 : 1717 : tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
7933 : 1717 : tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7934 : :
7935 : : /* Get NCOPIES vector definitions for all operands except the reduction
7936 : : definition. */
7937 : 1717 : if (!cond_fn_p)
7938 : : {
7939 : 1264 : gcc_assert (reduc_index >= 0 && reduc_index <= 2);
7940 : 2108 : vect_get_vec_defs (loop_vinfo, slp_node,
7941 : 1264 : single_defuse_cycle && reduc_index == 0
7942 : : ? NULL_TREE : op.ops[0], &vec_oprnds[0],
7943 : 1264 : single_defuse_cycle && reduc_index == 1
7944 : : ? NULL_TREE : op.ops[1], &vec_oprnds[1],
7945 : 1264 : op.num_ops == 3
7946 : 255 : && !(single_defuse_cycle && reduc_index == 2)
7947 : : ? op.ops[2] : NULL_TREE, &vec_oprnds[2]);
7948 : : }
7949 : : else
7950 : : {
7951 : : /* For a conditional operation pass the truth type as mask
7952 : : vectype. */
7953 : 453 : gcc_assert (single_defuse_cycle
7954 : : && (reduc_index == 1 || reduc_index == 2));
7955 : 453 : vect_get_vec_defs (loop_vinfo, slp_node, op.ops[0],
7956 : : &vec_oprnds[0],
7957 : : reduc_index == 1 ? NULL_TREE : op.ops[1],
7958 : : &vec_oprnds[1],
7959 : : reduc_index == 2 ? NULL_TREE : op.ops[2],
7960 : : &vec_oprnds[2]);
7961 : : }
7962 : :
7963 : : /* For single def-use cycles get one copy of the vectorized reduction
7964 : : definition. */
7965 : 1717 : if (single_defuse_cycle)
7966 : : {
7967 : 1632 : vect_get_vec_defs (loop_vinfo, slp_node,
7968 : : reduc_index == 0 ? op.ops[0] : NULL_TREE,
7969 : : &vec_oprnds[0],
7970 : : reduc_index == 1 ? op.ops[1] : NULL_TREE,
7971 : : &vec_oprnds[1],
7972 : : reduc_index == 2 ? op.ops[2] : NULL_TREE,
7973 : : &vec_oprnds[2]);
7974 : : }
7975 : 85 : else if (lane_reducing)
7976 : : {
7977 : : /* For normal reduction, consistency between vectorized def/use is
7978 : : naturally ensured when mapping from scalar statement. But if lane-
7979 : : reducing op is involved in reduction, thing would become somewhat
7980 : : complicated in that the op's result and operand for accumulation are
7981 : : limited to less lanes than other operands, which certainly causes
7982 : : def/use mismatch on adjacent statements around the op if do not have
7983 : : any kind of specific adjustment. One approach is to refit lane-
7984 : : reducing op in the way of introducing new trivial pass-through copies
7985 : : to fix possible def/use gap, so as to make it behave like a normal op.
7986 : : And vector reduction PHIs are always generated to the full extent, no
7987 : : matter lane-reducing op exists or not. If some copies or PHIs are
7988 : : actually superfluous, they would be cleaned up by passes after
7989 : : vectorization. An example for single-lane slp, lane-reducing ops
7990 : : with mixed input vectypes in a reduction chain, is given as below.
7991 : : Similarly, this handling is applicable for multiple-lane slp as well.
7992 : :
7993 : : int sum = 1;
7994 : : for (i)
7995 : : {
7996 : : sum += d0[i] * d1[i]; // dot-prod <vector(16) char>
7997 : : sum += w[i]; // widen-sum <vector(16) char>
7998 : : sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
7999 : : sum += n[i]; // normal <vector(4) int>
8000 : : }
8001 : :
8002 : : The vector size is 128-bit,vectorization factor is 16. Reduction
8003 : : statements would be transformed as:
8004 : :
8005 : : vector<4> int sum_v0 = { 0, 0, 0, 1 };
8006 : : vector<4> int sum_v1 = { 0, 0, 0, 0 };
8007 : : vector<4> int sum_v2 = { 0, 0, 0, 0 };
8008 : : vector<4> int sum_v3 = { 0, 0, 0, 0 };
8009 : :
8010 : : for (i / 16)
8011 : : {
8012 : : sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
8013 : : sum_v1 = sum_v1; // copy
8014 : : sum_v2 = sum_v2; // copy
8015 : : sum_v3 = sum_v3; // copy
8016 : :
8017 : : sum_v0 = sum_v0; // copy
8018 : : sum_v1 = WIDEN_SUM (w_v1[i: 0 ~ 15], sum_v1);
8019 : : sum_v2 = sum_v2; // copy
8020 : : sum_v3 = sum_v3; // copy
8021 : :
8022 : : sum_v0 = sum_v0; // copy
8023 : : sum_v1 = SAD (s0_v1[i: 0 ~ 7 ], s1_v1[i: 0 ~ 7 ], sum_v1);
8024 : : sum_v2 = SAD (s0_v2[i: 8 ~ 15], s1_v2[i: 8 ~ 15], sum_v2);
8025 : : sum_v3 = sum_v3; // copy
8026 : :
8027 : : sum_v0 += n_v0[i: 0 ~ 3 ];
8028 : : sum_v1 += n_v1[i: 4 ~ 7 ];
8029 : : sum_v2 += n_v2[i: 8 ~ 11];
8030 : : sum_v3 += n_v3[i: 12 ~ 15];
8031 : : }
8032 : :
8033 : : Moreover, for a higher instruction parallelism in final vectorized
8034 : : loop, it is considered to make those effective vector lane-reducing
8035 : : ops be distributed evenly among all def-use cycles. In the above
8036 : : example, DOT_PROD, WIDEN_SUM and SADs are generated into disparate
8037 : : cycles, instruction dependency among them could be eliminated. */
8038 : 85 : unsigned effec_ncopies = vec_oprnds[0].length ();
8039 : 85 : unsigned total_ncopies = vec_oprnds[reduc_index].length ();
8040 : :
8041 : 85 : gcc_assert (effec_ncopies <= total_ncopies);
8042 : :
8043 : 85 : if (effec_ncopies < total_ncopies)
8044 : : {
8045 : 255 : for (unsigned i = 0; i < op.num_ops - 1; i++)
8046 : : {
8047 : 340 : gcc_assert (vec_oprnds[i].length () == effec_ncopies);
8048 : 170 : vec_oprnds[i].safe_grow_cleared (total_ncopies);
8049 : : }
8050 : : }
8051 : :
8052 : 85 : tree reduc_vectype_in = vectype_in;
8053 : 85 : gcc_assert (reduc_vectype_in);
8054 : :
8055 : 85 : unsigned effec_reduc_ncopies
8056 : 85 : = vect_get_num_copies (loop_vinfo, SLP_TREE_CHILDREN (slp_node)[0]);
8057 : :
8058 : 85 : gcc_assert (effec_ncopies <= effec_reduc_ncopies);
8059 : :
8060 : 85 : if (effec_ncopies < effec_reduc_ncopies)
8061 : : {
8062 : : /* Find suitable def-use cycles to generate vectorized statements
8063 : : into, and reorder operands based on the selection. */
8064 : 0 : unsigned curr_pos = VECT_REDUC_INFO_RESULT_POS (reduc_info);
8065 : 0 : unsigned next_pos = (curr_pos + effec_ncopies) % effec_reduc_ncopies;
8066 : :
8067 : 0 : gcc_assert (curr_pos < effec_reduc_ncopies);
8068 : 0 : VECT_REDUC_INFO_RESULT_POS (reduc_info) = next_pos;
8069 : :
8070 : 0 : if (curr_pos)
8071 : : {
8072 : 0 : unsigned count = effec_reduc_ncopies - effec_ncopies;
8073 : 0 : unsigned start = curr_pos - count;
8074 : :
8075 : 0 : if ((int) start < 0)
8076 : : {
8077 : 0 : count = curr_pos;
8078 : 0 : start = 0;
8079 : : }
8080 : :
8081 : 0 : for (unsigned i = 0; i < op.num_ops - 1; i++)
8082 : : {
8083 : 0 : for (unsigned j = effec_ncopies; j > start; j--)
8084 : : {
8085 : 0 : unsigned k = j - 1;
8086 : 0 : std::swap (vec_oprnds[i][k], vec_oprnds[i][k + count]);
8087 : 0 : gcc_assert (!vec_oprnds[i][k]);
8088 : : }
8089 : : }
8090 : : }
8091 : : }
8092 : : }
8093 : :
8094 : 1717 : bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (slp_node);
8095 : 2930 : unsigned num = vec_oprnds[reduc_index == 0 ? 1 : 0].length ();
8096 : 1717 : unsigned mask_index = 0;
8097 : :
8098 : 7540 : for (unsigned i = 0; i < num; ++i)
8099 : : {
8100 : 5823 : gimple *new_stmt;
8101 : 5823 : tree vop[3] = { vec_oprnds[0][i], vec_oprnds[1][i], NULL_TREE };
8102 : 5823 : if (!vop[0] || !vop[1])
8103 : : {
8104 : 456 : tree reduc_vop = vec_oprnds[reduc_index][i];
8105 : :
8106 : : /* If could not generate an effective vector statement for current
8107 : : portion of reduction operand, insert a trivial copy to simply
8108 : : handle over the operand to other dependent statements. */
8109 : 456 : gcc_assert (reduc_vop);
8110 : :
8111 : 456 : if (TREE_CODE (reduc_vop) == SSA_NAME
8112 : 456 : && !SSA_NAME_IS_DEFAULT_DEF (reduc_vop))
8113 : 456 : new_stmt = SSA_NAME_DEF_STMT (reduc_vop);
8114 : : else
8115 : : {
8116 : 0 : new_temp = make_ssa_name (vec_dest);
8117 : 0 : new_stmt = gimple_build_assign (new_temp, reduc_vop);
8118 : 0 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt,
8119 : : gsi);
8120 : : }
8121 : : }
8122 : 5367 : else if (masked_loop_p && !mask_by_cond_expr)
8123 : : {
8124 : : /* No conditional ifns have been defined for lane-reducing op
8125 : : yet. */
8126 : 16 : gcc_assert (!lane_reducing);
8127 : :
8128 : 16 : tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8129 : : vec_num, vectype_in,
8130 : : mask_index++);
8131 : 16 : gcall *call;
8132 : 24 : if (code.is_internal_fn () && cond_fn_p)
8133 : : {
8134 : 16 : gcc_assert (op.num_ops >= 3
8135 : : && internal_fn_mask_index (internal_fn (code)) == 0);
8136 : 8 : vop[2] = vec_oprnds[2][i];
8137 : 8 : mask = prepare_vec_mask (loop_vinfo, TREE_TYPE (mask),
8138 : : mask, vop[0], gsi);
8139 : 8 : call = gimple_build_call_internal (cond_fn, 4, mask, vop[1],
8140 : : vop[2], vop[reduc_index]);
8141 : : }
8142 : : else
8143 : : {
8144 : 8 : gcc_assert (code.is_tree_code ());
8145 : 8 : call = gimple_build_call_internal (cond_fn, 4, mask, vop[0],
8146 : : vop[1], vop[reduc_index]);
8147 : : }
8148 : 16 : new_temp = make_ssa_name (vec_dest, call);
8149 : 16 : gimple_call_set_lhs (call, new_temp);
8150 : 16 : gimple_call_set_nothrow (call, true);
8151 : 16 : vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
8152 : 16 : new_stmt = call;
8153 : : }
8154 : : else
8155 : : {
8156 : 5351 : if (op.num_ops >= 3)
8157 : 1753 : vop[2] = vec_oprnds[2][i];
8158 : :
8159 : 5351 : if (masked_loop_p && mask_by_cond_expr)
8160 : : {
8161 : 4 : tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8162 : : vec_num, vectype_in,
8163 : : mask_index++);
8164 : 4 : build_vect_cond_expr (code, vop, mask, gsi);
8165 : : }
8166 : :
8167 : 5351 : if (emulated_mixed_dot_prod)
8168 : 4 : new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
8169 : : vec_dest, vop);
8170 : :
8171 : 6689 : else if (code.is_internal_fn () && !cond_fn_p)
8172 : 0 : new_stmt = gimple_build_call_internal (internal_fn (code),
8173 : : op.num_ops,
8174 : : vop[0], vop[1], vop[2]);
8175 : 6689 : else if (code.is_internal_fn () && cond_fn_p)
8176 : 1342 : new_stmt = gimple_build_call_internal (internal_fn (code),
8177 : : op.num_ops,
8178 : : vop[0], vop[1], vop[2],
8179 : : vop[reduc_index]);
8180 : : else
8181 : 4005 : new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
8182 : : vop[0], vop[1], vop[2]);
8183 : 5351 : new_temp = make_ssa_name (vec_dest, new_stmt);
8184 : 5351 : gimple_set_lhs (new_stmt, new_temp);
8185 : 5351 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8186 : : }
8187 : :
8188 : 5823 : if (single_defuse_cycle && i < num - 1)
8189 : 3507 : vec_oprnds[reduc_index].safe_push (gimple_get_lhs (new_stmt));
8190 : : else
8191 : 2316 : slp_node->push_vec_def (new_stmt);
8192 : : }
8193 : :
8194 : : return true;
8195 : 10188 : }
8196 : :
8197 : : /* Transform phase of a cycle PHI. */
8198 : :
8199 : : bool
8200 : 23342 : vect_transform_cycle_phi (loop_vec_info loop_vinfo,
8201 : : stmt_vec_info stmt_info,
8202 : : slp_tree slp_node, slp_instance slp_node_instance)
8203 : : {
8204 : 23342 : tree vectype_out = SLP_TREE_VECTYPE (slp_node);
8205 : 23342 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8206 : 23342 : int i;
8207 : 23342 : bool nested_cycle = false;
8208 : 23342 : int vec_num;
8209 : :
8210 : 23458 : if (nested_in_vect_loop_p (loop, stmt_info))
8211 : : {
8212 : : loop = loop->inner;
8213 : : nested_cycle = true;
8214 : : }
8215 : :
8216 : 23342 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
8217 : 23342 : if (reduc_info
8218 : 22753 : && (VECT_REDUC_INFO_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
8219 : 22753 : || VECT_REDUC_INFO_TYPE (reduc_info) == FOLD_LEFT_REDUCTION))
8220 : : /* Leave the scalar phi in place. */
8221 : : return true;
8222 : :
8223 : 21923 : if (reduc_info && reduc_info->is_reduc_chain && dump_enabled_p ())
8224 : 109 : dump_printf_loc (MSG_NOTE, vect_location,
8225 : : "vectorizing a reduction chain\n");
8226 : :
8227 : 22512 : vec_num = vect_get_num_copies (loop_vinfo, slp_node);
8228 : :
8229 : : /* Check whether we should use a single PHI node and accumulate
8230 : : vectors to one before the backedge. */
8231 : 22512 : if (reduc_info && VECT_REDUC_INFO_FORCE_SINGLE_CYCLE (reduc_info))
8232 : 22512 : vec_num = 1;
8233 : :
8234 : : /* Create the destination vector */
8235 : 22512 : gphi *phi = as_a <gphi *> (stmt_info->stmt);
8236 : 22512 : tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
8237 : : vectype_out);
8238 : :
8239 : : /* Get the loop-entry arguments. */
8240 : 22512 : tree vec_initial_def = NULL_TREE;
8241 : 22512 : auto_vec<tree> vec_initial_defs;
8242 : 22512 : vec_initial_defs.reserve (vec_num);
8243 : : /* Optimize: if initial_def is for REDUC_MAX smaller than the base
8244 : : and we can't use zero for induc_val, use initial_def. Similarly
8245 : : for REDUC_MIN and initial_def larger than the base. */
8246 : 22512 : if (reduc_info
8247 : 21923 : && VECT_REDUC_INFO_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
8248 : : {
8249 : 66 : gcc_assert (SLP_TREE_LANES (slp_node) == 1);
8250 : 66 : tree initial_def = vect_phi_initial_value (phi);
8251 : 66 : VECT_REDUC_INFO_INITIAL_VALUES (reduc_info).safe_push (initial_def);
8252 : 66 : tree induc_val = VECT_REDUC_INFO_INDUC_COND_INITIAL_VAL (reduc_info);
8253 : 66 : if (TREE_CODE (initial_def) == INTEGER_CST
8254 : 64 : && !integer_zerop (induc_val)
8255 : 130 : && ((VECT_REDUC_INFO_CODE (reduc_info) == MAX_EXPR
8256 : 44 : && tree_int_cst_lt (initial_def, induc_val))
8257 : 61 : || (VECT_REDUC_INFO_CODE (reduc_info) == MIN_EXPR
8258 : 20 : && tree_int_cst_lt (induc_val, initial_def))))
8259 : : {
8260 : 3 : induc_val = initial_def;
8261 : : /* Communicate we used the initial_def to epilouge
8262 : : generation. */
8263 : 3 : VECT_REDUC_INFO_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
8264 : : }
8265 : 66 : vec_initial_defs.quick_push
8266 : 66 : (build_vector_from_val (vectype_out, induc_val));
8267 : 66 : }
8268 : 22446 : else if (nested_cycle)
8269 : : {
8270 : 670 : unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
8271 : 670 : vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
8272 : : &vec_initial_defs);
8273 : : }
8274 : : else
8275 : : {
8276 : 21776 : gcc_assert (slp_node == slp_node_instance->reduc_phis);
8277 : 21776 : vec<tree> &initial_values = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info);
8278 : 21776 : vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
8279 : :
8280 : 21776 : unsigned int num_phis = stmts.length ();
8281 : 21776 : if (reduc_info->is_reduc_chain)
8282 : 179 : num_phis = 1;
8283 : 21776 : initial_values.reserve (num_phis);
8284 : 44000 : for (unsigned int i = 0; i < num_phis; ++i)
8285 : : {
8286 : 22224 : gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
8287 : 22224 : initial_values.quick_push (vect_phi_initial_value (this_phi));
8288 : : }
8289 : 21776 : if (vec_num == 1)
8290 : 21202 : vect_find_reusable_accumulator (loop_vinfo, reduc_info, vectype_out);
8291 : 21776 : if (!initial_values.is_empty ())
8292 : : {
8293 : 21561 : tree initial_value
8294 : 42902 : = (num_phis == 1 ? initial_values[0] : NULL_TREE);
8295 : 21561 : code_helper code = VECT_REDUC_INFO_CODE (reduc_info);
8296 : 21561 : tree neutral_op
8297 : 21561 : = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8298 : : code, initial_value);
8299 : : /* Try to simplify the vector initialization by applying an
8300 : : adjustment after the reduction has been performed. This
8301 : : can also break a critical path but on the other hand
8302 : : requires to keep the initial value live across the loop. */
8303 : 21561 : if (neutral_op
8304 : 21470 : && initial_values.length () == 1
8305 : 21270 : && !VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info)
8306 : 17333 : && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8307 : 38816 : && !operand_equal_p (neutral_op, initial_values[0]))
8308 : : {
8309 : 12135 : VECT_REDUC_INFO_EPILOGUE_ADJUSTMENT (reduc_info)
8310 : 12135 : = initial_values[0];
8311 : 12135 : initial_values[0] = neutral_op;
8312 : : }
8313 : 43122 : get_initial_defs_for_reduction (loop_vinfo, reduc_info, vectype_out,
8314 : : &vec_initial_defs, vec_num,
8315 : : stmts.length (), neutral_op);
8316 : : }
8317 : : }
8318 : :
8319 : 22512 : if (vec_initial_def)
8320 : : {
8321 : 0 : vec_initial_defs.create (1);
8322 : 0 : vec_initial_defs.quick_push (vec_initial_def);
8323 : : }
8324 : :
8325 : 22512 : if (reduc_info)
8326 : 21923 : if (auto *accumulator = VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info))
8327 : : {
8328 : 4169 : tree def = accumulator->reduc_input;
8329 : 4169 : if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8330 : : {
8331 : 4166 : unsigned int nreduc;
8332 : 8332 : bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
8333 : 4166 : (TREE_TYPE (def)),
8334 : 4166 : TYPE_VECTOR_SUBPARTS (vectype_out),
8335 : : &nreduc);
8336 : 0 : gcc_assert (res);
8337 : 4166 : gimple_seq stmts = NULL;
8338 : : /* Reduce the single vector to a smaller one. */
8339 : 4166 : if (nreduc != 1)
8340 : : {
8341 : : /* Perform the reduction in the appropriate type. */
8342 : 4166 : tree rvectype = vectype_out;
8343 : 4166 : if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
8344 : 4166 : TREE_TYPE (TREE_TYPE (def))))
8345 : 233 : rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
8346 : : TYPE_VECTOR_SUBPARTS
8347 : 466 : (vectype_out));
8348 : 4166 : def = vect_create_partial_epilog (def, rvectype,
8349 : : VECT_REDUC_INFO_CODE
8350 : : (reduc_info),
8351 : : &stmts);
8352 : : }
8353 : : /* The epilogue loop might use a different vector mode, like
8354 : : VNx2DI vs. V2DI. */
8355 : 4166 : if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
8356 : : {
8357 : 0 : tree reduc_type = build_vector_type_for_mode
8358 : 0 : (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
8359 : 0 : def = gimple_convert (&stmts, reduc_type, def);
8360 : : }
8361 : : /* Adjust the input so we pick up the partially reduced value
8362 : : for the skip edge in vect_create_epilog_for_reduction. */
8363 : 4166 : accumulator->reduc_input = def;
8364 : : /* And the reduction could be carried out using a different sign. */
8365 : 4166 : if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8366 : 233 : def = gimple_convert (&stmts, vectype_out, def);
8367 : 4166 : edge e;
8368 : 4166 : if ((e = loop_vinfo->main_loop_edge)
8369 : 4166 : || (e = loop_vinfo->skip_this_loop_edge))
8370 : : {
8371 : : /* While we'd like to insert on the edge this will split
8372 : : blocks and disturb bookkeeping, we also will eventually
8373 : : need this on the skip edge. Rely on sinking to
8374 : : fixup optimal placement and insert in the pred. */
8375 : 3951 : gimple_stmt_iterator gsi = gsi_last_bb (e->src);
8376 : : /* Insert before a cond that eventually skips the
8377 : : epilogue. */
8378 : 3951 : if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
8379 : 3934 : gsi_prev (&gsi);
8380 : 3951 : gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
8381 : : }
8382 : : else
8383 : 215 : gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
8384 : : stmts);
8385 : : }
8386 : 4169 : if (loop_vinfo->main_loop_edge)
8387 : 3954 : vec_initial_defs[0]
8388 : 3954 : = vect_get_main_loop_result (loop_vinfo, def,
8389 : 3954 : vec_initial_defs[0]);
8390 : : else
8391 : 215 : vec_initial_defs.safe_push (def);
8392 : : }
8393 : :
8394 : : /* Generate the reduction PHIs upfront. */
8395 : 46739 : for (i = 0; i < vec_num; i++)
8396 : : {
8397 : 24227 : tree vec_init_def = vec_initial_defs[i];
8398 : : /* Create the reduction-phi that defines the reduction
8399 : : operand. */
8400 : 24227 : gphi *new_phi = create_phi_node (vec_dest, loop->header);
8401 : 24227 : add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
8402 : : UNKNOWN_LOCATION);
8403 : :
8404 : : /* The loop-latch arg is set in epilogue processing. */
8405 : :
8406 : 24227 : slp_node->push_vec_def (new_phi);
8407 : : }
8408 : :
8409 : 22512 : return true;
8410 : 22512 : }
8411 : :
8412 : : /* Vectorizes LC PHIs. */
8413 : :
8414 : : bool
8415 : 156864 : vectorizable_lc_phi (loop_vec_info loop_vinfo,
8416 : : stmt_vec_info stmt_info,
8417 : : slp_tree slp_node)
8418 : : {
8419 : 156864 : if (!loop_vinfo
8420 : 156864 : || !is_a <gphi *> (stmt_info->stmt)
8421 : 187628 : || gimple_phi_num_args (stmt_info->stmt) != 1)
8422 : : return false;
8423 : :
8424 : 704 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8425 : 0 : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8426 : : return false;
8427 : :
8428 : : /* Deal with copies from externs or constants that disguise as
8429 : : loop-closed PHI nodes (PR97886). */
8430 : 704 : if (!vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
8431 : : SLP_TREE_VECTYPE (slp_node)))
8432 : : {
8433 : 0 : if (dump_enabled_p ())
8434 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8435 : : "incompatible vector types for invariants\n");
8436 : 0 : return false;
8437 : : }
8438 : :
8439 : : /* ??? This can happen with data vs. mask uses of boolean. */
8440 : 704 : if (!useless_type_conversion_p (SLP_TREE_VECTYPE (slp_node),
8441 : 704 : SLP_TREE_VECTYPE
8442 : : (SLP_TREE_CHILDREN (slp_node)[0])))
8443 : : {
8444 : 0 : if (dump_enabled_p ())
8445 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8446 : : "missed mask promotion\n");
8447 : 0 : return false;
8448 : : }
8449 : :
8450 : 704 : SLP_TREE_TYPE (slp_node) = lc_phi_info_type;
8451 : 704 : return true;
8452 : : }
8453 : :
8454 : : bool
8455 : 447 : vect_transform_lc_phi (loop_vec_info loop_vinfo,
8456 : : stmt_vec_info stmt_info,
8457 : : slp_tree slp_node)
8458 : : {
8459 : :
8460 : 447 : tree vectype = SLP_TREE_VECTYPE (slp_node);
8461 : 447 : tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8462 : 447 : basic_block bb = gimple_bb (stmt_info->stmt);
8463 : 447 : edge e = single_pred_edge (bb);
8464 : 447 : tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8465 : 447 : auto_vec<tree> vec_oprnds;
8466 : 894 : vect_get_vec_defs (loop_vinfo, slp_node,
8467 : 447 : gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
8468 : 1001 : for (unsigned i = 0; i < vec_oprnds.length (); i++)
8469 : : {
8470 : : /* Create the vectorized LC PHI node. */
8471 : 554 : gphi *new_phi = create_phi_node (vec_dest, bb);
8472 : 554 : add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
8473 : 554 : slp_node->push_vec_def (new_phi);
8474 : : }
8475 : :
8476 : 447 : return true;
8477 : 447 : }
8478 : :
8479 : : /* Vectorizes PHIs. */
8480 : :
8481 : : bool
8482 : 139658 : vectorizable_phi (bb_vec_info vinfo,
8483 : : stmt_vec_info stmt_info,
8484 : : slp_tree slp_node, stmt_vector_for_cost *cost_vec)
8485 : : {
8486 : 139658 : if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
8487 : : return false;
8488 : :
8489 : 68431 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
8490 : : return false;
8491 : :
8492 : 68431 : tree vectype = SLP_TREE_VECTYPE (slp_node);
8493 : :
8494 : 68431 : if (cost_vec) /* transformation not required. */
8495 : : {
8496 : : slp_tree child;
8497 : : unsigned i;
8498 : 186418 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
8499 : 131432 : if (!child)
8500 : : {
8501 : 0 : if (dump_enabled_p ())
8502 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8503 : : "PHI node with unvectorized backedge def\n");
8504 : 0 : return false;
8505 : : }
8506 : 131432 : else if (!vect_maybe_update_slp_op_vectype (child, vectype))
8507 : : {
8508 : 18 : if (dump_enabled_p ())
8509 : 2 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8510 : : "incompatible vector types for invariants\n");
8511 : 18 : return false;
8512 : : }
8513 : 131414 : else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8514 : 131414 : && !useless_type_conversion_p (vectype,
8515 : : SLP_TREE_VECTYPE (child)))
8516 : : {
8517 : : /* With bools we can have mask and non-mask precision vectors
8518 : : or different non-mask precisions. while pattern recog is
8519 : : supposed to guarantee consistency here bugs in it can cause
8520 : : mismatches (PR103489 and PR103800 for example).
8521 : : Deal with them here instead of ICEing later. */
8522 : 18 : if (dump_enabled_p ())
8523 : 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8524 : : "incompatible vector type setup from "
8525 : : "bool pattern detection\n");
8526 : 18 : return false;
8527 : : }
8528 : :
8529 : : /* For single-argument PHIs assume coalescing which means zero cost
8530 : : for the scalar and the vector PHIs. This avoids artificially
8531 : : favoring the vector path (but may pessimize it in some cases). */
8532 : 54986 : if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
8533 : 50064 : record_stmt_cost (cost_vec, vect_get_num_copies (vinfo, slp_node),
8534 : : vector_stmt, slp_node, vectype, 0, vect_body);
8535 : 54986 : SLP_TREE_TYPE (slp_node) = phi_info_type;
8536 : 54986 : return true;
8537 : : }
8538 : :
8539 : 13409 : tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8540 : 13409 : basic_block bb = gimple_bb (stmt_info->stmt);
8541 : 13409 : tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8542 : 13409 : auto_vec<gphi *> new_phis;
8543 : 48360 : for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
8544 : : {
8545 : 34951 : slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8546 : :
8547 : : /* Skip not yet vectorized defs. */
8548 : 35321 : if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8549 : 34951 : && SLP_TREE_VEC_DEFS (child).is_empty ())
8550 : 370 : continue;
8551 : :
8552 : 34581 : auto_vec<tree> vec_oprnds;
8553 : 34581 : vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
8554 : 34581 : if (!new_phis.exists ())
8555 : : {
8556 : 13409 : new_phis.create (vec_oprnds.length ());
8557 : 28355 : for (unsigned j = 0; j < vec_oprnds.length (); j++)
8558 : : {
8559 : : /* Create the vectorized LC PHI node. */
8560 : 14946 : new_phis.quick_push (create_phi_node (vec_dest, bb));
8561 : 14946 : slp_node->push_vec_def (new_phis[j]);
8562 : : }
8563 : : }
8564 : 34581 : edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
8565 : 74818 : for (unsigned j = 0; j < vec_oprnds.length (); j++)
8566 : 40237 : add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
8567 : 34581 : }
8568 : : /* We should have at least one already vectorized child. */
8569 : 13409 : gcc_assert (new_phis.exists ());
8570 : :
8571 : 13409 : return true;
8572 : 13409 : }
8573 : :
8574 : : /* Vectorizes first order recurrences. An overview of the transformation
8575 : : is described below. Suppose we have the following loop.
8576 : :
8577 : : int t = 0;
8578 : : for (int i = 0; i < n; ++i)
8579 : : {
8580 : : b[i] = a[i] - t;
8581 : : t = a[i];
8582 : : }
8583 : :
8584 : : There is a first-order recurrence on 'a'. For this loop, the scalar IR
8585 : : looks (simplified) like:
8586 : :
8587 : : scalar.preheader:
8588 : : init = 0;
8589 : :
8590 : : scalar.body:
8591 : : i = PHI <0(scalar.preheader), i+1(scalar.body)>
8592 : : _2 = PHI <(init(scalar.preheader), <_1(scalar.body)>
8593 : : _1 = a[i]
8594 : : b[i] = _1 - _2
8595 : : if (i < n) goto scalar.body
8596 : :
8597 : : In this example, _2 is a recurrence because it's value depends on the
8598 : : previous iteration. We vectorize this as (VF = 4)
8599 : :
8600 : : vector.preheader:
8601 : : vect_init = vect_cst(..., ..., ..., 0)
8602 : :
8603 : : vector.body
8604 : : i = PHI <0(vector.preheader), i+4(vector.body)>
8605 : : vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
8606 : : vect_2 = a[i, i+1, i+2, i+3];
8607 : : vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
8608 : : b[i, i+1, i+2, i+3] = vect_2 - vect_3
8609 : : if (..) goto vector.body
8610 : :
8611 : : In this function, vectorizable_recurr, we code generate both the
8612 : : vector PHI node and the permute since those together compute the
8613 : : vectorized value of the scalar PHI. We do not yet have the
8614 : : backedge value to fill in there nor into the vec_perm. Those
8615 : : are filled in vect_schedule_scc.
8616 : :
8617 : : TODO: Since the scalar loop does not have a use of the recurrence
8618 : : outside of the loop the natural way to implement peeling via
8619 : : vectorizing the live value doesn't work. For now peeling of loops
8620 : : with a recurrence is not implemented. For SLP the supported cases
8621 : : are restricted to those requiring a single vector recurrence PHI. */
8622 : :
8623 : : bool
8624 : 156200 : vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8625 : : slp_tree slp_node, stmt_vector_for_cost *cost_vec)
8626 : : {
8627 : 156200 : if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
8628 : : return false;
8629 : :
8630 : 30100 : gphi *phi = as_a<gphi *> (stmt_info->stmt);
8631 : :
8632 : : /* So far we only support first-order recurrence auto-vectorization. */
8633 : 30100 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
8634 : : return false;
8635 : :
8636 : 408 : tree vectype = SLP_TREE_VECTYPE (slp_node);
8637 : 408 : unsigned ncopies = vect_get_num_copies (loop_vinfo, slp_node);
8638 : 408 : poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8639 : 408 : unsigned dist = SLP_TREE_LANES (slp_node);
8640 : : /* We need to be able to make progress with a single vector. */
8641 : 408 : if (maybe_gt (dist * 2, nunits))
8642 : : {
8643 : 0 : if (dump_enabled_p ())
8644 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8645 : : "first order recurrence exceeds half of "
8646 : : "a vector\n");
8647 : 0 : return false;
8648 : : }
8649 : :
8650 : : /* We need to be able to build a { ..., a, b } init vector with
8651 : : dist number of distinct trailing values. Always possible
8652 : : when dist == 1 or when nunits is constant or when the initializations
8653 : : are uniform. */
8654 : 408 : tree uniform_initval = NULL_TREE;
8655 : 408 : edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
8656 : 1656 : for (stmt_vec_info s : SLP_TREE_SCALAR_STMTS (slp_node))
8657 : : {
8658 : 444 : gphi *phi = as_a <gphi *> (s->stmt);
8659 : 444 : if (! uniform_initval)
8660 : 408 : uniform_initval = PHI_ARG_DEF_FROM_EDGE (phi, pe);
8661 : 36 : else if (! operand_equal_p (uniform_initval,
8662 : 36 : PHI_ARG_DEF_FROM_EDGE (phi, pe)))
8663 : : {
8664 : : uniform_initval = NULL_TREE;
8665 : : break;
8666 : : }
8667 : : }
8668 : 408 : if (!uniform_initval && !nunits.is_constant ())
8669 : : {
8670 : : if (dump_enabled_p ())
8671 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8672 : : "cannot build initialization vector for "
8673 : : "first order recurrence\n");
8674 : : return false;
8675 : : }
8676 : :
8677 : : /* First-order recurrence autovectorization needs to handle permutation
8678 : : with indices = [nunits-1, nunits, nunits+1, ...]. */
8679 : 408 : vec_perm_builder sel (nunits, 1, 3);
8680 : 1632 : for (int i = 0; i < 3; ++i)
8681 : 1224 : sel.quick_push (nunits - dist + i);
8682 : 408 : vec_perm_indices indices (sel, 2, nunits);
8683 : :
8684 : 408 : if (cost_vec) /* transformation not required. */
8685 : : {
8686 : 368 : if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
8687 : : indices))
8688 : : return false;
8689 : :
8690 : : /* We eventually need to set a vector type on invariant
8691 : : arguments. */
8692 : : unsigned j;
8693 : : slp_tree child;
8694 : 768 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
8695 : 512 : if (!vect_maybe_update_slp_op_vectype (child, vectype))
8696 : : {
8697 : 0 : if (dump_enabled_p ())
8698 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8699 : : "incompatible vector types for "
8700 : : "invariants\n");
8701 : 0 : return false;
8702 : : }
8703 : :
8704 : : /* Verify we have set up compatible types. */
8705 : 256 : edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
8706 : 256 : slp_tree latch_def = SLP_TREE_CHILDREN (slp_node)[le->dest_idx];
8707 : 256 : tree latch_vectype = SLP_TREE_VECTYPE (latch_def);
8708 : 256 : if (!types_compatible_p (latch_vectype, vectype))
8709 : : return false;
8710 : :
8711 : : /* The recurrence costs the initialization vector and one permute
8712 : : for each copy. With SLP the prologue value is explicitly
8713 : : represented and costed separately. */
8714 : 256 : unsigned prologue_cost = 0;
8715 : 256 : unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
8716 : : slp_node, 0, vect_body);
8717 : 256 : if (dump_enabled_p ())
8718 : 48 : dump_printf_loc (MSG_NOTE, vect_location,
8719 : : "vectorizable_recurr: inside_cost = %d, "
8720 : : "prologue_cost = %d .\n", inside_cost,
8721 : : prologue_cost);
8722 : :
8723 : 256 : SLP_TREE_TYPE (slp_node) = recurr_info_type;
8724 : 256 : return true;
8725 : : }
8726 : :
8727 : 40 : tree vec_init;
8728 : 40 : if (! uniform_initval)
8729 : : {
8730 : 6 : vec<constructor_elt, va_gc> *v = NULL;
8731 : 6 : vec_alloc (v, nunits.to_constant ());
8732 : 33 : for (unsigned i = 0; i < nunits.to_constant () - dist; ++i)
8733 : 27 : CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
8734 : : build_zero_cst (TREE_TYPE (vectype)));
8735 : 39 : for (stmt_vec_info s : SLP_TREE_SCALAR_STMTS (slp_node))
8736 : : {
8737 : 21 : gphi *phi = as_a <gphi *> (s->stmt);
8738 : 21 : tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
8739 : 21 : if (!useless_type_conversion_p (TREE_TYPE (vectype),
8740 : 21 : TREE_TYPE (preheader)))
8741 : : {
8742 : 0 : gimple_seq stmts = NULL;
8743 : 0 : preheader = gimple_convert (&stmts,
8744 : 0 : TREE_TYPE (vectype), preheader);
8745 : 0 : gsi_insert_seq_on_edge_immediate (pe, stmts);
8746 : : }
8747 : 21 : CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, preheader);
8748 : : }
8749 : 6 : vec_init = build_constructor (vectype, v);
8750 : : }
8751 : : else
8752 : : vec_init = uniform_initval;
8753 : 40 : vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
8754 : :
8755 : : /* Create the vectorized first-order PHI node. */
8756 : 40 : tree vec_dest = vect_get_new_vect_var (vectype,
8757 : : vect_simple_var, "vec_recur_");
8758 : 40 : basic_block bb = gimple_bb (phi);
8759 : 40 : gphi *new_phi = create_phi_node (vec_dest, bb);
8760 : 40 : add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
8761 : :
8762 : : /* Insert shuffles the first-order recurrence autovectorization.
8763 : : result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>. */
8764 : 40 : tree perm = vect_gen_perm_mask_checked (vectype, indices);
8765 : :
8766 : : /* Insert the required permute after the latch definition. The
8767 : : second and later operands are tentative and will be updated when we have
8768 : : vectorized the latch definition. */
8769 : 40 : edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
8770 : 40 : gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
8771 : 40 : gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
8772 : 40 : gsi_next (&gsi2);
8773 : :
8774 : 117 : for (unsigned i = 0; i < ncopies; ++i)
8775 : : {
8776 : 77 : vec_dest = make_ssa_name (vectype);
8777 : 77 : gassign *vperm
8778 : 117 : = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
8779 : 40 : i == 0 ? gimple_phi_result (new_phi) : NULL,
8780 : : NULL, perm);
8781 : 77 : vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
8782 : :
8783 : 77 : slp_node->push_vec_def (vperm);
8784 : : }
8785 : :
8786 : : return true;
8787 : 408 : }
8788 : :
8789 : : /* Return true if VECTYPE represents a vector that requires lowering
8790 : : by the vector lowering pass. */
8791 : :
8792 : : bool
8793 : 635553 : vect_emulated_vector_p (tree vectype)
8794 : : {
8795 : 1271106 : return (!VECTOR_MODE_P (TYPE_MODE (vectype))
8796 : 638240 : && (!VECTOR_BOOLEAN_TYPE_P (vectype)
8797 : 2669 : || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
8798 : : }
8799 : :
8800 : : /* Return true if we can emulate CODE on an integer mode representation
8801 : : of a vector. */
8802 : :
8803 : : bool
8804 : 10706 : vect_can_vectorize_without_simd_p (tree_code code)
8805 : : {
8806 : 10706 : switch (code)
8807 : : {
8808 : : case PLUS_EXPR:
8809 : : case MINUS_EXPR:
8810 : : case NEGATE_EXPR:
8811 : : case BIT_AND_EXPR:
8812 : : case BIT_IOR_EXPR:
8813 : : case BIT_XOR_EXPR:
8814 : : case BIT_NOT_EXPR:
8815 : : return true;
8816 : :
8817 : 9945 : default:
8818 : 9945 : return false;
8819 : : }
8820 : : }
8821 : :
8822 : : /* Likewise, but taking a code_helper. */
8823 : :
8824 : : bool
8825 : 155 : vect_can_vectorize_without_simd_p (code_helper code)
8826 : : {
8827 : 155 : return (code.is_tree_code ()
8828 : 155 : && vect_can_vectorize_without_simd_p (tree_code (code)));
8829 : : }
8830 : :
8831 : : /* Create vector init for vectorized iv. */
8832 : : static tree
8833 : 916 : vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
8834 : : tree step_expr, poly_uint64 nunits,
8835 : : tree vectype,
8836 : : enum vect_induction_op_type induction_type)
8837 : : {
8838 : 916 : unsigned HOST_WIDE_INT const_nunits;
8839 : 916 : tree vec_shift, vec_init, new_name;
8840 : 916 : unsigned i;
8841 : 916 : tree itype = TREE_TYPE (vectype);
8842 : :
8843 : : /* iv_loop is the loop to be vectorized. Create:
8844 : : vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr). */
8845 : 916 : new_name = gimple_convert (stmts, itype, init_expr);
8846 : 916 : switch (induction_type)
8847 : : {
8848 : 18 : case vect_step_op_shr:
8849 : 18 : case vect_step_op_shl:
8850 : : /* Build the Initial value from shift_expr. */
8851 : 18 : vec_init = gimple_build_vector_from_val (stmts,
8852 : : vectype,
8853 : : new_name);
8854 : 18 : vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
8855 : : build_zero_cst (itype), step_expr);
8856 : 18 : vec_init = gimple_build (stmts,
8857 : : (induction_type == vect_step_op_shr
8858 : : ? RSHIFT_EXPR : LSHIFT_EXPR),
8859 : : vectype, vec_init, vec_shift);
8860 : 18 : break;
8861 : :
8862 : 822 : case vect_step_op_neg:
8863 : 822 : {
8864 : 822 : vec_init = gimple_build_vector_from_val (stmts,
8865 : : vectype,
8866 : : new_name);
8867 : 822 : tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
8868 : : vectype, vec_init);
8869 : : /* The encoding has 2 interleaved stepped patterns. */
8870 : 822 : vec_perm_builder sel (nunits, 2, 3);
8871 : 822 : sel.quick_grow (6);
8872 : 4110 : for (i = 0; i < 3; i++)
8873 : : {
8874 : 2466 : sel[2 * i] = i;
8875 : 2466 : sel[2 * i + 1] = i + nunits;
8876 : : }
8877 : 822 : vec_perm_indices indices (sel, 2, nunits);
8878 : : /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
8879 : : fail when vec_init is const vector. In that situation vec_perm is not
8880 : : really needed. */
8881 : 822 : tree perm_mask_even
8882 : 822 : = vect_gen_perm_mask_any (vectype, indices);
8883 : 822 : vec_init = gimple_build (stmts, VEC_PERM_EXPR,
8884 : : vectype,
8885 : : vec_init, vec_neg,
8886 : : perm_mask_even);
8887 : 822 : }
8888 : 822 : break;
8889 : :
8890 : 76 : case vect_step_op_mul:
8891 : 76 : {
8892 : : /* Use unsigned mult to avoid UD integer overflow. */
8893 : 76 : gcc_assert (nunits.is_constant (&const_nunits));
8894 : 76 : tree utype = unsigned_type_for (itype);
8895 : 76 : tree uvectype = build_vector_type (utype,
8896 : 76 : TYPE_VECTOR_SUBPARTS (vectype));
8897 : 76 : new_name = gimple_convert (stmts, utype, new_name);
8898 : 76 : vec_init = gimple_build_vector_from_val (stmts,
8899 : : uvectype,
8900 : : new_name);
8901 : 76 : tree_vector_builder elts (uvectype, const_nunits, 1);
8902 : 76 : tree elt_step = build_one_cst (utype);
8903 : :
8904 : 76 : elts.quick_push (elt_step);
8905 : 660 : for (i = 1; i < const_nunits; i++)
8906 : : {
8907 : : /* Create: new_name_i = new_name + step_expr. */
8908 : 508 : elt_step = gimple_build (stmts, MULT_EXPR,
8909 : : utype, elt_step, step_expr);
8910 : 508 : elts.quick_push (elt_step);
8911 : : }
8912 : : /* Create a vector from [new_name_0, new_name_1, ...,
8913 : : new_name_nunits-1]. */
8914 : 76 : tree vec_mul = gimple_build_vector (stmts, &elts);
8915 : 76 : vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
8916 : : vec_init, vec_mul);
8917 : 76 : vec_init = gimple_convert (stmts, vectype, vec_init);
8918 : 76 : }
8919 : 76 : break;
8920 : :
8921 : 0 : default:
8922 : 0 : gcc_unreachable ();
8923 : : }
8924 : :
8925 : 916 : return vec_init;
8926 : : }
8927 : :
8928 : : /* Peel init_expr by skip_niter for induction_type. */
8929 : : tree
8930 : 84 : vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
8931 : : tree skip_niters, tree step_expr,
8932 : : enum vect_induction_op_type induction_type)
8933 : : {
8934 : 84 : gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
8935 : 84 : tree type = TREE_TYPE (init_expr);
8936 : 84 : unsigned prec = TYPE_PRECISION (type);
8937 : 84 : switch (induction_type)
8938 : : {
8939 : 0 : case vect_step_op_neg:
8940 : 0 : if (TREE_INT_CST_LOW (skip_niters) % 2)
8941 : 0 : init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
8942 : : /* else no change. */
8943 : : break;
8944 : :
8945 : 12 : case vect_step_op_shr:
8946 : 12 : case vect_step_op_shl:
8947 : 12 : skip_niters = gimple_convert (stmts, type, skip_niters);
8948 : 12 : step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
8949 : : /* When shift mount >= precision, need to avoid UD.
8950 : : In the original loop, there's no UD, and according to semantic,
8951 : : init_expr should be 0 for lshr, ashl, and >>= (prec - 1) for ashr. */
8952 : 12 : if (!tree_fits_uhwi_p (step_expr)
8953 : 12 : || tree_to_uhwi (step_expr) >= prec)
8954 : : {
8955 : 6 : if (induction_type == vect_step_op_shl
8956 : 6 : || TYPE_UNSIGNED (type))
8957 : 4 : init_expr = build_zero_cst (type);
8958 : : else
8959 : 2 : init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
8960 : : init_expr,
8961 : 4 : wide_int_to_tree (type, prec - 1));
8962 : : }
8963 : : else
8964 : 8 : init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
8965 : : ? RSHIFT_EXPR : LSHIFT_EXPR),
8966 : : type, init_expr, step_expr);
8967 : : break;
8968 : :
8969 : 72 : case vect_step_op_mul:
8970 : 72 : {
8971 : 72 : tree utype = unsigned_type_for (type);
8972 : 72 : init_expr = gimple_convert (stmts, utype, init_expr);
8973 : 72 : wide_int skipn = wi::to_wide (skip_niters);
8974 : 72 : wide_int begin = wi::to_wide (step_expr);
8975 : 72 : auto_mpz base, exp, mod, res;
8976 : 72 : wi::to_mpz (begin, base, TYPE_SIGN (type));
8977 : 72 : wi::to_mpz (skipn, exp, UNSIGNED);
8978 : 72 : mpz_ui_pow_ui (mod, 2, TYPE_PRECISION (type));
8979 : 72 : mpz_powm (res, base, exp, mod);
8980 : 72 : begin = wi::from_mpz (utype, res, true);
8981 : 72 : tree mult_expr = wide_int_to_tree (utype, begin);
8982 : 72 : init_expr = gimple_build (stmts, MULT_EXPR, utype,
8983 : : init_expr, mult_expr);
8984 : 72 : init_expr = gimple_convert (stmts, type, init_expr);
8985 : 72 : }
8986 : 72 : break;
8987 : :
8988 : 0 : default:
8989 : 0 : gcc_unreachable ();
8990 : : }
8991 : :
8992 : 84 : return init_expr;
8993 : : }
8994 : :
8995 : : /* Create vector step for vectorized iv. */
8996 : : static tree
8997 : 1202 : vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
8998 : : poly_uint64 vf,
8999 : : enum vect_induction_op_type induction_type)
9000 : : {
9001 : 1202 : tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
9002 : 1202 : tree new_name = NULL;
9003 : : /* Step should be pow (step, vf) for mult induction. */
9004 : 1202 : if (induction_type == vect_step_op_mul)
9005 : : {
9006 : 76 : gcc_assert (vf.is_constant ());
9007 : 76 : wide_int begin = wi::to_wide (step_expr);
9008 : :
9009 : 584 : for (unsigned i = 0; i != vf.to_constant () - 1; i++)
9010 : 508 : begin = wi::mul (begin, wi::to_wide (step_expr));
9011 : :
9012 : 76 : new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
9013 : 76 : }
9014 : 1126 : else if (induction_type == vect_step_op_neg)
9015 : : /* Do nothing. */
9016 : : ;
9017 : : else
9018 : 18 : new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
9019 : : expr, step_expr);
9020 : 1202 : return new_name;
9021 : : }
9022 : :
9023 : : static tree
9024 : 1202 : vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
9025 : : stmt_vec_info stmt_info,
9026 : : tree new_name, tree vectype,
9027 : : enum vect_induction_op_type induction_type)
9028 : : {
9029 : : /* No step is needed for neg induction. */
9030 : 1202 : if (induction_type == vect_step_op_neg)
9031 : : return NULL;
9032 : :
9033 : 94 : tree t = unshare_expr (new_name);
9034 : 94 : gcc_assert (CONSTANT_CLASS_P (new_name)
9035 : : || TREE_CODE (new_name) == SSA_NAME);
9036 : 94 : tree new_vec = build_vector_from_val (vectype, t);
9037 : 94 : tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
9038 : : new_vec, vectype, NULL);
9039 : 94 : return vec_step;
9040 : : }
9041 : :
9042 : : /* Update vectorized iv with vect_step, induc_def is init. */
9043 : : static tree
9044 : 1390 : vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
9045 : : tree induc_def, tree vec_step,
9046 : : enum vect_induction_op_type induction_type)
9047 : : {
9048 : 1390 : tree vec_def = induc_def;
9049 : 1390 : switch (induction_type)
9050 : : {
9051 : 76 : case vect_step_op_mul:
9052 : 76 : {
9053 : : /* Use unsigned mult to avoid UD integer overflow. */
9054 : 76 : tree uvectype
9055 : 76 : = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)),
9056 : 76 : TYPE_VECTOR_SUBPARTS (vectype));
9057 : 76 : vec_def = gimple_convert (stmts, uvectype, vec_def);
9058 : 76 : vec_step = gimple_convert (stmts, uvectype, vec_step);
9059 : 76 : vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
9060 : : vec_def, vec_step);
9061 : 76 : vec_def = gimple_convert (stmts, vectype, vec_def);
9062 : : }
9063 : 76 : break;
9064 : :
9065 : 12 : case vect_step_op_shr:
9066 : 12 : vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
9067 : : vec_def, vec_step);
9068 : 12 : break;
9069 : :
9070 : 6 : case vect_step_op_shl:
9071 : 6 : vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
9072 : : vec_def, vec_step);
9073 : 6 : break;
9074 : : case vect_step_op_neg:
9075 : : vec_def = induc_def;
9076 : : /* Do nothing. */
9077 : : break;
9078 : 0 : default:
9079 : 0 : gcc_unreachable ();
9080 : : }
9081 : :
9082 : 1390 : return vec_def;
9083 : :
9084 : : }
9085 : :
9086 : : /* Function vectorizable_nonlinear_induction
9087 : :
9088 : : Check if STMT_INFO performs an nonlinear induction computation that can be
9089 : : vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
9090 : : a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
9091 : : basic block.
9092 : : Return true if STMT_INFO is vectorizable in this way. */
9093 : :
9094 : : static bool
9095 : 10412 : vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
9096 : : stmt_vec_info stmt_info,
9097 : : slp_tree slp_node,
9098 : : stmt_vector_for_cost *cost_vec)
9099 : : {
9100 : 10412 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9101 : 10412 : unsigned ncopies;
9102 : 10412 : bool nested_in_vect_loop = false;
9103 : 10412 : class loop *iv_loop;
9104 : 10412 : tree vec_def;
9105 : 10412 : edge pe = loop_preheader_edge (loop);
9106 : 10412 : basic_block new_bb;
9107 : 10412 : tree vec_init, vec_step;
9108 : 10412 : tree new_name;
9109 : 10412 : gimple *new_stmt;
9110 : 10412 : gphi *induction_phi;
9111 : 10412 : tree induc_def, vec_dest;
9112 : 10412 : tree init_expr, step_expr;
9113 : 10412 : tree niters_skip;
9114 : 10412 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9115 : 10412 : unsigned i;
9116 : 10412 : gimple_stmt_iterator si;
9117 : :
9118 : 10412 : gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9119 : :
9120 : 10412 : tree vectype = SLP_TREE_VECTYPE (slp_node);
9121 : 10412 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9122 : 10412 : enum vect_induction_op_type induction_type
9123 : : = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9124 : :
9125 : 10412 : gcc_assert (induction_type > vect_step_op_add);
9126 : :
9127 : 10412 : ncopies = vect_get_num_copies (loop_vinfo, slp_node);
9128 : 10412 : gcc_assert (ncopies >= 1);
9129 : :
9130 : : /* FORNOW. Only handle nonlinear induction in the same loop. */
9131 : 10412 : if (nested_in_vect_loop_p (loop, stmt_info))
9132 : : {
9133 : 0 : if (dump_enabled_p ())
9134 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9135 : : "nonlinear induction in nested loop.\n");
9136 : 0 : return false;
9137 : : }
9138 : :
9139 : 10412 : iv_loop = loop;
9140 : 10412 : gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9141 : :
9142 : : /* TODO: Support multi-lane SLP for nonlinear iv. There should be separate
9143 : : vector iv update for each iv and a permutation to generate wanted
9144 : : vector iv. */
9145 : 10412 : if (SLP_TREE_LANES (slp_node) > 1)
9146 : : {
9147 : 0 : if (dump_enabled_p ())
9148 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9149 : : "SLP induction not supported for nonlinear"
9150 : : " induction.\n");
9151 : 0 : return false;
9152 : : }
9153 : :
9154 : 10412 : if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
9155 : : {
9156 : 0 : if (dump_enabled_p ())
9157 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9158 : : "floating point nonlinear induction vectorization"
9159 : : " not supported.\n");
9160 : 0 : return false;
9161 : : }
9162 : :
9163 : 10412 : step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9164 : 10412 : init_expr = vect_phi_initial_value (phi);
9165 : 10412 : gcc_assert (step_expr != NULL_TREE && init_expr != NULL
9166 : : && TREE_CODE (step_expr) == INTEGER_CST);
9167 : : /* step_expr should be aligned with init_expr,
9168 : : .i.e. uint64 a >> 1, step is int, but vector<uint64> shift is used. */
9169 : 10412 : step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
9170 : :
9171 : 10412 : if (TREE_CODE (init_expr) == INTEGER_CST)
9172 : 2837 : init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
9173 : 7575 : else if (!tree_nop_conversion_p (TREE_TYPE (vectype), TREE_TYPE (init_expr)))
9174 : : {
9175 : : /* INIT_EXPR could be a bit_field, bail out for such case. */
9176 : 4 : if (dump_enabled_p ())
9177 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9178 : : "nonlinear induction vectorization failed:"
9179 : : " component type of vectype is not a nop conversion"
9180 : : " from type of init_expr.\n");
9181 : 4 : return false;
9182 : : }
9183 : :
9184 : 10408 : switch (induction_type)
9185 : : {
9186 : 2538 : case vect_step_op_neg:
9187 : 2538 : if (maybe_eq (TYPE_VECTOR_SUBPARTS (vectype), 1u))
9188 : : return false;
9189 : 2534 : if (TREE_CODE (init_expr) != INTEGER_CST
9190 : 190 : && TREE_CODE (init_expr) != REAL_CST)
9191 : : {
9192 : : /* Check for backend support of NEGATE_EXPR and vec_perm. */
9193 : 190 : if (!directly_supported_p (NEGATE_EXPR, vectype))
9194 : 0 : return false;
9195 : :
9196 : : /* The encoding has 2 interleaved stepped patterns. */
9197 : 190 : vec_perm_builder sel (nunits, 2, 3);
9198 : 190 : machine_mode mode = TYPE_MODE (vectype);
9199 : 190 : sel.quick_grow (6);
9200 : 950 : for (i = 0; i < 3; i++)
9201 : : {
9202 : 570 : sel[i * 2] = i;
9203 : 570 : sel[i * 2 + 1] = i + nunits;
9204 : : }
9205 : 190 : vec_perm_indices indices (sel, 2, nunits);
9206 : 190 : if (!can_vec_perm_const_p (mode, mode, indices))
9207 : 0 : return false;
9208 : 190 : }
9209 : : break;
9210 : :
9211 : 744 : case vect_step_op_mul:
9212 : 744 : {
9213 : : /* Check for backend support of MULT_EXPR. */
9214 : 744 : if (!directly_supported_p (MULT_EXPR, vectype))
9215 : : return false;
9216 : :
9217 : : /* ?? How to construct vector step for variable number vector.
9218 : : [ 1, step, pow (step, 2), pow (step, 4), .. ]. */
9219 : : if (!vf.is_constant ())
9220 : : return false;
9221 : : }
9222 : : break;
9223 : :
9224 : 7022 : case vect_step_op_shr:
9225 : : /* Check for backend support of RSHIFT_EXPR. */
9226 : 7022 : if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
9227 : : return false;
9228 : :
9229 : : /* Don't shift more than type precision to avoid UD. */
9230 : 26 : if (!tree_fits_uhwi_p (step_expr)
9231 : 26 : || maybe_ge (nunits * tree_to_uhwi (step_expr),
9232 : : TYPE_PRECISION (TREE_TYPE (init_expr))))
9233 : : return false;
9234 : : break;
9235 : :
9236 : 104 : case vect_step_op_shl:
9237 : : /* Check for backend support of RSHIFT_EXPR. */
9238 : 104 : if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
9239 : : return false;
9240 : :
9241 : : /* Don't shift more than type precision to avoid UD. */
9242 : 12 : if (!tree_fits_uhwi_p (step_expr)
9243 : 12 : || maybe_ge (nunits * tree_to_uhwi (step_expr),
9244 : : TYPE_PRECISION (TREE_TYPE (init_expr))))
9245 : : return false;
9246 : :
9247 : : break;
9248 : :
9249 : 0 : default:
9250 : 0 : gcc_unreachable ();
9251 : : }
9252 : :
9253 : 3152 : if (cost_vec) /* transformation not required. */
9254 : : {
9255 : 2236 : unsigned inside_cost = 0, prologue_cost = 0;
9256 : : /* loop cost for vec_loop. Neg induction doesn't have any
9257 : : inside_cost. */
9258 : 2236 : inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9259 : : slp_node, 0, vect_body);
9260 : :
9261 : : /* loop cost for vec_loop. Neg induction doesn't have any
9262 : : inside_cost. */
9263 : 2236 : if (induction_type == vect_step_op_neg)
9264 : 1712 : inside_cost = 0;
9265 : :
9266 : : /* prologue cost for vec_init and vec_step. */
9267 : 2236 : prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9268 : : slp_node, 0, vect_prologue);
9269 : :
9270 : 2236 : if (dump_enabled_p ())
9271 : 60 : dump_printf_loc (MSG_NOTE, vect_location,
9272 : : "vect_model_induction_cost: inside_cost = %d, "
9273 : : "prologue_cost = %d. \n", inside_cost,
9274 : : prologue_cost);
9275 : :
9276 : 2236 : SLP_TREE_TYPE (slp_node) = induc_vec_info_type;
9277 : 2236 : DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9278 : 2236 : return true;
9279 : : }
9280 : :
9281 : : /* Transform. */
9282 : :
9283 : : /* Compute a vector variable, initialized with the first VF values of
9284 : : the induction variable. E.g., for an iv with IV_PHI='X' and
9285 : : evolution S, for a vector of 4 units, we want to compute:
9286 : : [X, X + S, X + 2*S, X + 3*S]. */
9287 : :
9288 : 916 : if (dump_enabled_p ())
9289 : 32 : dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9290 : :
9291 : 916 : pe = loop_preheader_edge (iv_loop);
9292 : : /* Find the first insertion point in the BB. */
9293 : 916 : basic_block bb = gimple_bb (phi);
9294 : 916 : si = gsi_after_labels (bb);
9295 : :
9296 : 916 : gimple_seq stmts = NULL;
9297 : :
9298 : 916 : niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9299 : : /* If we are using the loop mask to "peel" for alignment then we need
9300 : : to adjust the start value here. */
9301 : 916 : if (niters_skip != NULL_TREE)
9302 : 0 : init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
9303 : : step_expr, induction_type);
9304 : :
9305 : 916 : vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
9306 : : step_expr, nunits, vectype,
9307 : : induction_type);
9308 : 916 : if (stmts)
9309 : : {
9310 : 162 : new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9311 : 162 : gcc_assert (!new_bb);
9312 : : }
9313 : :
9314 : 916 : stmts = NULL;
9315 : 916 : new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9316 : : vf, induction_type);
9317 : 916 : if (stmts)
9318 : : {
9319 : 0 : new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9320 : 0 : gcc_assert (!new_bb);
9321 : : }
9322 : :
9323 : 916 : vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9324 : : new_name, vectype,
9325 : : induction_type);
9326 : : /* Create the following def-use cycle:
9327 : : loop prolog:
9328 : : vec_init = ...
9329 : : vec_step = ...
9330 : : loop:
9331 : : vec_iv = PHI <vec_init, vec_loop>
9332 : : ...
9333 : : STMT
9334 : : ...
9335 : : vec_loop = vec_iv + vec_step; */
9336 : :
9337 : : /* Create the induction-phi that defines the induction-operand. */
9338 : 916 : vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9339 : 916 : induction_phi = create_phi_node (vec_dest, iv_loop->header);
9340 : 916 : induc_def = PHI_RESULT (induction_phi);
9341 : :
9342 : : /* Create the iv update inside the loop. */
9343 : 916 : stmts = NULL;
9344 : 916 : vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9345 : : induc_def, vec_step,
9346 : : induction_type);
9347 : :
9348 : 916 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9349 : 916 : new_stmt = SSA_NAME_DEF_STMT (vec_def);
9350 : :
9351 : : /* Set the arguments of the phi node: */
9352 : 916 : add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9353 : 916 : add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9354 : : UNKNOWN_LOCATION);
9355 : :
9356 : 916 : slp_node->push_vec_def (induction_phi);
9357 : :
9358 : : /* In case that vectorization factor (VF) is bigger than the number
9359 : : of elements that we can fit in a vectype (nunits), we have to generate
9360 : : more than one vector stmt - i.e - we need to "unroll" the
9361 : : vector stmt by a factor VF/nunits. For more details see documentation
9362 : : in vectorizable_operation. */
9363 : :
9364 : 916 : if (ncopies > 1)
9365 : : {
9366 : 286 : stmts = NULL;
9367 : : /* FORNOW. This restriction should be relaxed. */
9368 : 286 : gcc_assert (!nested_in_vect_loop);
9369 : :
9370 : 286 : new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9371 : : nunits, induction_type);
9372 : :
9373 : 286 : vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9374 : : new_name, vectype,
9375 : : induction_type);
9376 : 286 : vec_def = induc_def;
9377 : 1046 : for (i = 1; i < ncopies; i++)
9378 : : {
9379 : : /* vec_i = vec_prev + vec_step. */
9380 : 474 : stmts = NULL;
9381 : 474 : vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9382 : : vec_def, vec_step,
9383 : : induction_type);
9384 : 474 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9385 : 474 : new_stmt = SSA_NAME_DEF_STMT (vec_def);
9386 : 474 : slp_node->push_vec_def (new_stmt);
9387 : : }
9388 : : }
9389 : :
9390 : 916 : if (dump_enabled_p ())
9391 : 64 : dump_printf_loc (MSG_NOTE, vect_location,
9392 : : "transform induction: created def-use cycle: %G%G",
9393 : 32 : (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9394 : :
9395 : : return true;
9396 : : }
9397 : :
9398 : : /* Function vectorizable_induction
9399 : :
9400 : : Check if STMT_INFO performs an induction computation that can be vectorized.
9401 : : If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9402 : : phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9403 : : Return true if STMT_INFO is vectorizable in this way. */
9404 : :
9405 : : bool
9406 : 269405 : vectorizable_induction (loop_vec_info loop_vinfo,
9407 : : stmt_vec_info stmt_info,
9408 : : slp_tree slp_node, stmt_vector_for_cost *cost_vec)
9409 : : {
9410 : 269405 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9411 : 269405 : bool nested_in_vect_loop = false;
9412 : 269405 : class loop *iv_loop;
9413 : 269405 : tree vec_def;
9414 : 269405 : edge pe = loop_preheader_edge (loop);
9415 : 269405 : basic_block new_bb;
9416 : 269405 : tree vec_init = NULL_TREE, vec_step, t;
9417 : 269405 : tree new_name;
9418 : 269405 : gphi *induction_phi;
9419 : 269405 : tree induc_def, vec_dest;
9420 : 269405 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9421 : 269405 : unsigned i;
9422 : 269405 : tree expr;
9423 : 269405 : tree index_vectype = NULL_TREE;
9424 : 269405 : gimple_stmt_iterator si;
9425 : 269405 : enum vect_induction_op_type induction_type
9426 : : = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9427 : :
9428 : 292909 : gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9429 : 143305 : if (!phi)
9430 : : return false;
9431 : :
9432 : 143305 : if (!STMT_VINFO_RELEVANT_P (stmt_info))
9433 : : return false;
9434 : :
9435 : : /* Make sure it was recognized as induction computation. */
9436 : 143305 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
9437 : : return false;
9438 : :
9439 : : /* Handle nonlinear induction in a separate place. */
9440 : 139755 : if (induction_type != vect_step_op_add)
9441 : 10412 : return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
9442 : 10412 : slp_node, cost_vec);
9443 : :
9444 : 129343 : tree vectype = SLP_TREE_VECTYPE (slp_node);
9445 : 129343 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9446 : :
9447 : : /* FORNOW. These restrictions should be relaxed. */
9448 : 129343 : if (nested_in_vect_loop_p (loop, stmt_info))
9449 : : {
9450 : 602 : imm_use_iterator imm_iter;
9451 : 602 : use_operand_p use_p;
9452 : 602 : gimple *exit_phi;
9453 : 602 : edge latch_e;
9454 : 602 : tree loop_arg;
9455 : :
9456 : 602 : exit_phi = NULL;
9457 : 602 : latch_e = loop_latch_edge (loop->inner);
9458 : 602 : loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
9459 : 1848 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
9460 : : {
9461 : 654 : gimple *use_stmt = USE_STMT (use_p);
9462 : 654 : if (is_gimple_debug (use_stmt))
9463 : 36 : continue;
9464 : :
9465 : 618 : if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
9466 : : {
9467 : : exit_phi = use_stmt;
9468 : : break;
9469 : : }
9470 : 602 : }
9471 : 602 : if (exit_phi)
9472 : : {
9473 : 10 : stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
9474 : 10 : if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
9475 : 6 : && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
9476 : : {
9477 : 4 : if (dump_enabled_p ())
9478 : 4 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9479 : : "inner-loop induction only used outside "
9480 : : "of the outer vectorized loop.\n");
9481 : 4 : return false;
9482 : : }
9483 : : }
9484 : :
9485 : 598 : nested_in_vect_loop = true;
9486 : 598 : iv_loop = loop->inner;
9487 : : }
9488 : : else
9489 : : iv_loop = loop;
9490 : 129339 : gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9491 : :
9492 : 129339 : if (!nunits.is_constant () && SLP_TREE_LANES (slp_node) != 1)
9493 : : {
9494 : : /* The current SLP code creates the step value element-by-element. */
9495 : : if (dump_enabled_p ())
9496 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9497 : : "SLP induction not supported for variable-length"
9498 : : " vectors.\n");
9499 : : return false;
9500 : : }
9501 : :
9502 : 129339 : if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
9503 : : {
9504 : 12 : if (dump_enabled_p ())
9505 : 12 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9506 : : "floating point induction vectorization disabled\n");
9507 : 12 : return false;
9508 : : }
9509 : :
9510 : 129327 : tree step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9511 : 129327 : gcc_assert (step_expr != NULL_TREE);
9512 : 258608 : if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
9513 : 258516 : && !type_has_mode_precision_p (TREE_TYPE (step_expr)))
9514 : : {
9515 : 12 : if (dump_enabled_p ())
9516 : 12 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9517 : : "bit-precision induction vectorization not "
9518 : : "supported.\n");
9519 : 12 : return false;
9520 : : }
9521 : 129315 : tree stept = TREE_TYPE (step_expr);
9522 : 129315 : tree step_vectype = get_same_sized_vectype (stept, vectype);
9523 : 129315 : stept = TREE_TYPE (step_vectype);
9524 : :
9525 : : /* Check for target support of the vectorized arithmetic used here. */
9526 : 129315 : if (!target_supports_op_p (step_vectype, PLUS_EXPR, optab_default)
9527 : 129315 : || !target_supports_op_p (step_vectype, MINUS_EXPR, optab_default))
9528 : 19926 : return false;
9529 : 109389 : if (!nunits.is_constant ())
9530 : : {
9531 : : if (!target_supports_op_p (step_vectype, MULT_EXPR, optab_default))
9532 : : return false;
9533 : : /* FLOAT_EXPR when computing VEC_INIT for float inductions. */
9534 : : if (SCALAR_FLOAT_TYPE_P (stept))
9535 : : {
9536 : : tree index_type = build_nonstandard_integer_type
9537 : : (GET_MODE_BITSIZE (SCALAR_TYPE_MODE (stept)), 1);
9538 : :
9539 : : index_vectype = build_vector_type (index_type, nunits);
9540 : : if (!can_float_p (TYPE_MODE (step_vectype),
9541 : : TYPE_MODE (index_vectype), 1))
9542 : : return false;
9543 : : }
9544 : : }
9545 : :
9546 : 109389 : unsigned nvects = vect_get_num_copies (loop_vinfo, slp_node);
9547 : 109389 : if (cost_vec) /* transformation not required. */
9548 : : {
9549 : 276963 : unsigned inside_cost = 0, prologue_cost = 0;
9550 : : /* We eventually need to set a vector type on invariant
9551 : : arguments. */
9552 : : unsigned j;
9553 : : slp_tree child;
9554 : 276963 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9555 : 184642 : if (!vect_maybe_update_slp_op_vectype
9556 : 184642 : (child, SLP_TREE_VECTYPE (slp_node)))
9557 : : {
9558 : 0 : if (dump_enabled_p ())
9559 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9560 : : "incompatible vector types for "
9561 : : "invariants\n");
9562 : 0 : return false;
9563 : : }
9564 : : /* loop cost for vec_loop. */
9565 : 92321 : inside_cost = record_stmt_cost (cost_vec, nvects,
9566 : : vector_stmt, slp_node, 0, vect_body);
9567 : : /* prologue cost for vec_init (if not nested) and step. */
9568 : 92321 : prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
9569 : : scalar_to_vec,
9570 : : slp_node, 0, vect_prologue);
9571 : 92321 : if (dump_enabled_p ())
9572 : 3954 : dump_printf_loc (MSG_NOTE, vect_location,
9573 : : "vect_model_induction_cost: inside_cost = %d, "
9574 : : "prologue_cost = %d .\n", inside_cost,
9575 : : prologue_cost);
9576 : :
9577 : 92321 : SLP_TREE_TYPE (slp_node) = induc_vec_info_type;
9578 : 92321 : DUMP_VECT_SCOPE ("vectorizable_induction");
9579 : 92321 : return true;
9580 : : }
9581 : :
9582 : : /* Transform. */
9583 : :
9584 : : /* Compute a vector variable, initialized with the first VF values of
9585 : : the induction variable. E.g., for an iv with IV_PHI='X' and
9586 : : evolution S, for a vector of 4 units, we want to compute:
9587 : : [X, X + S, X + 2*S, X + 3*S]. */
9588 : :
9589 : 17068 : if (dump_enabled_p ())
9590 : 2835 : dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9591 : :
9592 : 17068 : pe = loop_preheader_edge (iv_loop);
9593 : : /* Find the first insertion point in the BB. */
9594 : 17068 : basic_block bb = gimple_bb (phi);
9595 : 17068 : si = gsi_after_labels (bb);
9596 : :
9597 : : /* For SLP induction we have to generate several IVs as for example
9598 : : with group size 3 we need
9599 : : [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
9600 : : [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
9601 : 17068 : gimple_stmt_iterator incr_si;
9602 : 17068 : bool insert_after;
9603 : 17068 : standard_iv_increment_position (iv_loop, &incr_si, &insert_after);
9604 : :
9605 : : /* The initial values are vectorized, but any lanes > group_size
9606 : : need adjustment. */
9607 : 17068 : slp_tree init_node
9608 : 17068 : = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
9609 : :
9610 : : /* Gather steps. Since we do not vectorize inductions as
9611 : : cycles we have to reconstruct the step from SCEV data. */
9612 : 17068 : unsigned group_size = SLP_TREE_LANES (slp_node);
9613 : 17068 : tree *steps = XALLOCAVEC (tree, group_size);
9614 : 17068 : tree *inits = XALLOCAVEC (tree, group_size);
9615 : 17068 : stmt_vec_info phi_info;
9616 : 52409 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
9617 : : {
9618 : 18273 : steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
9619 : 18273 : if (!init_node)
9620 : 18097 : inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
9621 : : pe->dest_idx);
9622 : : }
9623 : :
9624 : : /* Now generate the IVs. */
9625 : 34136 : gcc_assert (multiple_p (nunits * nvects, group_size));
9626 : 17068 : unsigned nivs;
9627 : 17068 : unsigned HOST_WIDE_INT const_nunits;
9628 : 17068 : if (nested_in_vect_loop)
9629 : : nivs = nvects;
9630 : 16906 : else if (nunits.is_constant (&const_nunits))
9631 : : {
9632 : : /* Compute the number of distinct IVs we need. First reduce
9633 : : group_size if it is a multiple of const_nunits so we get
9634 : : one IV for a group_size of 4 but const_nunits 2. */
9635 : 16906 : unsigned group_sizep = group_size;
9636 : 16906 : if (group_sizep % const_nunits == 0)
9637 : 109 : group_sizep = group_sizep / const_nunits;
9638 : 16906 : nivs = least_common_multiple (group_sizep, const_nunits) / const_nunits;
9639 : : }
9640 : : else
9641 : : {
9642 : : gcc_assert (SLP_TREE_LANES (slp_node) == 1);
9643 : : nivs = 1;
9644 : : }
9645 : 17068 : gimple_seq init_stmts = NULL;
9646 : 17068 : tree lupdate_mul = NULL_TREE;
9647 : 162 : if (!nested_in_vect_loop)
9648 : : {
9649 : 16906 : if (nunits.is_constant (&const_nunits))
9650 : : {
9651 : : /* The number of iterations covered in one vector iteration. */
9652 : 16906 : unsigned lup_mul = (nvects * const_nunits) / group_size;
9653 : 16906 : lupdate_mul
9654 : 16906 : = build_vector_from_val (step_vectype,
9655 : 16906 : SCALAR_FLOAT_TYPE_P (stept)
9656 : 27 : ? build_real_from_wide (stept, lup_mul,
9657 : : UNSIGNED)
9658 : 33785 : : build_int_cstu (stept, lup_mul));
9659 : : }
9660 : : else
9661 : : {
9662 : : if (SCALAR_FLOAT_TYPE_P (stept))
9663 : : {
9664 : : tree tem = build_int_cst (integer_type_node, vf);
9665 : : lupdate_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept, tem);
9666 : : }
9667 : : else
9668 : : lupdate_mul = build_int_cst (stept, vf);
9669 : : lupdate_mul = gimple_build_vector_from_val (&init_stmts, step_vectype,
9670 : : lupdate_mul);
9671 : : }
9672 : : }
9673 : 17068 : tree peel_mul = NULL_TREE;
9674 : 17068 : if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
9675 : : {
9676 : 0 : if (SCALAR_FLOAT_TYPE_P (stept))
9677 : 0 : peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
9678 : : LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9679 : : else
9680 : 0 : peel_mul = gimple_convert (&init_stmts, stept,
9681 : : LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9682 : 0 : peel_mul = gimple_build_vector_from_val (&init_stmts,
9683 : : step_vectype, peel_mul);
9684 : :
9685 : : /* If early break then we have to create a new PHI which we can use as
9686 : : an offset to adjust the induction reduction in early exits.
9687 : :
9688 : : This is because when peeling for alignment using masking, the first
9689 : : few elements of the vector can be inactive. As such if we find the
9690 : : entry in the first iteration we have adjust the starting point of
9691 : : the scalar code.
9692 : :
9693 : : We do this by creating a new scalar PHI that keeps track of whether
9694 : : we are the first iteration of the loop (with the additional masking)
9695 : : or whether we have taken a loop iteration already.
9696 : :
9697 : : The generated sequence:
9698 : :
9699 : : pre-header:
9700 : : bb1:
9701 : : i_1 = <number of leading inactive elements>
9702 : :
9703 : : header:
9704 : : bb2:
9705 : : i_2 = PHI <i_1(bb1), 0(latch)>
9706 : : …
9707 : :
9708 : : early-exit:
9709 : : bb3:
9710 : : i_3 = iv_step * i_2 + PHI<vector-iv>
9711 : :
9712 : : The first part of the adjustment to create i_1 and i_2 are done here
9713 : : and the last part creating i_3 is done in
9714 : : vectorizable_live_operations when the induction extraction is
9715 : : materialized. */
9716 : 0 : if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
9717 : 0 : && !LOOP_VINFO_MASK_NITERS_PFA_OFFSET (loop_vinfo))
9718 : : {
9719 : 0 : auto skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9720 : 0 : tree ty_skip_niters = TREE_TYPE (skip_niters);
9721 : 0 : tree break_lhs_phi = vect_get_new_vect_var (ty_skip_niters,
9722 : : vect_scalar_var,
9723 : : "pfa_iv_offset");
9724 : 0 : gphi *nphi = create_phi_node (break_lhs_phi, bb);
9725 : 0 : add_phi_arg (nphi, skip_niters, pe, UNKNOWN_LOCATION);
9726 : 0 : add_phi_arg (nphi, build_zero_cst (ty_skip_niters),
9727 : : loop_latch_edge (iv_loop), UNKNOWN_LOCATION);
9728 : :
9729 : 0 : LOOP_VINFO_MASK_NITERS_PFA_OFFSET (loop_vinfo) = PHI_RESULT (nphi);
9730 : : }
9731 : : }
9732 : 17068 : tree step_mul = NULL_TREE;
9733 : 17068 : unsigned ivn;
9734 : 17068 : auto_vec<tree> vec_steps;
9735 : 34702 : for (ivn = 0; ivn < nivs; ++ivn)
9736 : : {
9737 : 17634 : gimple_seq stmts = NULL;
9738 : 17634 : bool invariant = true;
9739 : 17634 : if (nunits.is_constant (&const_nunits))
9740 : : {
9741 : 17634 : tree_vector_builder step_elts (step_vectype, const_nunits, 1);
9742 : 17634 : tree_vector_builder init_elts (vectype, const_nunits, 1);
9743 : 17634 : tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
9744 : 117800 : for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
9745 : : {
9746 : : /* The scalar steps of the IVs. */
9747 : 100166 : tree elt = steps[(ivn*const_nunits + eltn) % group_size];
9748 : 100166 : elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
9749 : 100166 : step_elts.quick_push (elt);
9750 : 100166 : if (!init_node)
9751 : : {
9752 : : /* The scalar inits of the IVs if not vectorized. */
9753 : 99204 : elt = inits[(ivn*const_nunits + eltn) % group_size];
9754 : 99204 : if (!useless_type_conversion_p (TREE_TYPE (vectype),
9755 : 99204 : TREE_TYPE (elt)))
9756 : 266 : elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
9757 : 266 : TREE_TYPE (vectype), elt);
9758 : 99204 : init_elts.quick_push (elt);
9759 : : }
9760 : : /* The number of steps to add to the initial values. */
9761 : 100166 : unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
9762 : 200332 : mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
9763 : 200234 : ? build_real_from_wide (stept, mul_elt,
9764 : : UNSIGNED)
9765 : 200234 : : build_int_cstu (stept, mul_elt));
9766 : : }
9767 : 17634 : vec_step = gimple_build_vector (&init_stmts, &step_elts);
9768 : 17634 : step_mul = gimple_build_vector (&init_stmts, &mul_elts);
9769 : 17634 : if (!init_node)
9770 : 17440 : vec_init = gimple_build_vector (&init_stmts, &init_elts);
9771 : 17634 : }
9772 : : else
9773 : : {
9774 : : if (init_node)
9775 : : ;
9776 : : else if (INTEGRAL_TYPE_P (TREE_TYPE (steps[0])))
9777 : : {
9778 : : new_name = gimple_convert (&init_stmts, stept, inits[0]);
9779 : : /* Build the initial value directly as a VEC_SERIES_EXPR. */
9780 : : vec_init = gimple_build (&init_stmts, VEC_SERIES_EXPR,
9781 : : step_vectype, new_name, steps[0]);
9782 : : if (!useless_type_conversion_p (vectype, step_vectype))
9783 : : vec_init = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
9784 : : vectype, vec_init);
9785 : : }
9786 : : else
9787 : : {
9788 : : /* Build:
9789 : : [base, base, base, ...]
9790 : : + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
9791 : : gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (steps[0])));
9792 : : gcc_assert (flag_associative_math);
9793 : : gcc_assert (index_vectype != NULL_TREE);
9794 : :
9795 : : tree index = build_index_vector (index_vectype, 0, 1);
9796 : : new_name = gimple_convert (&init_stmts, TREE_TYPE (steps[0]),
9797 : : inits[0]);
9798 : : tree base_vec = gimple_build_vector_from_val (&init_stmts,
9799 : : step_vectype,
9800 : : new_name);
9801 : : tree step_vec = gimple_build_vector_from_val (&init_stmts,
9802 : : step_vectype,
9803 : : steps[0]);
9804 : : vec_init = gimple_build (&init_stmts, FLOAT_EXPR,
9805 : : step_vectype, index);
9806 : : vec_init = gimple_build (&init_stmts, MULT_EXPR,
9807 : : step_vectype, vec_init, step_vec);
9808 : : vec_init = gimple_build (&init_stmts, PLUS_EXPR,
9809 : : step_vectype, vec_init, base_vec);
9810 : : if (!useless_type_conversion_p (vectype, step_vectype))
9811 : : vec_init = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
9812 : : vectype, vec_init);
9813 : : }
9814 : : /* iv_loop is nested in the loop to be vectorized. Generate:
9815 : : vec_step = [S, S, S, S] */
9816 : : t = unshare_expr (steps[0]);
9817 : : gcc_assert (CONSTANT_CLASS_P (t)
9818 : : || TREE_CODE (t) == SSA_NAME);
9819 : : vec_step = gimple_build_vector_from_val (&init_stmts,
9820 : : step_vectype, t);
9821 : : }
9822 : 17634 : vec_steps.safe_push (vec_step);
9823 : 17634 : if (peel_mul)
9824 : : {
9825 : 0 : if (!step_mul)
9826 : : {
9827 : 0 : gcc_assert (!nunits.is_constant ());
9828 : : step_mul = gimple_build (&init_stmts,
9829 : : MINUS_EXPR, step_vectype,
9830 : : build_zero_cst (step_vectype), peel_mul);
9831 : : }
9832 : : else
9833 : 0 : step_mul = gimple_build (&init_stmts,
9834 : : MINUS_EXPR, step_vectype,
9835 : : step_mul, peel_mul);
9836 : : }
9837 : :
9838 : : /* Create the induction-phi that defines the induction-operand. */
9839 : 17634 : vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
9840 : : "vec_iv_");
9841 : 17634 : induction_phi = create_phi_node (vec_dest, iv_loop->header);
9842 : 17634 : induc_def = PHI_RESULT (induction_phi);
9843 : :
9844 : : /* Create the iv update inside the loop */
9845 : 17634 : tree up = vec_step;
9846 : 17634 : if (lupdate_mul)
9847 : : {
9848 : 17440 : if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
9849 : : {
9850 : : /* When we're using loop_len produced by SELEC_VL, the
9851 : : non-final iterations are not always processing VF
9852 : : elements. So vectorize induction variable instead of
9853 : :
9854 : : _21 = vect_vec_iv_.6_22 + { VF, ... };
9855 : :
9856 : : We should generate:
9857 : :
9858 : : _35 = .SELECT_VL (ivtmp_33, VF);
9859 : : vect_cst__22 = [vec_duplicate_expr] _35;
9860 : : _21 = vect_vec_iv_.6_22 + vect_cst__22; */
9861 : 0 : vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
9862 : 0 : tree len = vect_get_loop_len (loop_vinfo, NULL, lens, 1,
9863 : : vectype, 0, 0);
9864 : 0 : if (SCALAR_FLOAT_TYPE_P (stept))
9865 : 0 : expr = gimple_build (&stmts, FLOAT_EXPR, stept, len);
9866 : : else
9867 : 0 : expr = gimple_convert (&stmts, stept, len);
9868 : 0 : lupdate_mul = gimple_build_vector_from_val (&stmts, step_vectype,
9869 : : expr);
9870 : 0 : up = gimple_build (&stmts, MULT_EXPR,
9871 : : step_vectype, vec_step, lupdate_mul);
9872 : : }
9873 : : else
9874 : 17440 : up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9875 : : vec_step, lupdate_mul);
9876 : : }
9877 : 17634 : vec_def = gimple_convert (&stmts, step_vectype, induc_def);
9878 : 17634 : vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, up);
9879 : 17634 : vec_def = gimple_convert (&stmts, vectype, vec_def);
9880 : 17634 : insert_iv_increment (&incr_si, insert_after, stmts);
9881 : 17634 : add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9882 : : UNKNOWN_LOCATION);
9883 : :
9884 : 17634 : if (init_node)
9885 : 194 : vec_init = vect_get_slp_vect_def (init_node, ivn);
9886 : 17634 : if (!nested_in_vect_loop
9887 : 17634 : && step_mul
9888 : 17634 : && !integer_zerop (step_mul))
9889 : : {
9890 : 17006 : gcc_assert (invariant);
9891 : 17006 : vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
9892 : 17006 : up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9893 : : vec_step, step_mul);
9894 : 17006 : vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
9895 : : vec_def, up);
9896 : 17006 : vec_init = gimple_convert (&init_stmts, vectype, vec_def);
9897 : : }
9898 : :
9899 : : /* Set the arguments of the phi node: */
9900 : 17634 : add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9901 : :
9902 : 17634 : slp_node->push_vec_def (induction_phi);
9903 : : }
9904 : 17068 : if (!nested_in_vect_loop)
9905 : : {
9906 : : /* Fill up to the number of vectors we need for the whole group. */
9907 : 16906 : if (nunits.is_constant (&const_nunits))
9908 : 16906 : nivs = least_common_multiple (group_size, const_nunits) / const_nunits;
9909 : : else
9910 : : nivs = 1;
9911 : 16906 : vec_steps.reserve (nivs-ivn);
9912 : 33833 : for (; ivn < nivs; ++ivn)
9913 : : {
9914 : 21 : slp_node->push_vec_def (SLP_TREE_VEC_DEFS (slp_node)[0]);
9915 : 21 : vec_steps.quick_push (vec_steps[0]);
9916 : : }
9917 : : }
9918 : :
9919 : : /* Re-use IVs when we can. We are generating further vector
9920 : : stmts by adding VF' * stride to the IVs generated above. */
9921 : 17068 : if (ivn < nvects)
9922 : : {
9923 : 4105 : if (nunits.is_constant (&const_nunits))
9924 : : {
9925 : 4105 : unsigned vfp = (least_common_multiple (group_size, const_nunits)
9926 : 4105 : / group_size);
9927 : 4105 : lupdate_mul
9928 : 4105 : = build_vector_from_val (step_vectype,
9929 : 4105 : SCALAR_FLOAT_TYPE_P (stept)
9930 : 8 : ? build_real_from_wide (stept,
9931 : 8 : vfp, UNSIGNED)
9932 : 8202 : : build_int_cstu (stept, vfp));
9933 : : }
9934 : : else
9935 : : {
9936 : : if (SCALAR_FLOAT_TYPE_P (stept))
9937 : : {
9938 : : tree tem = build_int_cst (integer_type_node, nunits);
9939 : : lupdate_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept, tem);
9940 : : }
9941 : : else
9942 : : lupdate_mul = build_int_cst (stept, nunits);
9943 : : lupdate_mul = gimple_build_vector_from_val (&init_stmts, step_vectype,
9944 : : lupdate_mul);
9945 : : }
9946 : 12854 : for (; ivn < nvects; ++ivn)
9947 : : {
9948 : 8749 : gimple *iv
9949 : 8749 : = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node)[ivn - nivs]);
9950 : 8749 : tree def = gimple_get_lhs (iv);
9951 : 8749 : if (ivn < 2*nivs)
9952 : 4197 : vec_steps[ivn - nivs]
9953 : 4197 : = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9954 : 4197 : vec_steps[ivn - nivs], lupdate_mul);
9955 : 8749 : gimple_seq stmts = NULL;
9956 : 8749 : def = gimple_convert (&stmts, step_vectype, def);
9957 : 26247 : def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
9958 : 8749 : def, vec_steps[ivn % nivs]);
9959 : 8749 : def = gimple_convert (&stmts, vectype, def);
9960 : 8749 : if (gimple_code (iv) == GIMPLE_PHI)
9961 : 4197 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9962 : : else
9963 : : {
9964 : 4552 : gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
9965 : 4552 : gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
9966 : : }
9967 : 8749 : slp_node->push_vec_def (def);
9968 : : }
9969 : : }
9970 : :
9971 : 17068 : new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
9972 : 17068 : gcc_assert (!new_bb);
9973 : :
9974 : 17068 : return true;
9975 : 17068 : }
9976 : :
9977 : : /* Function vectorizable_live_operation_1.
9978 : :
9979 : : helper function for vectorizable_live_operation. */
9980 : :
9981 : : static tree
9982 : 5258 : vectorizable_live_operation_1 (loop_vec_info loop_vinfo, basic_block exit_bb,
9983 : : tree vectype, slp_tree slp_node,
9984 : : tree bitsize, tree bitstart, tree vec_lhs,
9985 : : tree lhs_type, gimple_stmt_iterator *exit_gsi)
9986 : : {
9987 : 5258 : gcc_assert (single_pred_p (exit_bb) || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
9988 : :
9989 : 5258 : tree vec_lhs_phi = copy_ssa_name (vec_lhs);
9990 : 5258 : gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
9991 : 10909 : for (unsigned i = 0; i < gimple_phi_num_args (phi); i++)
9992 : 5651 : SET_PHI_ARG_DEF (phi, i, vec_lhs);
9993 : :
9994 : 5258 : gimple_seq stmts = NULL;
9995 : 5258 : tree new_tree;
9996 : :
9997 : : /* If bitstart is 0 then we can use a BIT_FIELD_REF */
9998 : 5258 : if (integer_zerop (bitstart))
9999 : : {
10000 : 2735 : tree scalar_res = gimple_build (&stmts, BIT_FIELD_REF, TREE_TYPE (vectype),
10001 : : vec_lhs_phi, bitsize, bitstart);
10002 : :
10003 : : /* Convert the extracted vector element to the scalar type. */
10004 : 2735 : new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10005 : : }
10006 : 2523 : else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10007 : : {
10008 : : /* Emit:
10009 : :
10010 : : SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN - (BIAS + 1)>
10011 : :
10012 : : where VEC_LHS is the vectorized live-out result, LEN is the length of
10013 : : the vector, BIAS is the load-store bias. The bias should not be used
10014 : : at all since we are not using load/store operations, but LEN will be
10015 : : REALLEN + BIAS, so subtract it to get to the correct position. */
10016 : 0 : gcc_assert (SLP_TREE_LANES (slp_node) == 1);
10017 : 0 : gimple_seq tem = NULL;
10018 : 0 : gimple_stmt_iterator gsi = gsi_last (tem);
10019 : 0 : tree len = vect_get_loop_len (loop_vinfo, &gsi,
10020 : : &LOOP_VINFO_LENS (loop_vinfo),
10021 : : 1, vectype, 0, 1);
10022 : 0 : gimple_seq_add_seq (&stmts, tem);
10023 : :
10024 : : /* BIAS + 1. */
10025 : 0 : signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
10026 : 0 : tree bias_plus_one
10027 : 0 : = int_const_binop (PLUS_EXPR,
10028 : 0 : build_int_cst (TREE_TYPE (len), biasval),
10029 : 0 : build_one_cst (TREE_TYPE (len)));
10030 : :
10031 : : /* LAST_INDEX = LEN - (BIAS + 1). */
10032 : 0 : tree last_index = gimple_build (&stmts, MINUS_EXPR, TREE_TYPE (len),
10033 : : len, bias_plus_one);
10034 : :
10035 : : /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN - (BIAS + 1)>. */
10036 : 0 : tree scalar_res
10037 : 0 : = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
10038 : : vec_lhs_phi, last_index);
10039 : :
10040 : : /* Convert the extracted vector element to the scalar type. */
10041 : 0 : new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10042 : : }
10043 : 2523 : else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
10044 : : {
10045 : : /* Emit:
10046 : :
10047 : : SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
10048 : :
10049 : : where VEC_LHS is the vectorized live-out result and MASK is
10050 : : the loop mask for the final iteration. */
10051 : 0 : gcc_assert (SLP_TREE_LANES (slp_node) == 1);
10052 : 0 : tree scalar_type = TREE_TYPE (vectype);
10053 : 0 : gimple_seq tem = NULL;
10054 : 0 : gimple_stmt_iterator gsi = gsi_last (tem);
10055 : 0 : tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
10056 : : &LOOP_VINFO_MASKS (loop_vinfo),
10057 : : 1, vectype, 0);
10058 : 0 : tree scalar_res;
10059 : 0 : gimple_seq_add_seq (&stmts, tem);
10060 : :
10061 : 0 : scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
10062 : : mask, vec_lhs_phi);
10063 : :
10064 : : /* Convert the extracted vector element to the scalar type. */
10065 : 0 : new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10066 : : }
10067 : : else
10068 : : {
10069 : 2523 : tree bftype = TREE_TYPE (vectype);
10070 : 2523 : if (VECTOR_BOOLEAN_TYPE_P (vectype))
10071 : 85 : bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10072 : 2523 : new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
10073 : 2523 : new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10074 : : &stmts, true, NULL_TREE);
10075 : : }
10076 : :
10077 : 5258 : *exit_gsi = gsi_after_labels (exit_bb);
10078 : 5258 : if (stmts)
10079 : 5258 : gsi_insert_seq_before (exit_gsi, stmts, GSI_SAME_STMT);
10080 : :
10081 : 5258 : return new_tree;
10082 : : }
10083 : :
10084 : : /* Function vectorizable_live_operation.
10085 : :
10086 : : STMT_INFO computes a value that is used outside the loop. Check if
10087 : : it can be supported. */
10088 : :
10089 : : bool
10090 : 277931 : vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
10091 : : slp_tree slp_node, slp_instance slp_node_instance,
10092 : : int slp_index, bool vec_stmt_p,
10093 : : stmt_vector_for_cost *cost_vec)
10094 : : {
10095 : 277931 : loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
10096 : 277931 : imm_use_iterator imm_iter;
10097 : 277931 : tree lhs, lhs_type, bitsize;
10098 : 277931 : tree vectype = SLP_TREE_VECTYPE (slp_node);
10099 : 277931 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10100 : 277931 : gimple *use_stmt;
10101 : 277931 : use_operand_p use_p;
10102 : 277931 : auto_vec<tree> vec_oprnds;
10103 : 277931 : int vec_entry = 0;
10104 : 277931 : poly_uint64 vec_index = 0;
10105 : :
10106 : 277931 : gcc_assert (STMT_VINFO_LIVE_P (stmt_info)
10107 : : || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10108 : :
10109 : : /* If a stmt of a reduction is live, vectorize it via
10110 : : vect_create_epilog_for_reduction. vectorizable_reduction assessed
10111 : : validity so just trigger the transform here. */
10112 : 277931 : if (vect_is_reduction (slp_node))
10113 : : {
10114 : 56728 : if (!vec_stmt_p)
10115 : : return true;
10116 : : /* For SLP reductions we vectorize the epilogue for all involved stmts
10117 : : together. For SLP reduction chains we only get here once. */
10118 : 23162 : if (SLP_INSTANCE_KIND (slp_node_instance) == slp_inst_kind_reduc_group
10119 : 22919 : && slp_index != 0)
10120 : : return true;
10121 : 22711 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
10122 : 22711 : if (VECT_REDUC_INFO_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
10123 : 22711 : || VECT_REDUC_INFO_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
10124 : : return true;
10125 : :
10126 : 21881 : if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
10127 : 21881 : || !LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
10128 : 21877 : vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
10129 : : slp_node_instance,
10130 : : LOOP_VINFO_IV_EXIT (loop_vinfo));
10131 : :
10132 : : /* If early break we only have to materialize the reduction on the merge
10133 : : block, but we have to find an alternate exit first. */
10134 : 21881 : if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
10135 : : {
10136 : 23 : slp_tree phis_node = slp_node_instance->reduc_phis;
10137 : 23 : stmt_info = SLP_TREE_REPRESENTATIVE (phis_node);
10138 : 69 : for (auto exit : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
10139 : 23 : if (exit != LOOP_VINFO_IV_EXIT (loop_vinfo))
10140 : : {
10141 : 23 : vect_create_epilog_for_reduction (loop_vinfo, stmt_info,
10142 : : phis_node, slp_node_instance,
10143 : : exit);
10144 : 23 : break;
10145 : 23 : }
10146 : 23 : if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
10147 : 4 : vect_create_epilog_for_reduction (loop_vinfo, stmt_info,
10148 : : phis_node, slp_node_instance,
10149 : : LOOP_VINFO_IV_EXIT (loop_vinfo));
10150 : : }
10151 : :
10152 : 21881 : return true;
10153 : : }
10154 : :
10155 : : /* If STMT is not relevant and it is a simple assignment and its inputs are
10156 : : invariant then it can remain in place, unvectorized. The original last
10157 : : scalar value that it computes will be used. */
10158 : 221203 : if (!STMT_VINFO_RELEVANT_P (stmt_info))
10159 : : {
10160 : 0 : gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
10161 : 0 : if (dump_enabled_p ())
10162 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
10163 : : "statement is simple and uses invariant. Leaving in "
10164 : : "place.\n");
10165 : 0 : return true;
10166 : : }
10167 : :
10168 : 221203 : gcc_assert (slp_index >= 0);
10169 : :
10170 : : /* Get the last occurrence of the scalar index from the concatenation of
10171 : : all the slp vectors. Calculate which slp vector it is and the index
10172 : : within. */
10173 : 221203 : int num_scalar = SLP_TREE_LANES (slp_node);
10174 : 221203 : int num_vec = vect_get_num_copies (vinfo, slp_node);
10175 : 221203 : poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
10176 : :
10177 : : /* Calculate which vector contains the result, and which lane of
10178 : : that vector we need. */
10179 : 221203 : if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
10180 : : {
10181 : : if (dump_enabled_p ())
10182 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10183 : : "Cannot determine which vector holds the"
10184 : : " final result.\n");
10185 : : return false;
10186 : : }
10187 : :
10188 : 221203 : if (!vec_stmt_p)
10189 : : {
10190 : : /* No transformation required. */
10191 : 179773 : if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
10192 : : {
10193 : 33117 : if (SLP_TREE_LANES (slp_node) != 1)
10194 : : {
10195 : 15 : if (dump_enabled_p ())
10196 : 15 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10197 : : "can't operate on partial vectors "
10198 : : "because an SLP statement is live after "
10199 : : "the loop.\n");
10200 : 15 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10201 : : }
10202 : 33102 : else if (num_vec > 1)
10203 : : {
10204 : 18109 : if (dump_enabled_p ())
10205 : 57 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10206 : : "can't operate on partial vectors "
10207 : : "because ncopies is greater than 1.\n");
10208 : 18109 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10209 : : }
10210 : : else
10211 : : {
10212 : 14993 : if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
10213 : : OPTIMIZE_FOR_SPEED))
10214 : 0 : vect_record_loop_mask (loop_vinfo,
10215 : : &LOOP_VINFO_MASKS (loop_vinfo),
10216 : : 1, vectype, NULL);
10217 : 14993 : else if (can_vec_extract_var_idx_p (
10218 : 14993 : TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
10219 : 0 : vect_record_loop_len (loop_vinfo,
10220 : : &LOOP_VINFO_LENS (loop_vinfo),
10221 : : 1, vectype, 1);
10222 : : else
10223 : : {
10224 : 14993 : if (dump_enabled_p ())
10225 : 840 : dump_printf_loc (
10226 : 840 : MSG_MISSED_OPTIMIZATION, vect_location,
10227 : : "can't operate on partial vectors "
10228 : : "because the target doesn't support extract "
10229 : : "last reduction.\n");
10230 : 14993 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10231 : : }
10232 : : }
10233 : : }
10234 : : /* ??? Enable for loop costing as well. */
10235 : 33117 : if (!loop_vinfo)
10236 : 89854 : record_stmt_cost (cost_vec, 1, vec_to_scalar, slp_node,
10237 : : 0, vect_epilogue);
10238 : 179773 : return true;
10239 : : }
10240 : :
10241 : : /* Use the lhs of the original scalar statement. */
10242 : 41430 : gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
10243 : 41430 : if (dump_enabled_p ())
10244 : 1474 : dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
10245 : : "stmt %G", stmt);
10246 : :
10247 : 41430 : lhs = gimple_get_lhs (stmt);
10248 : 41430 : lhs_type = TREE_TYPE (lhs);
10249 : :
10250 : 41430 : bitsize = vector_element_bits_tree (vectype);
10251 : :
10252 : : /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
10253 : 41430 : gcc_assert (!loop_vinfo
10254 : : || ((!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
10255 : : && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10256 : : || SLP_TREE_LANES (slp_node) == 1));
10257 : :
10258 : : /* Get the correct slp vectorized stmt. */
10259 : 41430 : tree vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
10260 : 41430 : gimple *vec_stmt = SSA_NAME_DEF_STMT (vec_lhs);
10261 : :
10262 : : /* In case we need to early break vectorize also get the first stmt. */
10263 : 41430 : tree vec_lhs0 = SLP_TREE_VEC_DEFS (slp_node)[0];
10264 : :
10265 : : /* Get entry to use. */
10266 : 41430 : tree bitstart = bitsize_int (vec_index);
10267 : 41430 : bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
10268 : :
10269 : 41430 : if (loop_vinfo)
10270 : : {
10271 : : /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
10272 : : requirement, insert one phi node for it. It looks like:
10273 : : loop;
10274 : : BB:
10275 : : # lhs' = PHI <lhs>
10276 : : ==>
10277 : : loop;
10278 : : BB:
10279 : : # vec_lhs' = PHI <vec_lhs>
10280 : : new_tree = lane_extract <vec_lhs', ...>;
10281 : : lhs' = new_tree; */
10282 : :
10283 : 5303 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10284 : : /* Check if we have a loop where the chosen exit is not the main exit,
10285 : : in these cases for an early break we restart the iteration the vector code
10286 : : did. For the live values we want the value at the start of the iteration
10287 : : rather than at the end. */
10288 : 5303 : edge main_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
10289 : 5303 : bool all_exits_as_early_p = LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo);
10290 : 27840 : FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10291 : 17234 : if (!is_gimple_debug (use_stmt)
10292 : 17234 : && !flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
10293 : 5258 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10294 : : {
10295 : 5258 : edge e = gimple_phi_arg_edge (as_a <gphi *> (use_stmt),
10296 : 5258 : phi_arg_index_from_use (use_p));
10297 : 5258 : gcc_assert (loop_exit_edge_p (loop, e));
10298 : 5258 : bool main_exit_edge = e == main_e;
10299 : 5258 : tree tmp_vec_lhs = vec_lhs;
10300 : 5258 : tree tmp_bitstart = bitstart;
10301 : :
10302 : : /* For early exit where the exit is not in the BB that leads
10303 : : to the latch then we're restarting the iteration in the
10304 : : scalar loop. So get the first live value. */
10305 : 13233 : bool early_break_first_element_p
10306 : 5258 : = (all_exits_as_early_p || !main_exit_edge)
10307 : 5258 : && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def;
10308 : 2717 : if (early_break_first_element_p)
10309 : : {
10310 : 2717 : tmp_vec_lhs = vec_lhs0;
10311 : 2717 : tmp_bitstart = build_zero_cst (TREE_TYPE (bitstart));
10312 : : }
10313 : :
10314 : 5258 : gimple_stmt_iterator exit_gsi;
10315 : 5258 : tree new_tree
10316 : 5258 : = vectorizable_live_operation_1 (loop_vinfo,
10317 : : e->dest, vectype,
10318 : : slp_node, bitsize,
10319 : : tmp_bitstart, tmp_vec_lhs,
10320 : : lhs_type, &exit_gsi);
10321 : :
10322 : 5258 : auto gsi = gsi_for_stmt (use_stmt);
10323 : 5258 : if (early_break_first_element_p
10324 : 2717 : && LOOP_VINFO_MASK_NITERS_PFA_OFFSET (loop_vinfo))
10325 : : {
10326 : 0 : tree step_expr
10327 : : = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
10328 : 0 : tree break_lhs_phi
10329 : : = LOOP_VINFO_MASK_NITERS_PFA_OFFSET (loop_vinfo);
10330 : 0 : tree ty_skip_niters = TREE_TYPE (break_lhs_phi);
10331 : 0 : gimple_seq iv_stmts = NULL;
10332 : :
10333 : : /* Now create the PHI for the outside loop usage to
10334 : : retrieve the value for the offset counter. */
10335 : 0 : tree rphi_step
10336 : 0 : = gimple_convert (&iv_stmts, ty_skip_niters, step_expr);
10337 : 0 : tree tmp2
10338 : 0 : = gimple_build (&iv_stmts, MULT_EXPR,
10339 : : ty_skip_niters, rphi_step,
10340 : : break_lhs_phi);
10341 : :
10342 : 0 : if (POINTER_TYPE_P (TREE_TYPE (new_tree)))
10343 : : {
10344 : 0 : tmp2 = gimple_convert (&iv_stmts, sizetype, tmp2);
10345 : 0 : tmp2 = gimple_build (&iv_stmts, POINTER_PLUS_EXPR,
10346 : 0 : TREE_TYPE (new_tree), new_tree,
10347 : : tmp2);
10348 : : }
10349 : : else
10350 : : {
10351 : 0 : tmp2 = gimple_convert (&iv_stmts, TREE_TYPE (new_tree),
10352 : : tmp2);
10353 : 0 : tmp2 = gimple_build (&iv_stmts, PLUS_EXPR,
10354 : 0 : TREE_TYPE (new_tree), new_tree,
10355 : : tmp2);
10356 : : }
10357 : :
10358 : 0 : new_tree = tmp2;
10359 : 0 : gsi_insert_seq_before (&exit_gsi, iv_stmts, GSI_SAME_STMT);
10360 : : }
10361 : :
10362 : 5258 : tree lhs_phi = gimple_phi_result (use_stmt);
10363 : 5258 : remove_phi_node (&gsi, false);
10364 : 5258 : gimple *copy = gimple_build_assign (lhs_phi, new_tree);
10365 : 5258 : gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
10366 : 5258 : break;
10367 : 5303 : }
10368 : :
10369 : : /* There a no further out-of-loop uses of lhs by LC-SSA construction. */
10370 : 22582 : FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10371 : 11976 : gcc_assert (is_gimple_debug (use_stmt)
10372 : 5303 : || flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)));
10373 : : }
10374 : : else
10375 : : {
10376 : : /* For basic-block vectorization simply insert the lane-extraction. */
10377 : 36127 : tree bftype = TREE_TYPE (vectype);
10378 : 36127 : if (VECTOR_BOOLEAN_TYPE_P (vectype))
10379 : 0 : bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10380 : 36127 : tree new_tree = build3 (BIT_FIELD_REF, bftype,
10381 : : vec_lhs, bitsize, bitstart);
10382 : 36127 : gimple_seq stmts = NULL;
10383 : 36127 : new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10384 : : &stmts, true, NULL_TREE);
10385 : 36127 : if (TREE_CODE (new_tree) == SSA_NAME
10386 : 72254 : && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
10387 : 2 : SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
10388 : 36127 : if (is_a <gphi *> (vec_stmt))
10389 : : {
10390 : 2693 : gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
10391 : 2693 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10392 : : }
10393 : : else
10394 : : {
10395 : 33434 : gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
10396 : 33434 : gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
10397 : : }
10398 : :
10399 : : /* Replace use of lhs with newly computed result. If the use stmt is a
10400 : : single arg PHI, just replace all uses of PHI result. It's necessary
10401 : : because lcssa PHI defining lhs may be before newly inserted stmt. */
10402 : 36127 : use_operand_p use_p;
10403 : 36127 : stmt_vec_info use_stmt_info;
10404 : 234925 : FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10405 : 162671 : if (!is_gimple_debug (use_stmt)
10406 : 162671 : && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
10407 : 108520 : || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
10408 : : {
10409 : : /* ??? This can happen when the live lane ends up being
10410 : : rooted in a vector construction code-generated by an
10411 : : external SLP node (and code-generation for that already
10412 : : happened). See gcc.dg/vect/bb-slp-47.c.
10413 : : Doing this is what would happen if that vector CTOR
10414 : : were not code-generated yet so it is not too bad.
10415 : : ??? In fact we'd likely want to avoid this situation
10416 : : in the first place. */
10417 : 63182 : if (TREE_CODE (new_tree) == SSA_NAME
10418 : 62918 : && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10419 : 62918 : && gimple_code (use_stmt) != GIMPLE_PHI
10420 : 118303 : && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
10421 : : use_stmt))
10422 : : {
10423 : 264 : if (dump_enabled_p ())
10424 : 20 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10425 : : "Using original scalar computation for "
10426 : : "live lane because use preceeds vector "
10427 : : "def\n");
10428 : 264 : continue;
10429 : : }
10430 : : /* ??? It can also happen that we end up pulling a def into
10431 : : a loop where replacing out-of-loop uses would require
10432 : : a new LC SSA PHI node. Retain the original scalar in
10433 : : those cases as well. PR98064. */
10434 : 64077 : if (TREE_CODE (new_tree) == SSA_NAME
10435 : 62654 : && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10436 : 62654 : && (gimple_bb (use_stmt)->loop_father
10437 : 62654 : != gimple_bb (vec_stmt)->loop_father)
10438 : 69529 : && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
10439 : 6875 : gimple_bb (use_stmt)->loop_father))
10440 : : {
10441 : 1423 : if (dump_enabled_p ())
10442 : 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10443 : : "Using original scalar computation for "
10444 : : "live lane because there is an out-of-loop "
10445 : : "definition for it\n");
10446 : 1423 : continue;
10447 : : }
10448 : 188287 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10449 : 63528 : SET_USE (use_p, new_tree);
10450 : 61231 : update_stmt (use_stmt);
10451 : 36127 : }
10452 : : }
10453 : :
10454 : : return true;
10455 : 277931 : }
10456 : :
10457 : : /* Given loop represented by LOOP_VINFO, return true if computation of
10458 : : LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
10459 : : otherwise. */
10460 : :
10461 : : static bool
10462 : 60606 : loop_niters_no_overflow (loop_vec_info loop_vinfo)
10463 : : {
10464 : : /* Constant case. */
10465 : 60606 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
10466 : : {
10467 : 35470 : tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
10468 : 35470 : tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
10469 : :
10470 : 35470 : gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
10471 : 35470 : gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
10472 : 35470 : if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
10473 : : return true;
10474 : : }
10475 : :
10476 : 25136 : widest_int max;
10477 : 25136 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10478 : : /* Check the upper bound of loop niters. */
10479 : 25136 : if (get_max_loop_iterations (loop, &max))
10480 : : {
10481 : 25136 : tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
10482 : 25136 : signop sgn = TYPE_SIGN (type);
10483 : 25136 : widest_int type_max = widest_int::from (wi::max_value (type), sgn);
10484 : 25136 : if (max < type_max)
10485 : 24915 : return true;
10486 : 25136 : }
10487 : : return false;
10488 : 25136 : }
10489 : :
10490 : : /* Return a mask type with half the number of elements as OLD_TYPE,
10491 : : given that it should have mode NEW_MODE. */
10492 : :
10493 : : tree
10494 : 3920 : vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
10495 : : {
10496 : 3920 : poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
10497 : 3920 : return build_truth_vector_type_for_mode (nunits, new_mode);
10498 : : }
10499 : :
10500 : : /* Return a mask type with twice as many elements as OLD_TYPE,
10501 : : given that it should have mode NEW_MODE. */
10502 : :
10503 : : tree
10504 : 5911 : vect_double_mask_nunits (tree old_type, machine_mode new_mode)
10505 : : {
10506 : 5911 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
10507 : 5911 : return build_truth_vector_type_for_mode (nunits, new_mode);
10508 : : }
10509 : :
10510 : : /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
10511 : : contain a sequence of NVECTORS masks that each control a vector of type
10512 : : VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
10513 : : these vector masks with the vector version of SCALAR_MASK. */
10514 : :
10515 : : void
10516 : 77650 : vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
10517 : : unsigned int nvectors, tree vectype, tree scalar_mask)
10518 : : {
10519 : 77650 : gcc_assert (nvectors != 0);
10520 : :
10521 : 77650 : if (scalar_mask)
10522 : : {
10523 : 3562 : scalar_cond_masked_key cond (scalar_mask, nvectors);
10524 : 3562 : loop_vinfo->scalar_cond_masked_set.add (cond);
10525 : : }
10526 : :
10527 : 77650 : masks->mask_set.add (std::make_pair (vectype, nvectors));
10528 : 77650 : }
10529 : :
10530 : : /* Given a complete set of masks MASKS, extract mask number INDEX
10531 : : for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10532 : : where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
10533 : :
10534 : : See the comment above vec_loop_masks for more details about the mask
10535 : : arrangement. */
10536 : :
10537 : : tree
10538 : 203 : vect_get_loop_mask (loop_vec_info loop_vinfo,
10539 : : gimple_stmt_iterator *gsi, vec_loop_masks *masks,
10540 : : unsigned int nvectors, tree vectype, unsigned int index)
10541 : : {
10542 : 203 : if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
10543 : : == vect_partial_vectors_while_ult)
10544 : : {
10545 : 0 : rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
10546 : 0 : tree mask_type = rgm->type;
10547 : :
10548 : : /* Populate the rgroup's mask array, if this is the first time we've
10549 : : used it. */
10550 : 0 : if (rgm->controls.is_empty ())
10551 : : {
10552 : 0 : rgm->controls.safe_grow_cleared (nvectors, true);
10553 : 0 : for (unsigned int i = 0; i < nvectors; ++i)
10554 : : {
10555 : 0 : tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
10556 : : /* Provide a dummy definition until the real one is available. */
10557 : 0 : SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
10558 : 0 : rgm->controls[i] = mask;
10559 : : }
10560 : : }
10561 : :
10562 : 0 : tree mask = rgm->controls[index];
10563 : 0 : if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
10564 : 0 : TYPE_VECTOR_SUBPARTS (vectype)))
10565 : : {
10566 : : /* A loop mask for data type X can be reused for data type Y
10567 : : if X has N times more elements than Y and if Y's elements
10568 : : are N times bigger than X's. In this case each sequence
10569 : : of N elements in the loop mask will be all-zero or all-one.
10570 : : We can then view-convert the mask so that each sequence of
10571 : : N elements is replaced by a single element. */
10572 : 0 : gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
10573 : : TYPE_VECTOR_SUBPARTS (vectype)));
10574 : 0 : gimple_seq seq = NULL;
10575 : 0 : mask_type = truth_type_for (vectype);
10576 : 0 : mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
10577 : 0 : if (seq)
10578 : 0 : gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10579 : : }
10580 : 0 : return mask;
10581 : : }
10582 : 203 : else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
10583 : : == vect_partial_vectors_avx512)
10584 : : {
10585 : : /* The number of scalars per iteration and the number of vectors are
10586 : : both compile-time constants. */
10587 : 203 : unsigned int nscalars_per_iter
10588 : 203 : = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10589 : 203 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10590 : :
10591 : 203 : rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];
10592 : :
10593 : : /* The stored nV is dependent on the mask type produced. */
10594 : 203 : gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10595 : : TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
10596 : : == rgm->factor);
10597 : 203 : nvectors = rgm->factor;
10598 : :
10599 : : /* Populate the rgroup's mask array, if this is the first time we've
10600 : : used it. */
10601 : 203 : if (rgm->controls.is_empty ())
10602 : : {
10603 : 19 : rgm->controls.safe_grow_cleared (nvectors, true);
10604 : 104 : for (unsigned int i = 0; i < nvectors; ++i)
10605 : : {
10606 : 85 : tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
10607 : : /* Provide a dummy definition until the real one is available. */
10608 : 85 : SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
10609 : 85 : rgm->controls[i] = mask;
10610 : : }
10611 : : }
10612 : 203 : if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
10613 : : TYPE_VECTOR_SUBPARTS (vectype)))
10614 : 155 : return rgm->controls[index];
10615 : :
10616 : : /* Split the vector if needed. Since we are dealing with integer mode
10617 : : masks with AVX512 we can operate on the integer representation
10618 : : performing the whole vector shifting. */
10619 : 48 : unsigned HOST_WIDE_INT factor;
10620 : 48 : bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
10621 : 48 : TYPE_VECTOR_SUBPARTS (vectype), &factor);
10622 : 0 : gcc_assert (ok);
10623 : 48 : gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
10624 : 48 : tree mask_type = truth_type_for (vectype);
10625 : 48 : gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
10626 : 48 : unsigned vi = index / factor;
10627 : 48 : unsigned vpart = index % factor;
10628 : 48 : tree vec = rgm->controls[vi];
10629 : 48 : gimple_seq seq = NULL;
10630 : 48 : vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
10631 : 48 : lang_hooks.types.type_for_mode
10632 : 48 : (TYPE_MODE (rgm->type), 1), vec);
10633 : : /* For integer mode masks simply shift the right bits into position. */
10634 : 48 : if (vpart != 0)
10635 : 40 : vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
10636 : : build_int_cst (integer_type_node,
10637 : 80 : (TYPE_VECTOR_SUBPARTS (vectype)
10638 : 40 : * vpart)));
10639 : 48 : vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
10640 : 48 : (TYPE_MODE (mask_type), 1), vec);
10641 : 48 : vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
10642 : 48 : if (seq)
10643 : 48 : gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10644 : 48 : return vec;
10645 : : }
10646 : : else
10647 : 0 : gcc_unreachable ();
10648 : : }
10649 : :
10650 : : /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
10651 : : lengths for controlling an operation on VECTYPE. The operation splits
10652 : : each element of VECTYPE into FACTOR separate subelements, measuring the
10653 : : length as a number of these subelements. */
10654 : :
10655 : : void
10656 : 0 : vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
10657 : : unsigned int nvectors, tree vectype, unsigned int factor)
10658 : : {
10659 : 0 : gcc_assert (nvectors != 0);
10660 : 0 : if (lens->length () < nvectors)
10661 : 0 : lens->safe_grow_cleared (nvectors, true);
10662 : 0 : rgroup_controls *rgl = &(*lens)[nvectors - 1];
10663 : :
10664 : : /* The number of scalars per iteration, scalar occupied bytes and
10665 : : the number of vectors are both compile-time constants. */
10666 : 0 : unsigned int nscalars_per_iter
10667 : 0 : = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10668 : 0 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10669 : :
10670 : 0 : if (rgl->max_nscalars_per_iter < nscalars_per_iter)
10671 : : {
10672 : : /* For now, we only support cases in which all loads and stores fall back
10673 : : to VnQI or none do. */
10674 : 0 : gcc_assert (!rgl->max_nscalars_per_iter
10675 : : || (rgl->factor == 1 && factor == 1)
10676 : : || (rgl->max_nscalars_per_iter * rgl->factor
10677 : : == nscalars_per_iter * factor));
10678 : 0 : rgl->max_nscalars_per_iter = nscalars_per_iter;
10679 : 0 : rgl->type = vectype;
10680 : 0 : rgl->factor = factor;
10681 : : }
10682 : 0 : }
10683 : :
10684 : : /* Given a complete set of lengths LENS, extract length number INDEX
10685 : : for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10686 : : where 0 <= INDEX < NVECTORS. Return a value that contains FACTOR
10687 : : multipled by the number of elements that should be processed.
10688 : : Insert any set-up statements before GSI. */
10689 : :
10690 : : tree
10691 : 0 : vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
10692 : : vec_loop_lens *lens, unsigned int nvectors, tree vectype,
10693 : : unsigned int index, unsigned int factor)
10694 : : {
10695 : 0 : rgroup_controls *rgl = &(*lens)[nvectors - 1];
10696 : 0 : bool use_bias_adjusted_len =
10697 : 0 : LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
10698 : :
10699 : : /* Populate the rgroup's len array, if this is the first time we've
10700 : : used it. */
10701 : 0 : if (rgl->controls.is_empty ())
10702 : : {
10703 : 0 : rgl->controls.safe_grow_cleared (nvectors, true);
10704 : 0 : for (unsigned int i = 0; i < nvectors; ++i)
10705 : : {
10706 : 0 : tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
10707 : 0 : gcc_assert (len_type != NULL_TREE);
10708 : :
10709 : 0 : tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
10710 : :
10711 : : /* Provide a dummy definition until the real one is available. */
10712 : 0 : SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
10713 : 0 : rgl->controls[i] = len;
10714 : :
10715 : 0 : if (use_bias_adjusted_len)
10716 : : {
10717 : 0 : gcc_assert (i == 0);
10718 : 0 : tree adjusted_len =
10719 : 0 : make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
10720 : 0 : SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
10721 : 0 : rgl->bias_adjusted_ctrl = adjusted_len;
10722 : : }
10723 : : }
10724 : : }
10725 : :
10726 : 0 : if (use_bias_adjusted_len)
10727 : 0 : return rgl->bias_adjusted_ctrl;
10728 : :
10729 : 0 : tree loop_len = rgl->controls[index];
10730 : 0 : if (rgl->factor == 1 && factor == 1)
10731 : : {
10732 : 0 : poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
10733 : 0 : poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
10734 : 0 : if (maybe_ne (nunits1, nunits2))
10735 : : {
10736 : : /* A loop len for data type X can be reused for data type Y
10737 : : if X has N times more elements than Y and if Y's elements
10738 : : are N times bigger than X's. */
10739 : 0 : gcc_assert (multiple_p (nunits1, nunits2));
10740 : 0 : factor = exact_div (nunits1, nunits2).to_constant ();
10741 : 0 : tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
10742 : 0 : gimple_seq seq = NULL;
10743 : 0 : loop_len = gimple_build (&seq, EXACT_DIV_EXPR, iv_type, loop_len,
10744 : 0 : build_int_cst (iv_type, factor));
10745 : 0 : if (seq)
10746 : 0 : gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10747 : : }
10748 : : }
10749 : : return loop_len;
10750 : : }
10751 : :
10752 : : /* Generate the tree for the loop len mask and return it. Given the lens,
10753 : : nvectors, vectype, index and factor to gen the len mask as below.
10754 : :
10755 : : tree len_mask = VCOND_MASK_LEN (compare_mask, ones, zero, len, bias)
10756 : : */
10757 : : tree
10758 : 0 : vect_gen_loop_len_mask (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
10759 : : gimple_stmt_iterator *cond_gsi, vec_loop_lens *lens,
10760 : : unsigned int nvectors, tree vectype, tree stmt,
10761 : : unsigned int index, unsigned int factor)
10762 : : {
10763 : 0 : tree all_one_mask = build_all_ones_cst (vectype);
10764 : 0 : tree all_zero_mask = build_zero_cst (vectype);
10765 : 0 : tree len = vect_get_loop_len (loop_vinfo, gsi, lens, nvectors, vectype, index,
10766 : : factor);
10767 : 0 : tree bias = build_int_cst (intQI_type_node,
10768 : 0 : LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo));
10769 : 0 : tree len_mask = make_temp_ssa_name (TREE_TYPE (stmt), NULL, "vec_len_mask");
10770 : 0 : gcall *call = gimple_build_call_internal (IFN_VCOND_MASK_LEN, 5, stmt,
10771 : : all_one_mask, all_zero_mask, len,
10772 : : bias);
10773 : 0 : gimple_call_set_lhs (call, len_mask);
10774 : 0 : gsi_insert_before (cond_gsi, call, GSI_SAME_STMT);
10775 : :
10776 : 0 : return len_mask;
10777 : : }
10778 : :
10779 : : /* Scale profiling counters by estimation for LOOP which is vectorized
10780 : : by factor VF.
10781 : : If FLAT is true, the loop we started with had unrealistically flat
10782 : : profile. */
10783 : :
10784 : : static void
10785 : 60606 : scale_profile_for_vect_loop (class loop *loop, edge exit_e, unsigned vf, bool flat)
10786 : : {
10787 : : /* For flat profiles do not scale down proportionally by VF and only
10788 : : cap by known iteration count bounds. */
10789 : 60606 : if (flat)
10790 : : {
10791 : 33852 : if (dump_file && (dump_flags & TDF_DETAILS))
10792 : 5044 : fprintf (dump_file,
10793 : : "Vectorized loop profile seems flat; not scaling iteration "
10794 : : "count down by the vectorization factor %i\n", vf);
10795 : 33852 : scale_loop_profile (loop, profile_probability::always (),
10796 : : get_likely_max_loop_iterations_int (loop));
10797 : 33852 : return;
10798 : : }
10799 : : /* Loop body executes VF fewer times and exit increases VF times. */
10800 : 26754 : profile_count entry_count = loop_preheader_edge (loop)->count ();
10801 : :
10802 : : /* If we have unreliable loop profile avoid dropping entry
10803 : : count below header count. This can happen since loops
10804 : : has unrealistically low trip counts. */
10805 : 26754 : while (vf > 1
10806 : 28001 : && loop->header->count > entry_count
10807 : 56891 : && loop->header->count < entry_count * vf)
10808 : : {
10809 : 2136 : if (dump_file && (dump_flags & TDF_DETAILS))
10810 : 149 : fprintf (dump_file,
10811 : : "Vectorization factor %i seems too large for profile "
10812 : : "prevoiusly believed to be consistent; reducing.\n", vf);
10813 : 2136 : vf /= 2;
10814 : : }
10815 : :
10816 : 26754 : if (entry_count.nonzero_p ())
10817 : 26754 : set_edge_probability_and_rescale_others
10818 : 26754 : (exit_e,
10819 : 26754 : entry_count.probability_in (loop->header->count / vf));
10820 : : /* Avoid producing very large exit probability when we do not have
10821 : : sensible profile. */
10822 : 0 : else if (exit_e->probability < profile_probability::always () / (vf * 2))
10823 : 0 : set_edge_probability_and_rescale_others (exit_e, exit_e->probability * vf);
10824 : 26754 : loop->latch->count = single_pred_edge (loop->latch)->count ();
10825 : :
10826 : 26754 : scale_loop_profile (loop, profile_probability::always () / vf,
10827 : : get_likely_max_loop_iterations_int (loop));
10828 : : }
10829 : :
10830 : : /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
10831 : : original loop that has now been vectorized.
10832 : :
10833 : : The inits of the data_references need to be advanced with the number of
10834 : : iterations of the main loop. This has been computed in vect_do_peeling and
10835 : : is stored in parameter ADVANCE.
10836 : :
10837 : : Since the loop_vec_info of this EPILOGUE was constructed for the original
10838 : : loop, its stmt_vec_infos all point to the original statements. These need
10839 : : to be updated to point to their corresponding copies.
10840 : :
10841 : : The data_reference's connections also need to be updated. Their
10842 : : corresponding dr_vec_info need to be reconnected to the EPILOGUE's
10843 : : stmt_vec_infos, their statements need to point to their corresponding
10844 : : copy. */
10845 : :
10846 : : static void
10847 : 6851 : update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
10848 : : {
10849 : 6851 : loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
10850 : 6851 : hash_map<tree,tree> mapping;
10851 : 6851 : gimple *orig_stmt, *new_stmt;
10852 : 6851 : gimple_stmt_iterator epilogue_gsi;
10853 : 6851 : gphi_iterator epilogue_phi_gsi;
10854 : 6851 : stmt_vec_info stmt_vinfo = NULL, related_vinfo;
10855 : 6851 : basic_block *epilogue_bbs = get_loop_body (epilogue);
10856 : 6851 : unsigned i;
10857 : :
10858 : 6851 : free (LOOP_VINFO_BBS (epilogue_vinfo));
10859 : 6851 : LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
10860 : 6851 : LOOP_VINFO_NBBS (epilogue_vinfo) = epilogue->num_nodes;
10861 : :
10862 : : /* The EPILOGUE loop is a copy of the original loop so they share the same
10863 : : gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
10864 : : point to the copied statements. */
10865 : 20553 : for (unsigned i = 0; i < epilogue->num_nodes; ++i)
10866 : : {
10867 : 13702 : for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
10868 : 35310 : !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
10869 : : {
10870 : 21608 : new_stmt = epilogue_phi_gsi.phi ();
10871 : :
10872 : 21608 : gcc_assert (gimple_uid (new_stmt) > 0);
10873 : 21608 : stmt_vinfo
10874 : 21608 : = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
10875 : :
10876 : 21608 : STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
10877 : : }
10878 : :
10879 : 27404 : for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
10880 : 136225 : !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
10881 : : {
10882 : 122523 : new_stmt = gsi_stmt (epilogue_gsi);
10883 : 122523 : if (is_gimple_debug (new_stmt))
10884 : 21865 : continue;
10885 : :
10886 : 100658 : gcc_assert (gimple_uid (new_stmt) > 0);
10887 : 100658 : stmt_vinfo
10888 : 100658 : = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
10889 : :
10890 : 100658 : STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
10891 : :
10892 : 100658 : related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
10893 : 100658 : if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
10894 : : {
10895 : 1854 : gimple *stmt = STMT_VINFO_STMT (related_vinfo);
10896 : : /* Set BB such that the assert in
10897 : : 'get_initial_defs_for_reduction' is able to determine that
10898 : : the BB of the related stmt is inside this loop. */
10899 : 1854 : gimple_set_bb (stmt,
10900 : : gimple_bb (new_stmt));
10901 : 1854 : related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
10902 : 1854 : gcc_assert (related_vinfo == NULL
10903 : : || related_vinfo == stmt_vinfo);
10904 : : }
10905 : : }
10906 : : }
10907 : :
10908 : 6851 : struct data_reference *dr;
10909 : 6851 : vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
10910 : 29134 : FOR_EACH_VEC_ELT (datarefs, i, dr)
10911 : : {
10912 : 22283 : orig_stmt = DR_STMT (dr);
10913 : 22283 : gcc_assert (gimple_uid (orig_stmt) > 0);
10914 : 22283 : stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
10915 : 22283 : DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
10916 : : }
10917 : :
10918 : : /* Advance data_reference's with the number of iterations of the previous
10919 : : loop and its prologue. */
10920 : 6851 : vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
10921 : :
10922 : : /* Remember the advancement made. */
10923 : 6851 : LOOP_VINFO_DRS_ADVANCED_BY (epilogue_vinfo) = advance;
10924 : 6851 : }
10925 : :
10926 : : /* When vectorizing early break statements instructions that happen before
10927 : : the early break in the current BB need to be moved to after the early
10928 : : break. This function deals with that and assumes that any validity
10929 : : checks has already been performed.
10930 : :
10931 : : While moving the instructions if it encounters a VUSE or VDEF it then
10932 : : corrects the VUSES as it moves the statements along. GDEST is the location
10933 : : in which to insert the new statements. */
10934 : :
10935 : : static void
10936 : 1436 : move_early_exit_stmts (loop_vec_info loop_vinfo)
10937 : : {
10938 : 1436 : DUMP_VECT_SCOPE ("move_early_exit_stmts");
10939 : :
10940 : 1436 : if (LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).is_empty ())
10941 : 1230 : return;
10942 : :
10943 : : /* Move all stmts that need moving. */
10944 : 206 : basic_block dest_bb = LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo);
10945 : 206 : gimple_stmt_iterator dest_gsi = gsi_after_labels (dest_bb);
10946 : :
10947 : 206 : tree last_seen_vuse = NULL_TREE;
10948 : 511 : for (gimple *stmt : LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo))
10949 : : {
10950 : : /* We have to update crossed degenerate virtual PHIs. Simply
10951 : : elide them. */
10952 : 305 : if (gphi *vphi = dyn_cast <gphi *> (stmt))
10953 : : {
10954 : 7 : tree vdef = gimple_phi_result (vphi);
10955 : 7 : tree vuse = gimple_phi_arg_def (vphi, 0);
10956 : 7 : imm_use_iterator iter;
10957 : 7 : use_operand_p use_p;
10958 : 7 : gimple *use_stmt;
10959 : 30 : FOR_EACH_IMM_USE_STMT (use_stmt, iter, vdef)
10960 : : {
10961 : 48 : FOR_EACH_IMM_USE_ON_STMT (use_p, iter)
10962 : 16 : SET_USE (use_p, vuse);
10963 : 7 : }
10964 : 7 : auto gsi = gsi_for_stmt (stmt);
10965 : 7 : remove_phi_node (&gsi, true);
10966 : 7 : last_seen_vuse = vuse;
10967 : 7 : continue;
10968 : 7 : }
10969 : :
10970 : : /* Check to see if statement is still required for vect or has been
10971 : : elided. */
10972 : 298 : auto stmt_info = loop_vinfo->lookup_stmt (stmt);
10973 : 298 : if (!stmt_info)
10974 : 0 : continue;
10975 : :
10976 : 298 : if (dump_enabled_p ())
10977 : 147 : dump_printf_loc (MSG_NOTE, vect_location, "moving stmt %G", stmt);
10978 : :
10979 : 298 : gimple_stmt_iterator stmt_gsi = gsi_for_stmt (stmt);
10980 : 298 : gsi_move_before (&stmt_gsi, &dest_gsi, GSI_NEW_STMT);
10981 : 596 : last_seen_vuse = gimple_vuse (stmt);
10982 : : }
10983 : :
10984 : : /* Update all the stmts with their new reaching VUSES. */
10985 : 630 : for (auto p : LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo))
10986 : : {
10987 : 178 : if (dump_enabled_p ())
10988 : 142 : dump_printf_loc (MSG_NOTE, vect_location,
10989 : : "updating vuse to %T for load %G",
10990 : : last_seen_vuse, p);
10991 : 178 : gimple_set_vuse (p, last_seen_vuse);
10992 : 178 : update_stmt (p);
10993 : : }
10994 : :
10995 : : /* And update the LC PHIs on exits. */
10996 : 1036 : for (edge e : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
10997 : 418 : if (!dominated_by_p (CDI_DOMINATORS, e->src, dest_bb))
10998 : 220 : if (gphi *phi = get_virtual_phi (e->dest))
10999 : 426 : SET_PHI_ARG_DEF_ON_EDGE (phi, e, last_seen_vuse);
11000 : : }
11001 : :
11002 : : /* Function vect_transform_loop.
11003 : :
11004 : : The analysis phase has determined that the loop is vectorizable.
11005 : : Vectorize the loop - created vectorized stmts to replace the scalar
11006 : : stmts in the loop, and update the loop exit condition.
11007 : : Returns scalar epilogue loop if any. */
11008 : :
11009 : : class loop *
11010 : 60606 : vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
11011 : : {
11012 : 60606 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11013 : 60606 : class loop *epilogue = NULL;
11014 : 60606 : basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
11015 : 60606 : int nbbs = loop->num_nodes;
11016 : 60606 : int i;
11017 : 60606 : tree niters_vector = NULL_TREE;
11018 : 60606 : tree step_vector = NULL_TREE;
11019 : 60606 : tree niters_vector_mult_vf = NULL_TREE;
11020 : 60606 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11021 : 60606 : unsigned int lowest_vf = constant_lower_bound (vf);
11022 : 60606 : gimple *stmt;
11023 : 60606 : bool check_profitability = false;
11024 : 60606 : unsigned int th;
11025 : 60606 : bool flat = maybe_flat_loop_profile (loop);
11026 : :
11027 : 60606 : DUMP_VECT_SCOPE ("vec_transform_loop");
11028 : :
11029 : 60606 : if (! LOOP_VINFO_EPILOGUE_P (loop_vinfo))
11030 : 53755 : loop_vinfo->shared->check_datarefs ();
11031 : :
11032 : : /* Use the more conservative vectorization threshold. If the number
11033 : : of iterations is constant assume the cost check has been performed
11034 : : by our caller. If the threshold makes all loops profitable that
11035 : : run at least the (estimated) vectorization factor number of times
11036 : : checking is pointless, too. */
11037 : 60606 : th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
11038 : 60606 : if (vect_apply_runtime_profitability_check_p (loop_vinfo))
11039 : : {
11040 : 18203 : if (dump_enabled_p ())
11041 : 172 : dump_printf_loc (MSG_NOTE, vect_location,
11042 : : "Profitability threshold is %d loop iterations.\n",
11043 : : th);
11044 : : check_profitability = true;
11045 : : }
11046 : :
11047 : : /* Make sure there exists a single-predecessor exit bb. Do this before
11048 : : versioning. */
11049 : 60606 : edge e = LOOP_VINFO_IV_EXIT (loop_vinfo);
11050 : 60606 : if (! single_pred_p (e->dest) && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11051 : : {
11052 : 18674 : split_loop_exit_edge (e, true);
11053 : 18674 : if (dump_enabled_p ())
11054 : 2219 : dump_printf (MSG_NOTE, "split exit edge\n");
11055 : : }
11056 : :
11057 : : /* Version the loop first, if required, so the profitability check
11058 : : comes first. */
11059 : :
11060 : 60606 : if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
11061 : : {
11062 : 3686 : class loop *sloop
11063 : 3686 : = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
11064 : 3686 : sloop->force_vectorize = false;
11065 : 3686 : check_profitability = false;
11066 : : }
11067 : :
11068 : : /* Make sure there exists a single-predecessor exit bb also on the
11069 : : scalar loop copy. Do this after versioning but before peeling
11070 : : so CFG structure is fine for both scalar and if-converted loop
11071 : : to make slpeel_duplicate_current_defs_from_edges face matched
11072 : : loop closed PHI nodes on the exit. */
11073 : 60606 : if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
11074 : : {
11075 : 7986 : e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
11076 : 7986 : if (! single_pred_p (e->dest))
11077 : : {
11078 : 7723 : split_loop_exit_edge (e, true);
11079 : 7723 : if (dump_enabled_p ())
11080 : 1124 : dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
11081 : : }
11082 : : }
11083 : :
11084 : 60606 : tree niters = vect_build_loop_niters (loop_vinfo);
11085 : 60606 : LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
11086 : 60606 : tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
11087 : 60606 : bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
11088 : 60606 : tree advance;
11089 : 60606 : drs_init_vec orig_drs_init;
11090 : :
11091 : 60606 : epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
11092 : : &step_vector, &niters_vector_mult_vf, th,
11093 : : check_profitability, niters_no_overflow,
11094 : : &advance);
11095 : 60606 : if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
11096 : 60606 : && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
11097 : : {
11098 : : /* Ifcvt duplicates loop preheader, loop body and produces an basic
11099 : : block after loop exit. We need to scale all that. */
11100 : 89 : basic_block preheader
11101 : 89 : = loop_preheader_edge (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->src;
11102 : 89 : preheader->count
11103 : : = preheader->count.apply_probability
11104 : 89 : (LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11105 : 89 : scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
11106 : : LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11107 : 89 : LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo)->dest->count = preheader->count;
11108 : : }
11109 : :
11110 : 60606 : if (niters_vector == NULL_TREE)
11111 : : {
11112 : 26979 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
11113 : 26979 : && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11114 : 54703 : && known_eq (lowest_vf, vf))
11115 : : {
11116 : 26976 : niters_vector
11117 : 26976 : = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
11118 : 26976 : LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
11119 : 26976 : step_vector = build_one_cst (TREE_TYPE (niters));
11120 : : }
11121 : 751 : else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11122 : 1 : vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
11123 : : &step_vector, niters_no_overflow);
11124 : : else
11125 : : /* vect_do_peeling subtracted the number of peeled prologue
11126 : : iterations from LOOP_VINFO_NITERS. */
11127 : 750 : vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
11128 : : &niters_vector, &step_vector,
11129 : : niters_no_overflow);
11130 : : }
11131 : :
11132 : : /* 1) Make sure the loop header has exactly two entries
11133 : : 2) Make sure we have a preheader basic block. */
11134 : :
11135 : 60606 : gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
11136 : :
11137 : 60606 : split_edge (loop_preheader_edge (loop));
11138 : :
11139 : 60606 : if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11140 : : /* This will deal with any possible peeling. */
11141 : 1 : vect_prepare_for_masked_peels (loop_vinfo);
11142 : :
11143 : : /* Handle any code motion that we need to for early-break vectorization after
11144 : : we've done peeling but just before we start vectorizing. */
11145 : 60606 : if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11146 : 1436 : move_early_exit_stmts (loop_vinfo);
11147 : :
11148 : : /* Remove existing clobber stmts and prefetches. */
11149 : 185005 : for (i = 0; i < nbbs; i++)
11150 : : {
11151 : 124399 : basic_block bb = bbs[i];
11152 : 1069975 : for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);)
11153 : : {
11154 : 821177 : stmt = gsi_stmt (si);
11155 : 821177 : if (gimple_clobber_p (stmt)
11156 : 821177 : || gimple_call_builtin_p (stmt, BUILT_IN_PREFETCH))
11157 : : {
11158 : 90 : unlink_stmt_vdef (stmt);
11159 : 90 : gsi_remove (&si, true);
11160 : 90 : release_defs (stmt);
11161 : : }
11162 : : else
11163 : 821087 : gsi_next (&si);
11164 : : }
11165 : : }
11166 : :
11167 : : /* Schedule the SLP instances. */
11168 : 60606 : if (!loop_vinfo->slp_instances.is_empty ())
11169 : : {
11170 : 60606 : DUMP_VECT_SCOPE ("scheduling SLP instances");
11171 : 60606 : vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
11172 : : }
11173 : :
11174 : : /* Generate the loop invariant statements. */
11175 : 60606 : if (!gimple_seq_empty_p (LOOP_VINFO_INV_PATTERN_DEF_SEQ (loop_vinfo)))
11176 : : {
11177 : 74 : if (dump_enabled_p ())
11178 : 30 : dump_printf_loc (MSG_NOTE, vect_location,
11179 : : "------>generating loop invariant statements\n");
11180 : 74 : gimple_stmt_iterator gsi;
11181 : 74 : gsi = gsi_after_labels (loop_preheader_edge (loop)->src);
11182 : 74 : gsi_insert_seq_before (&gsi, LOOP_VINFO_INV_PATTERN_DEF_SEQ (loop_vinfo),
11183 : : GSI_CONTINUE_LINKING);
11184 : : }
11185 : :
11186 : : /* Stub out scalar statements that must not survive vectorization and
11187 : : were not picked as relevant in any SLP instance.
11188 : : Doing this here helps with grouped statements, or statements that
11189 : : are involved in patterns. */
11190 : 185005 : for (i = 0; i < nbbs; i++)
11191 : : {
11192 : 124399 : basic_block bb = bbs[i];
11193 : 124399 : stmt_vec_info stmt_info;
11194 : 248798 : for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
11195 : 1638120 : !gsi_end_p (gsi); gsi_next (&gsi))
11196 : : {
11197 : 1513721 : gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
11198 : 6240 : if (!call || !gimple_call_internal_p (call))
11199 : 1508635 : continue;
11200 : 5086 : internal_fn ifn = gimple_call_internal_fn (call);
11201 : 5086 : if (ifn == IFN_MASK_LOAD)
11202 : : {
11203 : 657 : tree lhs = gimple_get_lhs (call);
11204 : 657 : if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11205 : : {
11206 : 0 : tree zero = build_zero_cst (TREE_TYPE (lhs));
11207 : 0 : gimple *new_stmt = gimple_build_assign (lhs, zero);
11208 : 0 : gsi_replace (&gsi, new_stmt, true);
11209 : : }
11210 : : }
11211 : 4429 : else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
11212 : : {
11213 : 2295 : tree lhs = gimple_get_lhs (call);
11214 : 2295 : if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11215 : : {
11216 : 0 : tree else_arg
11217 : 0 : = gimple_call_arg (call, gimple_call_num_args (call) - 1);
11218 : 0 : gimple *new_stmt = gimple_build_assign (lhs, else_arg);
11219 : 0 : gsi_replace (&gsi, new_stmt, true);
11220 : : }
11221 : : }
11222 : 2134 : else if (ifn == IFN_MASK_CALL
11223 : 4 : && (stmt_info = loop_vinfo->lookup_stmt (call))
11224 : 4 : && !STMT_VINFO_RELEVANT_P (stmt_info)
11225 : 2138 : && !STMT_VINFO_LIVE_P (stmt_info))
11226 : : {
11227 : 4 : gcc_assert (!gimple_call_lhs (stmt_info->stmt));
11228 : 4 : loop_vinfo->remove_stmt (stmt_info);
11229 : : }
11230 : : }
11231 : : }
11232 : :
11233 : : /* The vectorization factor is always > 1, so if we use an IV increment of 1.
11234 : : a zero NITERS becomes a nonzero NITERS_VECTOR. */
11235 : 60606 : if (integer_onep (step_vector))
11236 : 60589 : niters_no_overflow = true;
11237 : 60606 : vect_set_loop_condition (loop, LOOP_VINFO_IV_EXIT (loop_vinfo), loop_vinfo,
11238 : : niters_vector, step_vector, niters_vector_mult_vf,
11239 : 60606 : !niters_no_overflow);
11240 : :
11241 : 60606 : unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
11242 : :
11243 : : /* True if the final iteration might not handle a full vector's
11244 : : worth of scalar iterations. */
11245 : 121212 : bool final_iter_may_be_partial
11246 : 60606 : = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11247 : 60606 : || LOOP_VINFO_EARLY_BREAKS (loop_vinfo);
11248 : :
11249 : : /* +1 to convert latch counts to loop iteration counts. */
11250 : 60606 : int bias_for_lowest = 1;
11251 : :
11252 : : /* When we are peeling for gaps then we take away one scalar iteration
11253 : : from the vector loop. Thus we can adjust the upper bound by one
11254 : : scalar iteration. But only when we know the bound applies to the
11255 : : IV exit test which might not be true when we have multiple exits. */
11256 : 60606 : if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11257 : 117987 : bias_for_lowest -= LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
11258 : :
11259 : 60606 : int bias_for_assumed = bias_for_lowest;
11260 : 60606 : int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
11261 : 60606 : if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
11262 : : {
11263 : : /* When the amount of peeling is known at compile time, the first
11264 : : iteration will have exactly alignment_npeels active elements.
11265 : : In the worst case it will have at least one. */
11266 : 1 : int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
11267 : 1 : bias_for_lowest += lowest_vf - min_first_active;
11268 : 1 : bias_for_assumed += assumed_vf - min_first_active;
11269 : : }
11270 : : /* In these calculations the "- 1" converts loop iteration counts
11271 : : back to latch counts. */
11272 : 60606 : if (loop->any_upper_bound)
11273 : : {
11274 : 60606 : loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
11275 : 60606 : loop->nb_iterations_upper_bound
11276 : 60606 : = (final_iter_may_be_partial
11277 : 62059 : ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
11278 : 2906 : lowest_vf) - 1
11279 : 59153 : : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
11280 : 118306 : lowest_vf) - 1);
11281 : 60606 : if (main_vinfo
11282 : : /* Both peeling for alignment and peeling for gaps can end up
11283 : : with the scalar epilogue running for more than VF-1 iterations. */
11284 : 6851 : && !main_vinfo->peeling_for_alignment
11285 : 6803 : && !main_vinfo->peeling_for_gaps)
11286 : : {
11287 : 6640 : unsigned int bound;
11288 : 6640 : poly_uint64 main_iters
11289 : 6640 : = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
11290 : : LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
11291 : 6640 : main_iters
11292 : 6640 : = upper_bound (main_iters,
11293 : 6640 : LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
11294 : 13280 : if (can_div_away_from_zero_p (main_iters,
11295 : 6640 : LOOP_VINFO_VECT_FACTOR (loop_vinfo),
11296 : : &bound))
11297 : 6640 : loop->nb_iterations_upper_bound
11298 : 6640 : = wi::umin ((bound_wide_int) (bound - 1),
11299 : 6640 : loop->nb_iterations_upper_bound);
11300 : : }
11301 : : }
11302 : 60606 : if (loop->any_likely_upper_bound)
11303 : 60606 : loop->nb_iterations_likely_upper_bound
11304 : 60606 : = (final_iter_may_be_partial
11305 : 62059 : ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
11306 : 1453 : + bias_for_lowest, lowest_vf) - 1
11307 : 59153 : : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
11308 : 60606 : + bias_for_lowest, lowest_vf) - 1);
11309 : 60606 : if (loop->any_estimate)
11310 : 35120 : loop->nb_iterations_estimate
11311 : 35120 : = (final_iter_may_be_partial
11312 : 35907 : ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
11313 : 1574 : assumed_vf) - 1
11314 : 34333 : : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
11315 : 69453 : assumed_vf) - 1);
11316 : 60606 : scale_profile_for_vect_loop (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
11317 : : assumed_vf, flat);
11318 : :
11319 : 60606 : if (dump_enabled_p ())
11320 : : {
11321 : 10442 : if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
11322 : : {
11323 : 9071 : dump_printf_loc (MSG_NOTE, vect_location,
11324 : : "LOOP VECTORIZED\n");
11325 : 9071 : if (loop->inner)
11326 : 286 : dump_printf_loc (MSG_NOTE, vect_location,
11327 : : "OUTER LOOP VECTORIZED\n");
11328 : 9071 : dump_printf (MSG_NOTE, "\n");
11329 : : }
11330 : : else
11331 : 1371 : dump_printf_loc (MSG_NOTE, vect_location,
11332 : : "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
11333 : 1371 : GET_MODE_NAME (loop_vinfo->vector_mode));
11334 : : }
11335 : :
11336 : : /* Loops vectorized with a variable factor won't benefit from
11337 : : unrolling/peeling. */
11338 : 60606 : if (!vf.is_constant ())
11339 : : {
11340 : : loop->unroll = 1;
11341 : : if (dump_enabled_p ())
11342 : : dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
11343 : : " variable-length vectorization factor\n");
11344 : : }
11345 : :
11346 : : /* When we have unrolled the loop due to a user requested value we should
11347 : : leave it up to the RTL unroll heuristics to determine if it's still worth
11348 : : while to unroll more. */
11349 : 60606 : if (LOOP_VINFO_USER_UNROLL (loop_vinfo))
11350 : 44 : loop->unroll = 0;
11351 : :
11352 : : /* Free SLP instances here because otherwise stmt reference counting
11353 : : won't work. */
11354 : : slp_instance instance;
11355 : 151821 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
11356 : 91215 : vect_free_slp_instance (instance);
11357 : 60606 : LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
11358 : : /* Clear-up safelen field since its value is invalid after vectorization
11359 : : since vectorized loop can have loop-carried dependencies. */
11360 : 60606 : loop->safelen = 0;
11361 : :
11362 : 60606 : if (epilogue)
11363 : : {
11364 : : /* Accumulate past advancements made. */
11365 : 6851 : if (LOOP_VINFO_DRS_ADVANCED_BY (loop_vinfo))
11366 : 89 : advance = fold_build2 (PLUS_EXPR, TREE_TYPE (advance),
11367 : : LOOP_VINFO_DRS_ADVANCED_BY (loop_vinfo),
11368 : : advance);
11369 : 6851 : update_epilogue_loop_vinfo (epilogue, advance);
11370 : :
11371 : 6851 : epilogue->simduid = loop->simduid;
11372 : 6851 : epilogue->force_vectorize = loop->force_vectorize;
11373 : 6851 : epilogue->dont_vectorize = false;
11374 : : }
11375 : :
11376 : 60606 : return epilogue;
11377 : 60606 : }
11378 : :
11379 : : /* The code below is trying to perform simple optimization - revert
11380 : : if-conversion for masked stores, i.e. if the mask of a store is zero
11381 : : do not perform it and all stored value producers also if possible.
11382 : : For example,
11383 : : for (i=0; i<n; i++)
11384 : : if (c[i])
11385 : : {
11386 : : p1[i] += 1;
11387 : : p2[i] = p3[i] +2;
11388 : : }
11389 : : this transformation will produce the following semi-hammock:
11390 : :
11391 : : if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
11392 : : {
11393 : : vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
11394 : : vect__12.22_172 = vect__11.19_170 + vect_cst__171;
11395 : : MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
11396 : : vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
11397 : : vect__19.28_184 = vect__18.25_182 + vect_cst__183;
11398 : : MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
11399 : : }
11400 : : */
11401 : :
11402 : : void
11403 : 495 : optimize_mask_stores (class loop *loop)
11404 : : {
11405 : 495 : basic_block *bbs = get_loop_body (loop);
11406 : 495 : unsigned nbbs = loop->num_nodes;
11407 : 495 : unsigned i;
11408 : 495 : basic_block bb;
11409 : 495 : class loop *bb_loop;
11410 : 495 : gimple_stmt_iterator gsi;
11411 : 495 : gimple *stmt;
11412 : 495 : auto_vec<gimple *> worklist;
11413 : 495 : auto_purge_vect_location sentinel;
11414 : :
11415 : 495 : vect_location = find_loop_location (loop);
11416 : : /* Pick up all masked stores in loop if any. */
11417 : 1980 : for (i = 0; i < nbbs; i++)
11418 : : {
11419 : 990 : bb = bbs[i];
11420 : 16309 : for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
11421 : 14329 : gsi_next (&gsi))
11422 : : {
11423 : 14329 : stmt = gsi_stmt (gsi);
11424 : 14329 : if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
11425 : 649 : worklist.safe_push (stmt);
11426 : : }
11427 : : }
11428 : :
11429 : 495 : free (bbs);
11430 : 495 : if (worklist.is_empty ())
11431 : 68 : return;
11432 : :
11433 : : /* Loop has masked stores. */
11434 : 1059 : while (!worklist.is_empty ())
11435 : : {
11436 : 632 : gimple *last, *last_store;
11437 : 632 : edge e, efalse;
11438 : 632 : tree mask;
11439 : 632 : basic_block store_bb, join_bb;
11440 : 632 : gimple_stmt_iterator gsi_to;
11441 : 632 : tree vdef, new_vdef;
11442 : 632 : gphi *phi;
11443 : 632 : tree vectype;
11444 : 632 : tree zero;
11445 : :
11446 : 632 : last = worklist.pop ();
11447 : 632 : mask = gimple_call_arg (last, 2);
11448 : 632 : bb = gimple_bb (last);
11449 : : /* Create then_bb and if-then structure in CFG, then_bb belongs to
11450 : : the same loop as if_bb. It could be different to LOOP when two
11451 : : level loop-nest is vectorized and mask_store belongs to the inner
11452 : : one. */
11453 : 632 : e = split_block (bb, last);
11454 : 632 : bb_loop = bb->loop_father;
11455 : 632 : gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
11456 : 632 : join_bb = e->dest;
11457 : 632 : store_bb = create_empty_bb (bb);
11458 : 632 : add_bb_to_loop (store_bb, bb_loop);
11459 : 632 : e->flags = EDGE_TRUE_VALUE;
11460 : 632 : efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
11461 : : /* Put STORE_BB to likely part. */
11462 : 632 : efalse->probability = profile_probability::likely ();
11463 : 632 : e->probability = efalse->probability.invert ();
11464 : 632 : store_bb->count = efalse->count ();
11465 : 632 : make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
11466 : 632 : if (dom_info_available_p (CDI_DOMINATORS))
11467 : 632 : set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
11468 : 632 : if (dump_enabled_p ())
11469 : 299 : dump_printf_loc (MSG_NOTE, vect_location,
11470 : : "Create new block %d to sink mask stores.",
11471 : : store_bb->index);
11472 : : /* Create vector comparison with boolean result. */
11473 : 632 : vectype = TREE_TYPE (mask);
11474 : 632 : zero = build_zero_cst (vectype);
11475 : 632 : stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
11476 : 632 : gsi = gsi_last_bb (bb);
11477 : 632 : gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
11478 : : /* Create new PHI node for vdef of the last masked store:
11479 : : .MEM_2 = VDEF <.MEM_1>
11480 : : will be converted to
11481 : : .MEM.3 = VDEF <.MEM_1>
11482 : : and new PHI node will be created in join bb
11483 : : .MEM_2 = PHI <.MEM_1, .MEM_3>
11484 : : */
11485 : 632 : vdef = gimple_vdef (last);
11486 : 632 : new_vdef = make_ssa_name (gimple_vop (cfun), last);
11487 : 632 : gimple_set_vdef (last, new_vdef);
11488 : 632 : phi = create_phi_node (vdef, join_bb);
11489 : 632 : add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
11490 : :
11491 : : /* Put all masked stores with the same mask to STORE_BB if possible. */
11492 : 666 : while (true)
11493 : : {
11494 : 649 : gimple_stmt_iterator gsi_from;
11495 : 649 : gimple *stmt1 = NULL;
11496 : :
11497 : : /* Move masked store to STORE_BB. */
11498 : 649 : last_store = last;
11499 : 649 : gsi = gsi_for_stmt (last);
11500 : 649 : gsi_from = gsi;
11501 : : /* Shift GSI to the previous stmt for further traversal. */
11502 : 649 : gsi_prev (&gsi);
11503 : 649 : gsi_to = gsi_start_bb (store_bb);
11504 : 649 : gsi_move_before (&gsi_from, &gsi_to);
11505 : : /* Setup GSI_TO to the non-empty block start. */
11506 : 649 : gsi_to = gsi_start_bb (store_bb);
11507 : 649 : if (dump_enabled_p ())
11508 : 315 : dump_printf_loc (MSG_NOTE, vect_location,
11509 : : "Move stmt to created bb\n%G", last);
11510 : : /* Move all stored value producers if possible. */
11511 : 4439 : while (!gsi_end_p (gsi))
11512 : : {
11513 : 4438 : tree lhs;
11514 : 4438 : imm_use_iterator imm_iter;
11515 : 4438 : use_operand_p use_p;
11516 : 4438 : bool res;
11517 : :
11518 : : /* Skip debug statements. */
11519 : 4438 : if (is_gimple_debug (gsi_stmt (gsi)))
11520 : : {
11521 : 3 : gsi_prev (&gsi);
11522 : 2777 : continue;
11523 : : }
11524 : 4435 : stmt1 = gsi_stmt (gsi);
11525 : : /* Do not consider statements writing to memory or having
11526 : : volatile operand. */
11527 : 8750 : if (gimple_vdef (stmt1)
11528 : 8750 : || gimple_has_volatile_ops (stmt1))
11529 : : break;
11530 : 4315 : gsi_from = gsi;
11531 : 4315 : gsi_prev (&gsi);
11532 : 4315 : lhs = gimple_get_lhs (stmt1);
11533 : 4315 : if (!lhs)
11534 : : break;
11535 : :
11536 : : /* LHS of vectorized stmt must be SSA_NAME. */
11537 : 4315 : if (TREE_CODE (lhs) != SSA_NAME)
11538 : : break;
11539 : :
11540 : 4315 : if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11541 : : {
11542 : : /* Remove dead scalar statement. */
11543 : 3067 : if (has_zero_uses (lhs))
11544 : : {
11545 : 2774 : gsi_remove (&gsi_from, true);
11546 : 2774 : release_defs (stmt1);
11547 : 2774 : continue;
11548 : : }
11549 : : }
11550 : :
11551 : : /* Check that LHS does not have uses outside of STORE_BB. */
11552 : 1541 : res = true;
11553 : 4186 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
11554 : : {
11555 : 1632 : gimple *use_stmt;
11556 : 1632 : use_stmt = USE_STMT (use_p);
11557 : 1632 : if (is_gimple_debug (use_stmt))
11558 : 0 : continue;
11559 : 1632 : if (gimple_bb (use_stmt) != store_bb)
11560 : : {
11561 : : res = false;
11562 : : break;
11563 : : }
11564 : 1541 : }
11565 : 1541 : if (!res)
11566 : : break;
11567 : :
11568 : 1013 : if (gimple_vuse (stmt1)
11569 : 1448 : && gimple_vuse (stmt1) != gimple_vuse (last_store))
11570 : : break;
11571 : :
11572 : : /* Can move STMT1 to STORE_BB. */
11573 : 1013 : if (dump_enabled_p ())
11574 : 529 : dump_printf_loc (MSG_NOTE, vect_location,
11575 : : "Move stmt to created bb\n%G", stmt1);
11576 : 1013 : gsi_move_before (&gsi_from, &gsi_to);
11577 : : /* Shift GSI_TO for further insertion. */
11578 : 2026 : gsi_prev (&gsi_to);
11579 : : }
11580 : : /* Put other masked stores with the same mask to STORE_BB. */
11581 : 649 : if (worklist.is_empty ()
11582 : 222 : || gimple_call_arg (worklist.last (), 2) != mask
11583 : 17 : || worklist.last () != stmt1)
11584 : : break;
11585 : 17 : last = worklist.pop ();
11586 : 17 : }
11587 : 1264 : add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
11588 : : }
11589 : 495 : }
11590 : :
11591 : : /* Decide whether it is possible to use a zero-based induction variable
11592 : : when vectorizing LOOP_VINFO with partial vectors. If it is, return
11593 : : the value that the induction variable must be able to hold in order
11594 : : to ensure that the rgroups eventually have no active vector elements.
11595 : : Return -1 otherwise. */
11596 : :
11597 : : widest_int
11598 : 31296 : vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
11599 : : {
11600 : 31296 : tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
11601 : 31296 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11602 : 31296 : unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
11603 : :
11604 : : /* Calculate the value that the induction variable must be able
11605 : : to hit in order to ensure that we end the loop with an all-false mask.
11606 : : This involves adding the maximum number of inactive trailing scalar
11607 : : iterations. */
11608 : 31296 : widest_int iv_limit = -1;
11609 : 31296 : if (max_loop_iterations (loop, &iv_limit))
11610 : : {
11611 : 31296 : if (niters_skip)
11612 : : {
11613 : : /* Add the maximum number of skipped iterations to the
11614 : : maximum iteration count. */
11615 : 0 : if (TREE_CODE (niters_skip) == INTEGER_CST)
11616 : 0 : iv_limit += wi::to_widest (niters_skip);
11617 : : else
11618 : 0 : iv_limit += max_vf - 1;
11619 : : }
11620 : 31296 : else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
11621 : : /* Make a conservatively-correct assumption. */
11622 : 8 : iv_limit += max_vf - 1;
11623 : :
11624 : : /* IV_LIMIT is the maximum number of latch iterations, which is also
11625 : : the maximum in-range IV value. Round this value down to the previous
11626 : : vector alignment boundary and then add an extra full iteration. */
11627 : 31296 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11628 : 31296 : iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
11629 : : }
11630 : 31296 : return iv_limit;
11631 : : }
11632 : :
11633 : : /* For the given rgroup_controls RGC, check whether an induction variable
11634 : : would ever hit a value that produces a set of all-false masks or zero
11635 : : lengths before wrapping around. Return true if it's possible to wrap
11636 : : around before hitting the desirable value, otherwise return false. */
11637 : :
11638 : : bool
11639 : 0 : vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
11640 : : {
11641 : 0 : widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
11642 : :
11643 : 0 : if (iv_limit == -1)
11644 : : return true;
11645 : :
11646 : 0 : tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
11647 : 0 : unsigned int compare_precision = TYPE_PRECISION (compare_type);
11648 : 0 : unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
11649 : :
11650 : 0 : if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
11651 : : return true;
11652 : :
11653 : : return false;
11654 : 0 : }
|