Branch data Line data Source code
1 : : /* Loop Vectorization
2 : : Copyright (C) 2003-2025 Free Software Foundation, Inc.
3 : : Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 : : Ira Rosen <irar@il.ibm.com>
5 : :
6 : : This file is part of GCC.
7 : :
8 : : GCC is free software; you can redistribute it and/or modify it under
9 : : the terms of the GNU General Public License as published by the Free
10 : : Software Foundation; either version 3, or (at your option) any later
11 : : version.
12 : :
13 : : GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 : : WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 : : FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 : : for more details.
17 : :
18 : : You should have received a copy of the GNU General Public License
19 : : along with GCC; see the file COPYING3. If not see
20 : : <http://www.gnu.org/licenses/>. */
21 : :
22 : : #define INCLUDE_ALGORITHM
23 : : #include "config.h"
24 : : #include "system.h"
25 : : #include "coretypes.h"
26 : : #include "backend.h"
27 : : #include "target.h"
28 : : #include "rtl.h"
29 : : #include "tree.h"
30 : : #include "gimple.h"
31 : : #include "cfghooks.h"
32 : : #include "tree-pass.h"
33 : : #include "ssa.h"
34 : : #include "optabs-tree.h"
35 : : #include "memmodel.h"
36 : : #include "optabs.h"
37 : : #include "diagnostic-core.h"
38 : : #include "fold-const.h"
39 : : #include "stor-layout.h"
40 : : #include "cfganal.h"
41 : : #include "gimplify.h"
42 : : #include "gimple-iterator.h"
43 : : #include "gimplify-me.h"
44 : : #include "tree-ssa-loop-ivopts.h"
45 : : #include "tree-ssa-loop-manip.h"
46 : : #include "tree-ssa-loop-niter.h"
47 : : #include "tree-ssa-loop.h"
48 : : #include "cfgloop.h"
49 : : #include "tree-scalar-evolution.h"
50 : : #include "tree-vectorizer.h"
51 : : #include "gimple-fold.h"
52 : : #include "cgraph.h"
53 : : #include "tree-cfg.h"
54 : : #include "tree-if-conv.h"
55 : : #include "internal-fn.h"
56 : : #include "tree-vector-builder.h"
57 : : #include "vec-perm-indices.h"
58 : : #include "tree-eh.h"
59 : : #include "case-cfn-macros.h"
60 : : #include "langhooks.h"
61 : : #include "opts.h"
62 : :
63 : : /* Loop Vectorization Pass.
64 : :
65 : : This pass tries to vectorize loops.
66 : :
67 : : For example, the vectorizer transforms the following simple loop:
68 : :
69 : : short a[N]; short b[N]; short c[N]; int i;
70 : :
71 : : for (i=0; i<N; i++){
72 : : a[i] = b[i] + c[i];
73 : : }
74 : :
75 : : as if it was manually vectorized by rewriting the source code into:
76 : :
77 : : typedef int __attribute__((mode(V8HI))) v8hi;
78 : : short a[N]; short b[N]; short c[N]; int i;
79 : : v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
80 : : v8hi va, vb, vc;
81 : :
82 : : for (i=0; i<N/8; i++){
83 : : vb = pb[i];
84 : : vc = pc[i];
85 : : va = vb + vc;
86 : : pa[i] = va;
87 : : }
88 : :
89 : : The main entry to this pass is vectorize_loops(), in which
90 : : the vectorizer applies a set of analyses on a given set of loops,
91 : : followed by the actual vectorization transformation for the loops that
92 : : had successfully passed the analysis phase.
93 : : Throughout this pass we make a distinction between two types of
94 : : data: scalars (which are represented by SSA_NAMES), and memory references
95 : : ("data-refs"). These two types of data require different handling both
96 : : during analysis and transformation. The types of data-refs that the
97 : : vectorizer currently supports are ARRAY_REFS which base is an array DECL
98 : : (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
99 : : accesses are required to have a simple (consecutive) access pattern.
100 : :
101 : : Analysis phase:
102 : : ===============
103 : : The driver for the analysis phase is vect_analyze_loop().
104 : : It applies a set of analyses, some of which rely on the scalar evolution
105 : : analyzer (scev) developed by Sebastian Pop.
106 : :
107 : : During the analysis phase the vectorizer records some information
108 : : per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
109 : : loop, as well as general information about the loop as a whole, which is
110 : : recorded in a "loop_vec_info" struct attached to each loop.
111 : :
112 : : Transformation phase:
113 : : =====================
114 : : The loop transformation phase scans all the stmts in the loop, and
115 : : creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
116 : : the loop that needs to be vectorized. It inserts the vector code sequence
117 : : just before the scalar stmt S, and records a pointer to the vector code
118 : : in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
119 : : attached to S). This pointer will be used for the vectorization of following
120 : : stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
121 : : otherwise, we rely on dead code elimination for removing it.
122 : :
123 : : For example, say stmt S1 was vectorized into stmt VS1:
124 : :
125 : : VS1: vb = px[i];
126 : : S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
127 : : S2: a = b;
128 : :
129 : : To vectorize stmt S2, the vectorizer first finds the stmt that defines
130 : : the operand 'b' (S1), and gets the relevant vector def 'vb' from the
131 : : vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
132 : : resulting sequence would be:
133 : :
134 : : VS1: vb = px[i];
135 : : S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
136 : : VS2: va = vb;
137 : : S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
138 : :
139 : : Operands that are not SSA_NAMEs, are data-refs that appear in
140 : : load/store operations (like 'x[i]' in S1), and are handled differently.
141 : :
142 : : Target modeling:
143 : : =================
144 : : Currently the only target specific information that is used is the
145 : : size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
146 : : Targets that can support different sizes of vectors, for now will need
147 : : to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
148 : : flexibility will be added in the future.
149 : :
150 : : Since we only vectorize operations which vector form can be
151 : : expressed using existing tree codes, to verify that an operation is
152 : : supported, the vectorizer checks the relevant optab at the relevant
153 : : machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
154 : : the value found is CODE_FOR_nothing, then there's no target support, and
155 : : we can't vectorize the stmt.
156 : :
157 : : For additional information on this project see:
158 : : http://gcc.gnu.org/projects/tree-ssa/vectorization.html
159 : : */
160 : :
161 : : static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
162 : : unsigned *);
163 : : static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
164 : : gphi **);
165 : :
166 : :
167 : : /* Function vect_is_simple_iv_evolution.
168 : :
169 : : FORNOW: A simple evolution of an induction variables in the loop is
170 : : considered a polynomial evolution. */
171 : :
172 : : static bool
173 : 669204 : vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn,
174 : : stmt_vec_info stmt_info)
175 : : {
176 : 669204 : tree init_expr;
177 : 669204 : tree step_expr;
178 : 669204 : tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
179 : 669204 : basic_block bb;
180 : :
181 : : /* When there is no evolution in this loop, the evolution function
182 : : is not "simple". */
183 : 669204 : if (evolution_part == NULL_TREE)
184 : : return false;
185 : :
186 : : /* When the evolution is a polynomial of degree >= 2
187 : : the evolution function is not "simple". */
188 : 711399 : if (tree_is_chrec (evolution_part))
189 : : return false;
190 : :
191 : 618897 : step_expr = evolution_part;
192 : 618897 : init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
193 : :
194 : 618897 : if (dump_enabled_p ())
195 : 36695 : dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
196 : : step_expr, init_expr);
197 : :
198 : 618897 : STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = init_expr;
199 : 618897 : STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = step_expr;
200 : :
201 : 618897 : if (TREE_CODE (step_expr) != INTEGER_CST
202 : 49715 : && (TREE_CODE (step_expr) != SSA_NAME
203 : 41974 : || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
204 : 41812 : && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
205 : 7544 : || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
206 : 111 : && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
207 : 111 : || !flag_associative_math)))
208 : 661149 : && (TREE_CODE (step_expr) != REAL_CST
209 : 407 : || !flag_associative_math))
210 : : {
211 : 42195 : if (dump_enabled_p ())
212 : 2729 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
213 : : "step unknown.\n");
214 : 42195 : return false;
215 : : }
216 : :
217 : : return true;
218 : : }
219 : :
220 : : /* Function vect_is_nonlinear_iv_evolution
221 : :
222 : : Only support nonlinear induction for integer type
223 : : 1. neg
224 : : 2. mul by constant
225 : : 3. lshift/rshift by constant.
226 : :
227 : : For neg induction, return a fake step as integer -1. */
228 : : static bool
229 : 90150 : vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
230 : : gphi* loop_phi_node)
231 : : {
232 : 90150 : tree init_expr, ev_expr, result, op1, op2;
233 : 90150 : gimple* def;
234 : :
235 : 90150 : if (gimple_phi_num_args (loop_phi_node) != 2)
236 : : return false;
237 : :
238 : 90150 : init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
239 : 90150 : ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
240 : :
241 : : /* Support nonlinear induction only for integer type. */
242 : 90150 : if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
243 : : return false;
244 : :
245 : 67259 : result = PHI_RESULT (loop_phi_node);
246 : :
247 : 67259 : if (TREE_CODE (ev_expr) != SSA_NAME
248 : 65111 : || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
249 : 67259 : || !is_gimple_assign (def))
250 : : return false;
251 : :
252 : 59992 : enum tree_code t_code = gimple_assign_rhs_code (def);
253 : 59992 : tree step;
254 : 59992 : switch (t_code)
255 : : {
256 : 1790 : case NEGATE_EXPR:
257 : 1790 : if (gimple_assign_rhs1 (def) != result)
258 : : return false;
259 : 1790 : step = build_int_cst (TREE_TYPE (init_expr), -1);
260 : 1790 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
261 : 1790 : break;
262 : :
263 : 9721 : case RSHIFT_EXPR:
264 : 9721 : case LSHIFT_EXPR:
265 : 9721 : case MULT_EXPR:
266 : 9721 : op1 = gimple_assign_rhs1 (def);
267 : 9721 : op2 = gimple_assign_rhs2 (def);
268 : 9721 : if (TREE_CODE (op2) != INTEGER_CST
269 : 6179 : || op1 != result)
270 : : return false;
271 : 6048 : step = op2;
272 : 6048 : if (t_code == LSHIFT_EXPR)
273 : 193 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
274 : 5855 : else if (t_code == RSHIFT_EXPR)
275 : 5235 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
276 : : /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
277 : : else
278 : 620 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
279 : : break;
280 : :
281 : : default:
282 : : return false;
283 : : }
284 : :
285 : 7838 : STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = init_expr;
286 : 7838 : STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = step;
287 : :
288 : 7838 : return true;
289 : : }
290 : :
291 : : /* Returns true if Phi is a first-order recurrence. A first-order
292 : : recurrence is a non-reduction recurrence relation in which the value of
293 : : the recurrence in the current loop iteration equals a value defined in
294 : : the previous iteration. */
295 : :
296 : : static bool
297 : 21138 : vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
298 : : gphi *phi)
299 : : {
300 : : /* A nested cycle isn't vectorizable as first order recurrence. */
301 : 21138 : if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
302 : : return false;
303 : :
304 : : /* Ensure the loop latch definition is from within the loop. */
305 : 20972 : edge latch = loop_latch_edge (loop);
306 : 20972 : tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
307 : 20972 : if (TREE_CODE (ldef) != SSA_NAME
308 : 18541 : || SSA_NAME_IS_DEFAULT_DEF (ldef)
309 : 18513 : || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
310 : 38240 : || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
311 : 4017 : return false;
312 : :
313 : 16955 : tree def = gimple_phi_result (phi);
314 : :
315 : : /* Ensure every use_stmt of the phi node is dominated by the latch
316 : : definition. */
317 : 16955 : imm_use_iterator imm_iter;
318 : 16955 : use_operand_p use_p;
319 : 36148 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
320 : 18701 : if (!is_gimple_debug (USE_STMT (use_p))
321 : 36390 : && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
322 : 10710 : || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
323 : : USE_STMT (use_p))))
324 : 16463 : return false;
325 : :
326 : : /* First-order recurrence autovectorization needs shuffle vector. */
327 : 492 : tree scalar_type = TREE_TYPE (def);
328 : 492 : tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
329 : 492 : if (!vectype)
330 : : return false;
331 : :
332 : : return true;
333 : : }
334 : :
335 : : /* Function vect_analyze_scalar_cycles_1.
336 : :
337 : : Examine the cross iteration def-use cycles of scalar variables
338 : : in LOOP. LOOP_VINFO represents the loop that is now being
339 : : considered for vectorization (can be LOOP, or an outer-loop
340 : : enclosing LOOP). SLP indicates there will be some subsequent
341 : : slp analyses or not. */
342 : :
343 : : static void
344 : 326943 : vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
345 : : {
346 : 326943 : basic_block bb = loop->header;
347 : 326943 : auto_vec<stmt_vec_info, 64> worklist;
348 : 326943 : gphi_iterator gsi;
349 : :
350 : 326943 : DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
351 : :
352 : : /* First - identify all inductions. Reduction detection assumes that all the
353 : : inductions have been identified, therefore, this order must not be
354 : : changed. */
355 : 1173930 : for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
356 : : {
357 : 846987 : gphi *phi = gsi.phi ();
358 : 846987 : tree access_fn = NULL;
359 : 846987 : tree def = PHI_RESULT (phi);
360 : 846987 : stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
361 : :
362 : : /* Skip virtual phi's. The data dependences that are associated with
363 : : virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
364 : 1693974 : if (virtual_operand_p (def))
365 : 262453 : continue;
366 : :
367 : : /* Skip already analyzed inner loop PHIs of double reductions. */
368 : 670130 : if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_vinfo)))
369 : 926 : continue;
370 : :
371 : 669204 : if (dump_enabled_p ())
372 : 38575 : dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
373 : : (gimple *) phi);
374 : :
375 : 669204 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
376 : :
377 : : /* Analyze the evolution function. */
378 : 669204 : access_fn = analyze_scalar_evolution (loop, def);
379 : 669204 : if (dump_enabled_p ())
380 : 38575 : dump_printf_loc (MSG_NOTE, vect_location,
381 : : "Access function of PHI: %T\n", access_fn);
382 : 669204 : if (access_fn)
383 : 669204 : STRIP_NOPS (access_fn);
384 : :
385 : 753874 : if ((!access_fn
386 : 669204 : || !vect_is_simple_iv_evolution (loop->num, access_fn, stmt_vinfo)
387 : 576702 : || (LOOP_VINFO_LOOP (loop_vinfo) != loop
388 : 10443 : && (TREE_CODE (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo))
389 : : != INTEGER_CST)))
390 : : /* Only handle nonlinear iv for same loop. */
391 : 761712 : && (LOOP_VINFO_LOOP (loop_vinfo) != loop
392 : 90150 : || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo, phi)))
393 : : {
394 : 84670 : worklist.safe_push (stmt_vinfo);
395 : 84670 : continue;
396 : : }
397 : :
398 : 584534 : gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
399 : : != NULL_TREE);
400 : 584534 : gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
401 : :
402 : 584534 : if (dump_enabled_p ())
403 : 34058 : dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
404 : 584534 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
405 : :
406 : : /* Mark if we have a non-linear IV. */
407 : 584534 : LOOP_VINFO_NON_LINEAR_IV (loop_vinfo)
408 : 584534 : = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_vinfo) != vect_step_op_add;
409 : : }
410 : :
411 : :
412 : : /* Second - identify all reductions and nested cycles. */
413 : 411613 : while (worklist.length () > 0)
414 : : {
415 : 84670 : stmt_vec_info stmt_vinfo = worklist.pop ();
416 : 84670 : gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
417 : 84670 : tree def = PHI_RESULT (phi);
418 : :
419 : 84670 : if (dump_enabled_p ())
420 : 4517 : dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
421 : : (gimple *) phi);
422 : :
423 : 169340 : gcc_assert (!virtual_operand_p (def)
424 : : && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
425 : :
426 : 84670 : gphi *double_reduc;
427 : 84670 : stmt_vec_info reduc_stmt_info
428 : 84670 : = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc);
429 : 84670 : if (reduc_stmt_info && double_reduc)
430 : : {
431 : 1026 : stmt_vec_info inner_phi_info
432 : 1026 : = loop_vinfo->lookup_stmt (double_reduc);
433 : : /* ??? Pass down flag we're the inner loop of a double reduc. */
434 : 1026 : stmt_vec_info inner_reduc_info
435 : 1026 : = vect_is_simple_reduction (loop_vinfo, inner_phi_info, NULL);
436 : 1026 : if (inner_reduc_info)
437 : : {
438 : 926 : STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
439 : 926 : STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
440 : 926 : STMT_VINFO_REDUC_DEF (inner_phi_info) = inner_reduc_info;
441 : 926 : STMT_VINFO_REDUC_DEF (inner_reduc_info) = inner_phi_info;
442 : 926 : if (dump_enabled_p ())
443 : 120 : dump_printf_loc (MSG_NOTE, vect_location,
444 : : "Detected double reduction.\n");
445 : :
446 : 926 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
447 : 926 : STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
448 : 926 : STMT_VINFO_DEF_TYPE (inner_phi_info) = vect_nested_cycle;
449 : : /* Make it accessible for SLP vectorization. */
450 : 926 : LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt_info);
451 : : }
452 : 100 : else if (dump_enabled_p ())
453 : 12 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
454 : : "Unknown def-use cycle pattern.\n");
455 : : }
456 : 83644 : else if (reduc_stmt_info)
457 : : {
458 : 62506 : if (loop != LOOP_VINFO_LOOP (loop_vinfo))
459 : : {
460 : 2192 : if (dump_enabled_p ())
461 : 357 : dump_printf_loc (MSG_NOTE, vect_location,
462 : : "Detected vectorizable nested cycle.\n");
463 : :
464 : 2192 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
465 : : }
466 : : else
467 : : {
468 : 60314 : STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
469 : 60314 : STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
470 : 60314 : if (dump_enabled_p ())
471 : 3586 : dump_printf_loc (MSG_NOTE, vect_location,
472 : : "Detected reduction.\n");
473 : :
474 : 60314 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
475 : 60314 : STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
476 : 60314 : LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt_info);
477 : : }
478 : : }
479 : 21138 : else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
480 : 486 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
481 : : else
482 : 20652 : if (dump_enabled_p ())
483 : 370 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
484 : : "Unknown def-use cycle pattern.\n");
485 : : }
486 : 326943 : }
487 : :
488 : :
489 : : /* Function vect_analyze_scalar_cycles.
490 : :
491 : : Examine the cross iteration def-use cycles of scalar variables, by
492 : : analyzing the loop-header PHIs of scalar variables. Classify each
493 : : cycle as one of the following: invariant, induction, reduction, unknown.
494 : : We do that for the loop represented by LOOP_VINFO, and also to its
495 : : inner-loop, if exists.
496 : : Examples for scalar cycles:
497 : :
498 : : Example1: reduction:
499 : :
500 : : loop1:
501 : : for (i=0; i<N; i++)
502 : : sum += a[i];
503 : :
504 : : Example2: induction:
505 : :
506 : : loop2:
507 : : for (i=0; i<N; i++)
508 : : a[i] = i; */
509 : :
510 : : static void
511 : 321636 : vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
512 : : {
513 : 321636 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
514 : :
515 : 321636 : vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
516 : :
517 : : /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
518 : : Reductions in such inner-loop therefore have different properties than
519 : : the reductions in the nest that gets vectorized:
520 : : 1. When vectorized, they are executed in the same order as in the original
521 : : scalar loop, so we can't change the order of computation when
522 : : vectorizing them.
523 : : 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
524 : : current checks are too strict. */
525 : :
526 : 321636 : if (loop->inner)
527 : 5307 : vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
528 : 321636 : }
529 : :
530 : : /* Function vect_get_loop_niters.
531 : :
532 : : Determine how many iterations the loop is executed and place it
533 : : in NUMBER_OF_ITERATIONS. Place the number of latch iterations
534 : : in NUMBER_OF_ITERATIONSM1. Place the condition under which the
535 : : niter information holds in ASSUMPTIONS.
536 : :
537 : : Return the loop exit conditions. */
538 : :
539 : :
540 : : static vec<gcond *>
541 : 266509 : vect_get_loop_niters (class loop *loop, const_edge main_exit, tree *assumptions,
542 : : tree *number_of_iterations, tree *number_of_iterationsm1)
543 : : {
544 : 266509 : auto_vec<edge> exits = get_loop_exit_edges (loop);
545 : 266509 : vec<gcond *> conds;
546 : 533018 : conds.create (exits.length ());
547 : 266509 : class tree_niter_desc niter_desc;
548 : 266509 : tree niter_assumptions, niter, may_be_zero;
549 : :
550 : 266509 : *assumptions = boolean_true_node;
551 : 266509 : *number_of_iterationsm1 = chrec_dont_know;
552 : 266509 : *number_of_iterations = chrec_dont_know;
553 : :
554 : 266509 : DUMP_VECT_SCOPE ("get_loop_niters");
555 : :
556 : 266509 : if (exits.is_empty ())
557 : 0 : return conds;
558 : :
559 : 266509 : if (dump_enabled_p ())
560 : 13974 : dump_printf_loc (MSG_NOTE, vect_location, "Loop has %d exits.\n",
561 : : exits.length ());
562 : :
563 : : edge exit;
564 : : unsigned int i;
565 : 641889 : FOR_EACH_VEC_ELT (exits, i, exit)
566 : : {
567 : 375380 : gcond *cond = get_loop_exit_condition (exit);
568 : 375380 : if (cond)
569 : 365694 : conds.safe_push (cond);
570 : :
571 : 375380 : if (dump_enabled_p ())
572 : 15045 : dump_printf_loc (MSG_NOTE, vect_location, "Analyzing exit %d...\n", i);
573 : :
574 : 375380 : if (exit != main_exit)
575 : 150009 : continue;
576 : :
577 : 266509 : may_be_zero = NULL_TREE;
578 : 266509 : if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
579 : 266509 : || chrec_contains_undetermined (niter_desc.niter))
580 : 41138 : continue;
581 : :
582 : 225371 : niter_assumptions = niter_desc.assumptions;
583 : 225371 : may_be_zero = niter_desc.may_be_zero;
584 : 225371 : niter = niter_desc.niter;
585 : :
586 : 225371 : if (may_be_zero && integer_zerop (may_be_zero))
587 : : may_be_zero = NULL_TREE;
588 : :
589 : 9736 : if (may_be_zero)
590 : : {
591 : 9736 : if (COMPARISON_CLASS_P (may_be_zero))
592 : : {
593 : : /* Try to combine may_be_zero with assumptions, this can simplify
594 : : computation of niter expression. */
595 : 9736 : if (niter_assumptions && !integer_nonzerop (niter_assumptions))
596 : 1083 : niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
597 : : niter_assumptions,
598 : : fold_build1 (TRUTH_NOT_EXPR,
599 : : boolean_type_node,
600 : : may_be_zero));
601 : : else
602 : 8653 : niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
603 : : build_int_cst (TREE_TYPE (niter), 0),
604 : : rewrite_to_non_trapping_overflow (niter));
605 : :
606 : 225371 : may_be_zero = NULL_TREE;
607 : : }
608 : 0 : else if (integer_nonzerop (may_be_zero))
609 : : {
610 : 0 : *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
611 : 0 : *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
612 : 0 : continue;
613 : : }
614 : : else
615 : 0 : continue;
616 : : }
617 : :
618 : : /* Loop assumptions are based off the normal exit. */
619 : 225371 : *assumptions = niter_assumptions;
620 : 225371 : *number_of_iterationsm1 = niter;
621 : :
622 : : /* We want the number of loop header executions which is the number
623 : : of latch executions plus one.
624 : : ??? For UINT_MAX latch executions this number overflows to zero
625 : : for loops like do { n++; } while (n != 0); */
626 : 225371 : if (niter && !chrec_contains_undetermined (niter))
627 : : {
628 : 225371 : niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter),
629 : : unshare_expr (niter),
630 : : build_int_cst (TREE_TYPE (niter), 1));
631 : 225371 : if (TREE_CODE (niter) == INTEGER_CST
632 : 121298 : && TREE_CODE (*number_of_iterationsm1) != INTEGER_CST)
633 : : {
634 : : /* If we manage to fold niter + 1 into INTEGER_CST even when
635 : : niter is some complex expression, ensure back
636 : : *number_of_iterationsm1 is an INTEGER_CST as well. See
637 : : PR113210. */
638 : 0 : *number_of_iterationsm1
639 : 0 : = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), niter,
640 : : build_minus_one_cst (TREE_TYPE (niter)));
641 : : }
642 : : }
643 : 225371 : *number_of_iterations = niter;
644 : : }
645 : :
646 : 266509 : if (dump_enabled_p ())
647 : 13974 : dump_printf_loc (MSG_NOTE, vect_location, "All loop exits successfully analyzed.\n");
648 : :
649 : 266509 : return conds;
650 : 266509 : }
651 : :
652 : : /* Determine the main loop exit for the vectorizer. */
653 : :
654 : : edge
655 : 502583 : vec_init_loop_exit_info (class loop *loop)
656 : : {
657 : : /* Before we begin we must first determine which exit is the main one and
658 : : which are auxilary exits. */
659 : 502583 : auto_vec<edge> exits = get_loop_exit_edges (loop);
660 : 502583 : if (exits.length () == 1)
661 : 319693 : return exits[0];
662 : :
663 : : /* If we have multiple exits we only support counting IV at the moment.
664 : : Analyze all exits and return the last one we can analyze. */
665 : 182890 : class tree_niter_desc niter_desc;
666 : 182890 : edge candidate = NULL;
667 : 1186692 : for (edge exit : exits)
668 : : {
669 : 647888 : if (!get_loop_exit_condition (exit))
670 : 147785 : continue;
671 : :
672 : 500103 : if (number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
673 : 500103 : && !chrec_contains_undetermined (niter_desc.niter))
674 : : {
675 : 138285 : tree may_be_zero = niter_desc.may_be_zero;
676 : 138285 : if ((integer_zerop (may_be_zero)
677 : : /* As we are handling may_be_zero that's not false by
678 : : rewriting niter to may_be_zero ? 0 : niter we require
679 : : an empty latch. */
680 : 659054 : || (single_pred_p (loop->latch)
681 : 10837 : && exit->src == single_pred (loop->latch)
682 : 2711 : && (integer_nonzerop (may_be_zero)
683 : 2711 : || COMPARISON_CLASS_P (may_be_zero))))
684 : 140996 : && (!candidate
685 : 7085 : || dominated_by_p (CDI_DOMINATORS, exit->src,
686 : 7085 : candidate->src)))
687 : : candidate = exit;
688 : : }
689 : : }
690 : :
691 : 182890 : return candidate;
692 : 182890 : }
693 : :
694 : : /* Function bb_in_loop_p
695 : :
696 : : Used as predicate for dfs order traversal of the loop bbs. */
697 : :
698 : : static bool
699 : 1319405 : bb_in_loop_p (const_basic_block bb, const void *data)
700 : : {
701 : 1319405 : const class loop *const loop = (const class loop *)data;
702 : 1319405 : if (flow_bb_inside_loop_p (loop, bb))
703 : : return true;
704 : : return false;
705 : : }
706 : :
707 : :
708 : : /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
709 : : stmt_vec_info structs for all the stmts in LOOP_IN. */
710 : :
711 : 417326 : _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
712 : : : vec_info (vec_info::loop, shared),
713 : 417326 : loop (loop_in),
714 : 417326 : num_itersm1 (NULL_TREE),
715 : 417326 : num_iters (NULL_TREE),
716 : 417326 : num_iters_unchanged (NULL_TREE),
717 : 417326 : num_iters_assumptions (NULL_TREE),
718 : 417326 : vector_costs (nullptr),
719 : 417326 : scalar_costs (nullptr),
720 : 417326 : th (0),
721 : 417326 : versioning_threshold (0),
722 : 417326 : vectorization_factor (0),
723 : 417326 : main_loop_edge (nullptr),
724 : 417326 : skip_main_loop_edge (nullptr),
725 : 417326 : skip_this_loop_edge (nullptr),
726 : 417326 : reusable_accumulators (),
727 : 417326 : suggested_unroll_factor (1),
728 : 417326 : max_vectorization_factor (0),
729 : 417326 : mask_skip_niters (NULL_TREE),
730 : 417326 : mask_skip_niters_pfa_offset (NULL_TREE),
731 : 417326 : rgroup_compare_type (NULL_TREE),
732 : 417326 : simd_if_cond (NULL_TREE),
733 : 417326 : partial_vector_style (vect_partial_vectors_none),
734 : 417326 : unaligned_dr (NULL),
735 : 417326 : peeling_for_alignment (0),
736 : 417326 : ptr_mask (0),
737 : 417326 : max_spec_read_amount (0),
738 : 417326 : nonlinear_iv (false),
739 : 417326 : ivexpr_map (NULL),
740 : 417326 : scan_map (NULL),
741 : 417326 : inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
742 : 417326 : vectorizable (false),
743 : 417326 : can_use_partial_vectors_p (true),
744 : 417326 : must_use_partial_vectors_p (false),
745 : 417326 : using_partial_vectors_p (false),
746 : 417326 : using_decrementing_iv_p (false),
747 : 417326 : using_select_vl_p (false),
748 : 417326 : allow_mutual_alignment (false),
749 : 417326 : partial_load_store_bias (0),
750 : 417326 : peeling_for_gaps (false),
751 : 417326 : peeling_for_niter (false),
752 : 417326 : early_breaks (false),
753 : 417326 : user_unroll (false),
754 : 417326 : no_data_dependencies (false),
755 : 417326 : has_mask_store (false),
756 : 417326 : scalar_loop_scaling (profile_probability::uninitialized ()),
757 : 417326 : scalar_loop (NULL),
758 : 417326 : main_loop_info (NULL),
759 : 417326 : orig_loop_info (NULL),
760 : 417326 : epilogue_vinfo (NULL),
761 : 417326 : drs_advanced_by (NULL_TREE),
762 : 417326 : vec_loop_iv_exit (NULL),
763 : 417326 : vec_epilogue_loop_iv_exit (NULL),
764 : 417326 : scalar_loop_iv_exit (NULL)
765 : : {
766 : : /* CHECKME: We want to visit all BBs before their successors (except for
767 : : latch blocks, for which this assertion wouldn't hold). In the simple
768 : : case of the loop forms we allow, a dfs order of the BBs would the same
769 : : as reversed postorder traversal, so we are safe. */
770 : :
771 : 417326 : bbs = XCNEWVEC (basic_block, loop->num_nodes);
772 : 834652 : nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p, bbs,
773 : 417326 : loop->num_nodes, loop);
774 : 417326 : gcc_assert (nbbs == loop->num_nodes);
775 : :
776 : 1507099 : for (unsigned int i = 0; i < nbbs; i++)
777 : : {
778 : 1089773 : basic_block bb = bbs[i];
779 : 1089773 : gimple_stmt_iterator si;
780 : :
781 : 2214360 : for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
782 : : {
783 : 1124587 : gimple *phi = gsi_stmt (si);
784 : 1124587 : gimple_set_uid (phi, 0);
785 : 1124587 : add_stmt (phi);
786 : : }
787 : :
788 : 9298108 : for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
789 : : {
790 : 7118562 : gimple *stmt = gsi_stmt (si);
791 : 7118562 : gimple_set_uid (stmt, 0);
792 : 7118562 : if (is_gimple_debug (stmt))
793 : 2668423 : continue;
794 : 4450139 : add_stmt (stmt);
795 : : /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
796 : : third argument is the #pragma omp simd if (x) condition, when 0,
797 : : loop shouldn't be vectorized, when non-zero constant, it should
798 : : be vectorized normally, otherwise versioned with vectorized loop
799 : : done if the condition is non-zero at runtime. */
800 : 4450139 : if (loop_in->simduid
801 : 43390 : && is_gimple_call (stmt)
802 : 4268 : && gimple_call_internal_p (stmt)
803 : 4141 : && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
804 : 4137 : && gimple_call_num_args (stmt) >= 3
805 : 103 : && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
806 : 4450242 : && (loop_in->simduid
807 : 103 : == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
808 : : {
809 : 103 : tree arg = gimple_call_arg (stmt, 2);
810 : 103 : if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
811 : 103 : simd_if_cond = arg;
812 : : else
813 : 0 : gcc_assert (integer_nonzerop (arg));
814 : : }
815 : : }
816 : : }
817 : 417326 : }
818 : :
819 : : /* Free all levels of rgroup CONTROLS. */
820 : :
821 : : void
822 : 1085301 : release_vec_loop_controls (vec<rgroup_controls> *controls)
823 : : {
824 : 1085301 : rgroup_controls *rgc;
825 : 1085301 : unsigned int i;
826 : 1102928 : FOR_EACH_VEC_ELT (*controls, i, rgc)
827 : 17627 : rgc->controls.release ();
828 : 1085301 : controls->release ();
829 : 1085301 : }
830 : :
831 : : /* Free all memory used by the _loop_vec_info, as well as all the
832 : : stmt_vec_info structs of all the stmts in the loop. */
833 : :
834 : 417326 : _loop_vec_info::~_loop_vec_info ()
835 : : {
836 : 417326 : free (bbs);
837 : :
838 : 417326 : release_vec_loop_controls (&masks.rgc_vec);
839 : 417326 : release_vec_loop_controls (&lens);
840 : 421138 : delete ivexpr_map;
841 : 417648 : delete scan_map;
842 : 417326 : delete scalar_costs;
843 : 417326 : delete vector_costs;
844 : 560754 : for (auto reduc_info : reduc_infos)
845 : 139572 : delete reduc_info;
846 : :
847 : : /* When we release an epiloge vinfo that we do not intend to use
848 : : avoid clearing AUX of the main loop which should continue to
849 : : point to the main loop vinfo since otherwise we'll leak that. */
850 : 417326 : if (loop->aux == this)
851 : 60469 : loop->aux = NULL;
852 : 834652 : }
853 : :
854 : : /* Return an invariant or register for EXPR and emit necessary
855 : : computations in the LOOP_VINFO loop preheader. */
856 : :
857 : : tree
858 : 19449 : cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
859 : : {
860 : 19449 : if (is_gimple_reg (expr)
861 : 19449 : || is_gimple_min_invariant (expr))
862 : 6447 : return expr;
863 : :
864 : 13002 : if (! loop_vinfo->ivexpr_map)
865 : 3812 : loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
866 : 13002 : tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
867 : 13002 : if (! cached)
868 : : {
869 : 8494 : gimple_seq stmts = NULL;
870 : 8494 : cached = force_gimple_operand (unshare_expr (expr),
871 : : &stmts, true, NULL_TREE);
872 : 8494 : if (stmts)
873 : : {
874 : 8354 : edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
875 : 8354 : gsi_insert_seq_on_edge_immediate (e, stmts);
876 : : }
877 : : }
878 : 13002 : return cached;
879 : : }
880 : :
881 : : /* Return true if we can use CMP_TYPE as the comparison type to produce
882 : : all masks required to mask LOOP_VINFO. */
883 : :
884 : : static bool
885 : 79097 : can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
886 : : {
887 : 79097 : rgroup_controls *rgm;
888 : 79097 : unsigned int i;
889 : 91702 : FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
890 : 91702 : if (rgm->type != NULL_TREE
891 : 91702 : && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
892 : : cmp_type, rgm->type,
893 : : OPTIMIZE_FOR_SPEED))
894 : : return false;
895 : : return true;
896 : : }
897 : :
898 : : /* Calculate the maximum number of scalars per iteration for every
899 : : rgroup in LOOP_VINFO. */
900 : :
901 : : static unsigned int
902 : 16820 : vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
903 : : {
904 : 16820 : unsigned int res = 1;
905 : 16820 : unsigned int i;
906 : 16820 : rgroup_controls *rgm;
907 : 41181 : FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
908 : 24361 : res = MAX (res, rgm->max_nscalars_per_iter);
909 : 16820 : return res;
910 : : }
911 : :
912 : : /* Calculate the minimum precision necessary to represent:
913 : :
914 : : MAX_NITERS * FACTOR
915 : :
916 : : as an unsigned integer, where MAX_NITERS is the maximum number of
917 : : loop header iterations for the original scalar form of LOOP_VINFO. */
918 : :
919 : : static unsigned
920 : 16820 : vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
921 : : {
922 : 16820 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
923 : :
924 : : /* Get the maximum number of iterations that is representable
925 : : in the counter type. */
926 : 16820 : tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
927 : 16820 : widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
928 : :
929 : : /* Get a more refined estimate for the number of iterations. */
930 : 16820 : widest_int max_back_edges;
931 : 16820 : if (max_loop_iterations (loop, &max_back_edges))
932 : 16820 : max_ni = wi::smin (max_ni, max_back_edges + 1);
933 : :
934 : : /* Work out how many bits we need to represent the limit. */
935 : 16820 : return wi::min_precision (max_ni * factor, UNSIGNED);
936 : 16820 : }
937 : :
938 : : /* True if the loop needs peeling or partial vectors when vectorized. */
939 : :
940 : : static bool
941 : 113213 : vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
942 : : {
943 : 113213 : unsigned HOST_WIDE_INT const_vf;
944 : :
945 : 113213 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
946 : : return true;
947 : :
948 : 12385 : loop_vec_info main_loop_vinfo
949 : 111969 : = (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
950 : 111969 : ? LOOP_VINFO_MAIN_LOOP_INFO (loop_vinfo) : loop_vinfo);
951 : 111969 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
952 : 52976 : && LOOP_VINFO_PEELING_FOR_ALIGNMENT (main_loop_vinfo) >= 0)
953 : : {
954 : : /* Work out the (constant) number of iterations that need to be
955 : : peeled for reasons other than niters. */
956 : 52933 : unsigned int peel_niter
957 : : = LOOP_VINFO_PEELING_FOR_ALIGNMENT (main_loop_vinfo);
958 : 52933 : return !multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
959 : 52933 : LOOP_VINFO_VECT_FACTOR (loop_vinfo));
960 : : }
961 : :
962 : 59036 : if (!LOOP_VINFO_PEELING_FOR_ALIGNMENT (main_loop_vinfo)
963 : 59036 : && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf))
964 : : {
965 : : /* When the number of iterations is a multiple of the vectorization
966 : : factor and we are not doing prologue or forced epilogue peeling
967 : : the epilogue isn't necessary. */
968 : 58755 : if (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
969 : 117510 : >= (unsigned) exact_log2 (const_vf))
970 : : return false;
971 : : }
972 : :
973 : : return true;
974 : : }
975 : :
976 : : /* Each statement in LOOP_VINFO can be masked where necessary. Check
977 : : whether we can actually generate the masks required. Return true if so,
978 : : storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
979 : :
980 : : static bool
981 : 16820 : vect_verify_full_masking (loop_vec_info loop_vinfo)
982 : : {
983 : 16820 : unsigned int min_ni_width;
984 : :
985 : : /* Use a normal loop if there are no statements that need masking.
986 : : This only happens in rare degenerate cases: it means that the loop
987 : : has no loads, no stores, and no live-out values. */
988 : 16820 : if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
989 : : return false;
990 : :
991 : : /* Produce the rgroup controls. */
992 : 67558 : for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
993 : : {
994 : 25369 : vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
995 : 25369 : tree vectype = mask.first;
996 : 25369 : unsigned nvectors = mask.second;
997 : :
998 : 33918 : if (masks->rgc_vec.length () < nvectors)
999 : 18591 : masks->rgc_vec.safe_grow_cleared (nvectors, true);
1000 : 25369 : rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
1001 : : /* The number of scalars per iteration and the number of vectors are
1002 : : both compile-time constants. */
1003 : 25369 : unsigned int nscalars_per_iter
1004 : 25369 : = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1005 : 25369 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1006 : :
1007 : 25369 : if (rgm->max_nscalars_per_iter < nscalars_per_iter)
1008 : : {
1009 : 20105 : rgm->max_nscalars_per_iter = nscalars_per_iter;
1010 : 20105 : rgm->type = truth_type_for (vectype);
1011 : 20105 : rgm->factor = 1;
1012 : : }
1013 : : }
1014 : :
1015 : 16820 : unsigned int max_nscalars_per_iter
1016 : 16820 : = vect_get_max_nscalars_per_iter (loop_vinfo);
1017 : :
1018 : : /* Work out how many bits we need to represent the limit. */
1019 : 16820 : min_ni_width
1020 : 16820 : = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1021 : :
1022 : : /* Find a scalar mode for which WHILE_ULT is supported. */
1023 : 16820 : opt_scalar_int_mode cmp_mode_iter;
1024 : 16820 : tree cmp_type = NULL_TREE;
1025 : 16820 : tree iv_type = NULL_TREE;
1026 : 16820 : widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1027 : 16820 : unsigned int iv_precision = UINT_MAX;
1028 : :
1029 : 16820 : if (iv_limit != -1)
1030 : 16820 : iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1031 : : UNSIGNED);
1032 : :
1033 : 134560 : FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1034 : : {
1035 : 117740 : unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1036 : 117740 : if (cmp_bits >= min_ni_width
1037 : 117740 : && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1038 : : {
1039 : 79097 : tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1040 : 79097 : if (this_type
1041 : 79097 : && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1042 : : {
1043 : : /* Although we could stop as soon as we find a valid mode,
1044 : : there are at least two reasons why that's not always the
1045 : : best choice:
1046 : :
1047 : : - An IV that's Pmode or wider is more likely to be reusable
1048 : : in address calculations than an IV that's narrower than
1049 : : Pmode.
1050 : :
1051 : : - Doing the comparison in IV_PRECISION or wider allows
1052 : : a natural 0-based IV, whereas using a narrower comparison
1053 : : type requires mitigations against wrap-around.
1054 : :
1055 : : Conversely, if the IV limit is variable, doing the comparison
1056 : : in a wider type than the original type can introduce
1057 : : unnecessary extensions, so picking the widest valid mode
1058 : : is not always a good choice either.
1059 : :
1060 : : Here we prefer the first IV type that's Pmode or wider,
1061 : : and the first comparison type that's IV_PRECISION or wider.
1062 : : (The comparison type must be no wider than the IV type,
1063 : : to avoid extensions in the vector loop.)
1064 : :
1065 : : ??? We might want to try continuing beyond Pmode for ILP32
1066 : : targets if CMP_BITS < IV_PRECISION. */
1067 : 0 : iv_type = this_type;
1068 : 0 : if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1069 : : cmp_type = this_type;
1070 : 0 : if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1071 : : break;
1072 : : }
1073 : : }
1074 : : }
1075 : :
1076 : 16820 : if (!cmp_type)
1077 : : {
1078 : 16820 : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
1079 : 16820 : return false;
1080 : : }
1081 : :
1082 : 0 : LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1083 : 0 : LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1084 : 0 : LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
1085 : 0 : return true;
1086 : 16820 : }
1087 : :
1088 : : /* Each statement in LOOP_VINFO can be masked where necessary. Check
1089 : : whether we can actually generate AVX512 style masks. Return true if so,
1090 : : storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE. */
1091 : :
1092 : : static bool
1093 : 16820 : vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
1094 : : {
1095 : : /* Produce differently organized rgc_vec and differently check
1096 : : we can produce masks. */
1097 : :
1098 : : /* Use a normal loop if there are no statements that need masking.
1099 : : This only happens in rare degenerate cases: it means that the loop
1100 : : has no loads, no stores, and no live-out values. */
1101 : 16820 : if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1102 : : return false;
1103 : :
1104 : : /* For the decrementing IV we need to represent all values in
1105 : : [0, niter + niter_skip] where niter_skip is the elements we
1106 : : skip in the first iteration for prologue peeling. */
1107 : 16820 : tree iv_type = NULL_TREE;
1108 : 16820 : widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1109 : 16820 : unsigned int iv_precision = UINT_MAX;
1110 : 16820 : if (iv_limit != -1)
1111 : 16820 : iv_precision = wi::min_precision (iv_limit, UNSIGNED);
1112 : :
1113 : : /* First compute the type for the IV we use to track the remaining
1114 : : scalar iterations. */
1115 : 16820 : opt_scalar_int_mode cmp_mode_iter;
1116 : 21806 : FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1117 : : {
1118 : 21806 : unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1119 : 21806 : if (cmp_bits >= iv_precision
1120 : 21806 : && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1121 : : {
1122 : 16820 : iv_type = build_nonstandard_integer_type (cmp_bits, true);
1123 : 16820 : if (iv_type)
1124 : : break;
1125 : : }
1126 : : }
1127 : 16820 : if (!iv_type)
1128 : : return false;
1129 : :
1130 : : /* Produce the rgroup controls. */
1131 : 67558 : for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1132 : : {
1133 : 25369 : vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1134 : 25369 : tree vectype = mask.first;
1135 : 25369 : unsigned nvectors = mask.second;
1136 : :
1137 : : /* The number of scalars per iteration and the number of vectors are
1138 : : both compile-time constants. */
1139 : 25369 : unsigned int nscalars_per_iter
1140 : 25369 : = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1141 : 25369 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1142 : :
1143 : : /* We index the rgroup_controls vector with nscalars_per_iter
1144 : : which we keep constant and instead have a varying nvectors,
1145 : : remembering the vector mask with the fewest nV. */
1146 : 33918 : if (masks->rgc_vec.length () < nscalars_per_iter)
1147 : 16873 : masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true);
1148 : 25369 : rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];
1149 : :
1150 : 25369 : if (!rgm->type || rgm->factor > nvectors)
1151 : : {
1152 : 18388 : rgm->type = truth_type_for (vectype);
1153 : 18388 : rgm->compare_type = NULL_TREE;
1154 : 18388 : rgm->max_nscalars_per_iter = nscalars_per_iter;
1155 : 18388 : rgm->factor = nvectors;
1156 : 18388 : rgm->bias_adjusted_ctrl = NULL_TREE;
1157 : : }
1158 : : }
1159 : :
1160 : : /* There is no fixed compare type we are going to use but we have to
1161 : : be able to get at one for each mask group. */
1162 : 16820 : unsigned int min_ni_width
1163 : 16820 : = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);
1164 : :
1165 : 16820 : bool ok = true;
1166 : 64364 : for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1167 : : {
1168 : 17585 : tree mask_type = rgc.type;
1169 : 17585 : if (!mask_type)
1170 : 689 : continue;
1171 : :
1172 : : /* For now vect_get_loop_mask only supports integer mode masks
1173 : : when we need to split it. */
1174 : 16896 : if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_INT
1175 : 16896 : || TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
1176 : : {
1177 : : ok = false;
1178 : : break;
1179 : : }
1180 : :
1181 : : /* If iv_type is usable as compare type use that - we can elide the
1182 : : saturation in that case. */
1183 : 13219 : if (TYPE_PRECISION (iv_type) >= min_ni_width)
1184 : : {
1185 : 13219 : tree cmp_vectype
1186 : 13219 : = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
1187 : 13219 : if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1188 : 4696 : rgc.compare_type = cmp_vectype;
1189 : : }
1190 : 13219 : if (!rgc.compare_type)
1191 : 24920 : FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1192 : : {
1193 : 24916 : unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1194 : 24916 : if (cmp_bits >= min_ni_width
1195 : 24916 : && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1196 : : {
1197 : 24904 : tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
1198 : 24904 : if (!cmp_type)
1199 : 0 : continue;
1200 : :
1201 : : /* Check whether we can produce the mask with cmp_type. */
1202 : 24904 : tree cmp_vectype
1203 : 24904 : = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
1204 : 24904 : if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1205 : : {
1206 : 8519 : rgc.compare_type = cmp_vectype;
1207 : 8519 : break;
1208 : : }
1209 : : }
1210 : : }
1211 : 13219 : if (!rgc.compare_type)
1212 : : {
1213 : : ok = false;
1214 : : break;
1215 : : }
1216 : : }
1217 : 16820 : if (!ok)
1218 : : {
1219 : 3681 : release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
1220 : 3681 : return false;
1221 : : }
1222 : :
1223 : 13139 : LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
1224 : 13139 : LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1225 : 13139 : LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
1226 : 13139 : return true;
1227 : 16820 : }
1228 : :
1229 : : /* Check whether we can use vector access with length based on precison
1230 : : comparison. So far, to keep it simple, we only allow the case that the
1231 : : precision of the target supported length is larger than the precision
1232 : : required by loop niters. */
1233 : :
1234 : : static bool
1235 : 6 : vect_verify_loop_lens (loop_vec_info loop_vinfo)
1236 : : {
1237 : 6 : if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1238 : : return false;
1239 : :
1240 : 0 : if (!VECTOR_MODE_P (loop_vinfo->vector_mode))
1241 : : return false;
1242 : :
1243 : 0 : machine_mode len_load_mode, len_store_mode;
1244 : 0 : if (!get_len_load_store_mode (loop_vinfo->vector_mode, true)
1245 : 0 : .exists (&len_load_mode))
1246 : 0 : return false;
1247 : 0 : if (!get_len_load_store_mode (loop_vinfo->vector_mode, false)
1248 : 0 : .exists (&len_store_mode))
1249 : 0 : return false;
1250 : :
1251 : 0 : signed char partial_load_bias = internal_len_load_store_bias
1252 : 0 : (IFN_LEN_LOAD, len_load_mode);
1253 : :
1254 : 0 : signed char partial_store_bias = internal_len_load_store_bias
1255 : 0 : (IFN_LEN_STORE, len_store_mode);
1256 : :
1257 : 0 : gcc_assert (partial_load_bias == partial_store_bias);
1258 : :
1259 : 0 : if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1260 : : return false;
1261 : :
1262 : : /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1263 : : len_loads with a length of zero. In order to avoid that we prohibit
1264 : : more than one loop length here. */
1265 : 0 : if (partial_load_bias == -1
1266 : 0 : && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1267 : : return false;
1268 : :
1269 : 0 : LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1270 : :
1271 : 0 : unsigned int max_nitems_per_iter = 1;
1272 : 0 : unsigned int i;
1273 : 0 : rgroup_controls *rgl;
1274 : : /* Find the maximum number of items per iteration for every rgroup. */
1275 : 0 : FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1276 : : {
1277 : 0 : unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1278 : 0 : max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1279 : : }
1280 : :
1281 : : /* Work out how many bits we need to represent the length limit. */
1282 : 0 : unsigned int min_ni_prec
1283 : 0 : = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1284 : :
1285 : : /* Now use the maximum of below precisions for one suitable IV type:
1286 : : - the IV's natural precision
1287 : : - the precision needed to hold: the maximum number of scalar
1288 : : iterations multiplied by the scale factor (min_ni_prec above)
1289 : : - the Pmode precision
1290 : :
1291 : : If min_ni_prec is less than the precision of the current niters,
1292 : : we perfer to still use the niters type. Prefer to use Pmode and
1293 : : wider IV to avoid narrow conversions. */
1294 : :
1295 : 0 : unsigned int ni_prec
1296 : 0 : = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1297 : 0 : min_ni_prec = MAX (min_ni_prec, ni_prec);
1298 : 0 : min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1299 : :
1300 : 0 : tree iv_type = NULL_TREE;
1301 : 0 : opt_scalar_int_mode tmode_iter;
1302 : 0 : FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1303 : : {
1304 : 0 : scalar_mode tmode = tmode_iter.require ();
1305 : 0 : unsigned int tbits = GET_MODE_BITSIZE (tmode);
1306 : :
1307 : : /* ??? Do we really want to construct one IV whose precision exceeds
1308 : : BITS_PER_WORD? */
1309 : 0 : if (tbits > BITS_PER_WORD)
1310 : : break;
1311 : :
1312 : : /* Find the first available standard integral type. */
1313 : 0 : if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1314 : : {
1315 : 0 : iv_type = build_nonstandard_integer_type (tbits, true);
1316 : 0 : break;
1317 : : }
1318 : : }
1319 : :
1320 : 0 : if (!iv_type)
1321 : : {
1322 : 0 : if (dump_enabled_p ())
1323 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1324 : : "can't vectorize with length-based partial vectors"
1325 : : " because there is no suitable iv type.\n");
1326 : 0 : return false;
1327 : : }
1328 : :
1329 : 0 : LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1330 : 0 : LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1331 : 0 : LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;
1332 : :
1333 : 0 : return true;
1334 : : }
1335 : :
1336 : : /* Calculate the cost of one scalar iteration of the loop. */
1337 : : static void
1338 : 286496 : vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1339 : : {
1340 : 286496 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1341 : 286496 : basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1342 : 286496 : int nbbs = loop->num_nodes, factor;
1343 : 286496 : int innerloop_iters, i;
1344 : :
1345 : 286496 : DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1346 : :
1347 : : /* Gather costs for statements in the scalar loop. */
1348 : :
1349 : : /* FORNOW. */
1350 : 286496 : innerloop_iters = 1;
1351 : 286496 : if (loop->inner)
1352 : 1261 : innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1353 : :
1354 : 1019469 : for (i = 0; i < nbbs; i++)
1355 : : {
1356 : 732973 : gimple_stmt_iterator si;
1357 : 732973 : basic_block bb = bbs[i];
1358 : :
1359 : 732973 : if (bb->loop_father == loop->inner)
1360 : : factor = innerloop_iters;
1361 : : else
1362 : 730451 : factor = 1;
1363 : :
1364 : 5839183 : for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1365 : : {
1366 : 4373237 : gimple *stmt = gsi_stmt (si);
1367 : 4373237 : stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1368 : :
1369 : 4373237 : if (!is_gimple_assign (stmt)
1370 : : && !is_gimple_call (stmt)
1371 : : && !is_a<gcond *> (stmt))
1372 : 1585959 : continue;
1373 : :
1374 : : /* Skip stmts that are not vectorized inside the loop. */
1375 : 2787278 : stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1376 : 2787278 : if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1377 : 1339100 : && (!STMT_VINFO_LIVE_P (vstmt_info)
1378 : 50 : || !VECTORIZABLE_CYCLE_DEF
1379 : : (STMT_VINFO_DEF_TYPE (vstmt_info))))
1380 : 1339100 : continue;
1381 : :
1382 : 1448178 : vect_cost_for_stmt kind;
1383 : 1448178 : if (STMT_VINFO_DATA_REF (stmt_info))
1384 : : {
1385 : 679634 : if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1386 : : kind = scalar_load;
1387 : : else
1388 : 242168 : kind = scalar_store;
1389 : : }
1390 : 768544 : else if (vect_nop_conversion_p (stmt_info))
1391 : 41001 : continue;
1392 : : else
1393 : : kind = scalar_stmt;
1394 : :
1395 : : /* We are using vect_prologue here to avoid scaling twice
1396 : : by the inner loop factor. */
1397 : 1407177 : record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1398 : : factor, kind, stmt_info, 0, vect_prologue);
1399 : : }
1400 : : }
1401 : :
1402 : : /* Now accumulate cost. */
1403 : 286496 : loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1404 : 286496 : add_stmt_costs (loop_vinfo->scalar_costs,
1405 : : &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1406 : 286496 : loop_vinfo->scalar_costs->finish_cost (nullptr);
1407 : 286496 : }
1408 : :
1409 : : /* Function vect_analyze_loop_form.
1410 : :
1411 : : Verify that certain CFG restrictions hold, including:
1412 : : - the loop has a pre-header
1413 : : - the loop has a single entry
1414 : : - nested loops can have only a single exit.
1415 : : - the loop exit condition is simple enough
1416 : : - the number of iterations can be analyzed, i.e, a countable loop. The
1417 : : niter could be analyzed under some assumptions. */
1418 : :
1419 : : opt_result
1420 : 468574 : vect_analyze_loop_form (class loop *loop, gimple *loop_vectorized_call,
1421 : : vect_loop_form_info *info)
1422 : : {
1423 : 468574 : DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1424 : :
1425 : 468574 : edge exit_e = vec_init_loop_exit_info (loop);
1426 : 468574 : if (!exit_e)
1427 : 60145 : return opt_result::failure_at (vect_location,
1428 : : "not vectorized:"
1429 : : " could not determine main exit from"
1430 : : " loop with multiple exits.\n");
1431 : 408429 : if (loop_vectorized_call)
1432 : : {
1433 : 26286 : tree arg = gimple_call_arg (loop_vectorized_call, 1);
1434 : 26286 : class loop *scalar_loop = get_loop (cfun, tree_to_shwi (arg));
1435 : 26286 : edge scalar_exit_e = vec_init_loop_exit_info (scalar_loop);
1436 : 26286 : if (!scalar_exit_e)
1437 : 0 : return opt_result::failure_at (vect_location,
1438 : : "not vectorized:"
1439 : : " could not determine main exit from"
1440 : : " loop with multiple exits.\n");
1441 : : }
1442 : :
1443 : 408429 : info->loop_exit = exit_e;
1444 : 408429 : if (dump_enabled_p ())
1445 : 15288 : dump_printf_loc (MSG_NOTE, vect_location,
1446 : : "using as main loop exit: %d -> %d [AUX: %p]\n",
1447 : 15288 : exit_e->src->index, exit_e->dest->index, exit_e->aux);
1448 : :
1449 : : /* Check if we have any control flow that doesn't leave the loop. */
1450 : 408429 : basic_block *bbs = get_loop_body (loop);
1451 : 1367008 : for (unsigned i = 0; i < loop->num_nodes; i++)
1452 : 1067182 : if (EDGE_COUNT (bbs[i]->succs) != 1
1453 : 1067182 : && (EDGE_COUNT (bbs[i]->succs) != 2
1454 : 630986 : || !loop_exits_from_bb_p (bbs[i]->loop_father, bbs[i])))
1455 : : {
1456 : 108603 : free (bbs);
1457 : 108603 : return opt_result::failure_at (vect_location,
1458 : : "not vectorized:"
1459 : : " unsupported control flow in loop.\n");
1460 : : }
1461 : 299826 : free (bbs);
1462 : :
1463 : : /* Different restrictions apply when we are considering an inner-most loop,
1464 : : vs. an outer (nested) loop.
1465 : : (FORNOW. May want to relax some of these restrictions in the future). */
1466 : :
1467 : 299826 : info->inner_loop_cond = NULL;
1468 : 299826 : if (!loop->inner)
1469 : : {
1470 : : /* Inner-most loop. */
1471 : :
1472 : 278222 : if (empty_block_p (loop->header))
1473 : 3 : return opt_result::failure_at (vect_location,
1474 : : "not vectorized: empty loop.\n");
1475 : : }
1476 : : else
1477 : : {
1478 : 21604 : class loop *innerloop = loop->inner;
1479 : 21604 : edge entryedge;
1480 : :
1481 : : /* Nested loop. We currently require that the loop is doubly-nested,
1482 : : contains a single inner loop with a single exit to the block
1483 : : with the single exit condition in the outer loop.
1484 : : Vectorizable outer-loops look like this:
1485 : :
1486 : : (pre-header)
1487 : : |
1488 : : header <---+
1489 : : | |
1490 : : inner-loop |
1491 : : | |
1492 : : tail ------+
1493 : : |
1494 : : (exit-bb)
1495 : :
1496 : : The inner-loop also has the properties expected of inner-most loops
1497 : : as described above. */
1498 : :
1499 : 21604 : if ((loop->inner)->inner || (loop->inner)->next)
1500 : 2971 : return opt_result::failure_at (vect_location,
1501 : : "not vectorized:"
1502 : : " multiple nested loops.\n");
1503 : :
1504 : 18633 : entryedge = loop_preheader_edge (innerloop);
1505 : 18633 : if (entryedge->src != loop->header
1506 : 18283 : || !single_exit (innerloop)
1507 : 29941 : || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1508 : 7610 : return opt_result::failure_at (vect_location,
1509 : : "not vectorized:"
1510 : : " unsupported outerloop form.\n");
1511 : :
1512 : : /* Analyze the inner-loop. */
1513 : 11023 : vect_loop_form_info inner;
1514 : 11023 : opt_result res = vect_analyze_loop_form (loop->inner, NULL, &inner);
1515 : 11023 : if (!res)
1516 : : {
1517 : 1272 : if (dump_enabled_p ())
1518 : 5 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1519 : : "not vectorized: Bad inner loop.\n");
1520 : 1272 : return res;
1521 : : }
1522 : :
1523 : : /* Don't support analyzing niter under assumptions for inner
1524 : : loop. */
1525 : 9751 : if (!integer_onep (inner.assumptions))
1526 : 283 : return opt_result::failure_at (vect_location,
1527 : : "not vectorized: Bad inner loop.\n");
1528 : :
1529 : 9468 : if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1530 : 1086 : return opt_result::failure_at (vect_location,
1531 : : "not vectorized: inner-loop count not"
1532 : : " invariant.\n");
1533 : :
1534 : 8382 : if (dump_enabled_p ())
1535 : 949 : dump_printf_loc (MSG_NOTE, vect_location,
1536 : : "Considering outer-loop vectorization.\n");
1537 : 8382 : info->inner_loop_cond = inner.conds[0];
1538 : 11023 : }
1539 : :
1540 : 286601 : if (EDGE_COUNT (loop->header->preds) != 2)
1541 : 0 : return opt_result::failure_at (vect_location,
1542 : : "not vectorized:"
1543 : : " too many incoming edges.\n");
1544 : :
1545 : : /* We assume that the latch is empty. */
1546 : 286601 : basic_block latch = loop->latch;
1547 : 286601 : do
1548 : : {
1549 : 286601 : if (!empty_block_p (latch)
1550 : 286601 : || !gimple_seq_empty_p (phi_nodes (latch)))
1551 : 20048 : return opt_result::failure_at (vect_location,
1552 : : "not vectorized: latch block not "
1553 : : "empty.\n");
1554 : 266553 : latch = single_pred (latch);
1555 : : }
1556 : 533106 : while (single_succ_p (latch));
1557 : :
1558 : : /* Make sure there is no abnormal exit. */
1559 : 266553 : auto_vec<edge> exits = get_loop_exit_edges (loop);
1560 : 1175050 : for (edge e : exits)
1561 : : {
1562 : 375435 : if (e->flags & EDGE_ABNORMAL)
1563 : 44 : return opt_result::failure_at (vect_location,
1564 : : "not vectorized:"
1565 : : " abnormal loop exit edge.\n");
1566 : : }
1567 : :
1568 : 266509 : info->conds
1569 : 266509 : = vect_get_loop_niters (loop, exit_e, &info->assumptions,
1570 : : &info->number_of_iterations,
1571 : 266509 : &info->number_of_iterationsm1);
1572 : 266509 : if (info->conds.is_empty ())
1573 : 36 : return opt_result::failure_at
1574 : 36 : (vect_location,
1575 : : "not vectorized: complicated exit condition.\n");
1576 : :
1577 : : /* Determine what the primary and alternate exit conds are. */
1578 : 632167 : for (unsigned i = 0; i < info->conds.length (); i++)
1579 : : {
1580 : 365694 : gcond *cond = info->conds[i];
1581 : 365694 : if (exit_e->src == gimple_bb (cond))
1582 : 266473 : std::swap (info->conds[0], info->conds[i]);
1583 : : }
1584 : :
1585 : 266473 : if (integer_zerop (info->assumptions)
1586 : 266473 : || !info->number_of_iterations
1587 : 532946 : || chrec_contains_undetermined (info->number_of_iterations))
1588 : 41102 : return opt_result::failure_at
1589 : 41102 : (info->conds[0],
1590 : : "not vectorized: number of iterations cannot be computed.\n");
1591 : :
1592 : 225371 : if (integer_zerop (info->number_of_iterations))
1593 : 14 : return opt_result::failure_at
1594 : 14 : (info->conds[0],
1595 : : "not vectorized: number of iterations = 0.\n");
1596 : :
1597 : 225357 : if (!(tree_fits_shwi_p (info->number_of_iterations)
1598 : 121277 : && tree_to_shwi (info->number_of_iterations) > 0))
1599 : : {
1600 : 104080 : if (dump_enabled_p ())
1601 : : {
1602 : 2413 : dump_printf_loc (MSG_NOTE, vect_location,
1603 : : "Symbolic number of iterations is ");
1604 : 2413 : dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1605 : 2413 : dump_printf (MSG_NOTE, "\n");
1606 : : }
1607 : : }
1608 : :
1609 : 225357 : if (!integer_onep (info->assumptions))
1610 : : {
1611 : 10359 : if (dump_enabled_p ())
1612 : : {
1613 : 63 : dump_printf_loc (MSG_NOTE, vect_location,
1614 : : "Loop to be versioned with niter assumption ");
1615 : 63 : dump_generic_expr (MSG_NOTE, TDF_SLIM, info->assumptions);
1616 : 63 : dump_printf (MSG_NOTE, "\n");
1617 : : }
1618 : : }
1619 : :
1620 : 225357 : return opt_result::success ();
1621 : 266553 : }
1622 : :
1623 : : /* Create a loop_vec_info for LOOP with SHARED and the
1624 : : vect_analyze_loop_form result. */
1625 : :
1626 : : loop_vec_info
1627 : 417326 : vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1628 : : const vect_loop_form_info *info,
1629 : : loop_vec_info orig_loop_info)
1630 : : {
1631 : 417326 : loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1632 : 417326 : LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1633 : 417326 : LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1634 : 417326 : LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1635 : 417326 : LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_info;
1636 : 417326 : if (orig_loop_info && LOOP_VINFO_EPILOGUE_P (orig_loop_info))
1637 : 171 : LOOP_VINFO_MAIN_LOOP_INFO (loop_vinfo)
1638 : 171 : = LOOP_VINFO_MAIN_LOOP_INFO (orig_loop_info);
1639 : : else
1640 : 417155 : LOOP_VINFO_MAIN_LOOP_INFO (loop_vinfo) = orig_loop_info;
1641 : : /* Also record the assumptions for versioning. */
1642 : 417326 : if (!integer_onep (info->assumptions) && !orig_loop_info)
1643 : 20617 : LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1644 : :
1645 : 1889286 : for (gcond *cond : info->conds)
1646 : : {
1647 : 637308 : stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (cond);
1648 : : /* Mark the statement as a condition. */
1649 : 637308 : STMT_VINFO_DEF_TYPE (loop_cond_info) = vect_condition_def;
1650 : : }
1651 : :
1652 : 637308 : for (unsigned i = 1; i < info->conds.length (); i ++)
1653 : 219982 : LOOP_VINFO_LOOP_CONDS (loop_vinfo).safe_push (info->conds[i]);
1654 : 417326 : LOOP_VINFO_LOOP_IV_COND (loop_vinfo) = info->conds[0];
1655 : :
1656 : 417326 : LOOP_VINFO_IV_EXIT (loop_vinfo) = info->loop_exit;
1657 : :
1658 : : /* Check to see if we're vectorizing multiple exits. */
1659 : 417326 : LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
1660 : 417326 : = !LOOP_VINFO_LOOP_CONDS (loop_vinfo).is_empty ();
1661 : :
1662 : 417326 : if (info->inner_loop_cond)
1663 : : {
1664 : : /* If we have an estimate on the number of iterations of the inner
1665 : : loop use that to limit the scale for costing, otherwise use
1666 : : --param vect-inner-loop-cost-factor literally. */
1667 : 8489 : widest_int nit;
1668 : 8489 : if (estimated_stmt_executions (loop->inner, &nit))
1669 : 7234 : LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1670 : 7234 : = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1671 : 8489 : }
1672 : :
1673 : 417326 : return loop_vinfo;
1674 : : }
1675 : :
1676 : :
1677 : :
1678 : : /* Return true if we know that the iteration count is smaller than the
1679 : : vectorization factor. Return false if it isn't, or if we can't be sure
1680 : : either way. */
1681 : :
1682 : : static bool
1683 : 112418 : vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
1684 : : {
1685 : 112418 : unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1686 : :
1687 : 112418 : HOST_WIDE_INT max_niter;
1688 : 112418 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1689 : 53183 : max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1690 : : else
1691 : 59235 : max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1692 : :
1693 : 112418 : if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1694 : 10572 : return true;
1695 : :
1696 : : return false;
1697 : : }
1698 : :
1699 : : /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1700 : : is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1701 : : definitely no, or -1 if it's worth retrying. */
1702 : :
1703 : : static int
1704 : 112426 : vect_analyze_loop_costing (loop_vec_info loop_vinfo,
1705 : : unsigned *suggested_unroll_factor)
1706 : : {
1707 : 112426 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1708 : 112426 : unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1709 : :
1710 : : /* Only loops that can handle partially-populated vectors can have iteration
1711 : : counts less than the vectorization factor. */
1712 : 112426 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
1713 : 112426 : && vect_known_niters_smaller_than_vf (loop_vinfo))
1714 : : {
1715 : 10562 : if (dump_enabled_p ())
1716 : 227 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1717 : : "not vectorized: iteration count smaller than "
1718 : : "vectorization factor.\n");
1719 : 10562 : return 0;
1720 : : }
1721 : :
1722 : : /* If we know the number of iterations we can do better, for the
1723 : : epilogue we can also decide whether the main loop leaves us
1724 : : with enough iterations, prefering a smaller vector epilog then
1725 : : also possibly used for the case we skip the vector loop. */
1726 : 101864 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1727 : : {
1728 : 43847 : widest_int scalar_niters
1729 : 43847 : = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
1730 : 43847 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1731 : : {
1732 : 2642 : loop_vec_info orig_loop_vinfo
1733 : : = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1734 : 2642 : loop_vec_info main_loop_vinfo
1735 : : = LOOP_VINFO_MAIN_LOOP_INFO (loop_vinfo);
1736 : 2642 : unsigned lowest_vf
1737 : 2642 : = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
1738 : 2642 : int prolog_peeling = 0;
1739 : 2642 : if (!vect_use_loop_mask_for_alignment_p (main_loop_vinfo))
1740 : 2642 : prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (main_loop_vinfo);
1741 : 2642 : if (prolog_peeling >= 0
1742 : 2642 : && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
1743 : : lowest_vf))
1744 : : {
1745 : 5274 : unsigned gap
1746 : 2637 : = LOOP_VINFO_PEELING_FOR_GAPS (main_loop_vinfo) ? 1 : 0;
1747 : 5274 : scalar_niters = ((scalar_niters - gap - prolog_peeling)
1748 : 5274 : % lowest_vf + gap);
1749 : : }
1750 : : }
1751 : : /* Reject vectorizing for a single scalar iteration, even if
1752 : : we could in principle implement that using partial vectors. */
1753 : 43847 : unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
1754 : 43847 : if (scalar_niters <= peeling_gap + 1)
1755 : : {
1756 : 812 : if (dump_enabled_p ())
1757 : 185 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1758 : : "not vectorized: loop only has a single "
1759 : : "scalar iteration.\n");
1760 : 812 : return 0;
1761 : : }
1762 : :
1763 : 43035 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1764 : : {
1765 : : /* Check that the loop processes at least one full vector. */
1766 : 43024 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1767 : 43024 : if (known_lt (scalar_niters, vf))
1768 : : {
1769 : 361 : if (dump_enabled_p ())
1770 : 293 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1771 : : "loop does not have enough iterations "
1772 : : "to support vectorization.\n");
1773 : 401 : return 0;
1774 : : }
1775 : :
1776 : : /* If we need to peel an extra epilogue iteration to handle data
1777 : : accesses with gaps, check that there are enough scalar iterations
1778 : : available.
1779 : :
1780 : : The check above is redundant with this one when peeling for gaps,
1781 : : but the distinction is useful for diagnostics. */
1782 : 42663 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1783 : 42953 : && known_le (scalar_niters, vf))
1784 : : {
1785 : 40 : if (dump_enabled_p ())
1786 : 9 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1787 : : "loop does not have enough iterations "
1788 : : "to support peeling for gaps.\n");
1789 : 40 : return 0;
1790 : : }
1791 : : }
1792 : 43847 : }
1793 : :
1794 : : /* If using the "very cheap" model. reject cases in which we'd keep
1795 : : a copy of the scalar code (even if we might be able to vectorize it). */
1796 : 100651 : if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
1797 : 100651 : && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1798 : 50957 : || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)))
1799 : : {
1800 : 721 : if (dump_enabled_p ())
1801 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1802 : : "some scalar iterations would need to be peeled\n");
1803 : 721 : return 0;
1804 : : }
1805 : :
1806 : 99930 : int min_profitable_iters, min_profitable_estimate;
1807 : 99930 : vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1808 : : &min_profitable_estimate,
1809 : : suggested_unroll_factor);
1810 : :
1811 : 99930 : if (min_profitable_iters < 0)
1812 : : {
1813 : 26011 : if (dump_enabled_p ())
1814 : 22 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1815 : : "not vectorized: vectorization not profitable.\n");
1816 : 26011 : if (dump_enabled_p ())
1817 : 22 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1818 : : "not vectorized: vector version will never be "
1819 : : "profitable.\n");
1820 : 26011 : return -1;
1821 : : }
1822 : :
1823 : 73919 : int min_scalar_loop_bound = (param_min_vect_loop_bound
1824 : 73919 : * assumed_vf);
1825 : :
1826 : : /* Use the cost model only if it is more conservative than user specified
1827 : : threshold. */
1828 : 73919 : unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1829 : : min_profitable_iters);
1830 : :
1831 : 73919 : LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1832 : :
1833 : 37423 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1834 : 111342 : && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1835 : : {
1836 : 388 : if (dump_enabled_p ())
1837 : 1 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1838 : : "not vectorized: vectorization not profitable.\n");
1839 : 388 : if (dump_enabled_p ())
1840 : 1 : dump_printf_loc (MSG_NOTE, vect_location,
1841 : : "not vectorized: iteration count smaller than user "
1842 : : "specified loop bound parameter or minimum profitable "
1843 : : "iterations (whichever is more conservative).\n");
1844 : 388 : return 0;
1845 : : }
1846 : :
1847 : : /* The static profitablity threshold min_profitable_estimate includes
1848 : : the cost of having to check at runtime whether the scalar loop
1849 : : should be used instead. If it turns out that we don't need or want
1850 : : such a check, the threshold we should use for the static estimate
1851 : : is simply the point at which the vector loop becomes more profitable
1852 : : than the scalar loop. */
1853 : 73531 : if (min_profitable_estimate > min_profitable_iters
1854 : 15648 : && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1855 : 15138 : && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1856 : 309 : && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1857 : 73840 : && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1858 : : {
1859 : 11 : if (dump_enabled_p ())
1860 : 6 : dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1861 : : " choice between the scalar and vector loops\n");
1862 : 11 : min_profitable_estimate = min_profitable_iters;
1863 : : }
1864 : :
1865 : : /* If the vector loop needs multiple iterations to be beneficial then
1866 : : things are probably too close to call, and the conservative thing
1867 : : would be to stick with the scalar code. */
1868 : 73531 : if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
1869 : 73531 : && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
1870 : : {
1871 : 8487 : if (dump_enabled_p ())
1872 : 177 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1873 : : "one iteration of the vector loop would be"
1874 : : " more expensive than the equivalent number of"
1875 : : " iterations of the scalar loop\n");
1876 : 8487 : return 0;
1877 : : }
1878 : :
1879 : 65044 : HOST_WIDE_INT estimated_niter;
1880 : :
1881 : : /* If we are vectorizing an epilogue then we know the maximum number of
1882 : : scalar iterations it will cover is at least one lower than the
1883 : : vectorization factor of the main loop. */
1884 : 65044 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1885 : 10706 : estimated_niter
1886 : 10706 : = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1887 : : else
1888 : : {
1889 : 54338 : estimated_niter = estimated_stmt_executions_int (loop);
1890 : 54338 : if (estimated_niter == -1)
1891 : 20630 : estimated_niter = likely_max_stmt_executions_int (loop);
1892 : : }
1893 : 31336 : if (estimated_niter != -1
1894 : 63419 : && ((unsigned HOST_WIDE_INT) estimated_niter
1895 : 63419 : < MAX (th, (unsigned) min_profitable_estimate)))
1896 : : {
1897 : 4367 : if (dump_enabled_p ())
1898 : 28 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1899 : : "not vectorized: estimated iteration count too "
1900 : : "small.\n");
1901 : 4367 : if (dump_enabled_p ())
1902 : 28 : dump_printf_loc (MSG_NOTE, vect_location,
1903 : : "not vectorized: estimated iteration count smaller "
1904 : : "than specified loop bound parameter or minimum "
1905 : : "profitable iterations (whichever is more "
1906 : : "conservative).\n");
1907 : 4367 : return -1;
1908 : : }
1909 : :
1910 : : return 1;
1911 : : }
1912 : :
1913 : : static opt_result
1914 : 222828 : vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1915 : : vec<data_reference_p> *datarefs)
1916 : : {
1917 : 682221 : for (unsigned i = 0; i < loop->num_nodes; i++)
1918 : 1008544 : for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1919 : 3779438 : !gsi_end_p (gsi); gsi_next (&gsi))
1920 : : {
1921 : 3320045 : gimple *stmt = gsi_stmt (gsi);
1922 : 3320045 : if (is_gimple_debug (stmt))
1923 : 1211793 : continue;
1924 : 2108380 : opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
1925 : : NULL, 0);
1926 : 2108380 : if (!res)
1927 : : {
1928 : 45007 : if (is_gimple_call (stmt) && loop->safelen)
1929 : : {
1930 : 398 : tree fndecl = gimple_call_fndecl (stmt), op;
1931 : 398 : if (fndecl == NULL_TREE
1932 : 398 : && gimple_call_internal_p (stmt, IFN_MASK_CALL))
1933 : : {
1934 : 0 : fndecl = gimple_call_arg (stmt, 0);
1935 : 0 : gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
1936 : 0 : fndecl = TREE_OPERAND (fndecl, 0);
1937 : 0 : gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
1938 : : }
1939 : 398 : if (fndecl != NULL_TREE)
1940 : : {
1941 : 361 : cgraph_node *node = cgraph_node::get (fndecl);
1942 : 361 : if (node != NULL && node->simd_clones != NULL)
1943 : : {
1944 : 129 : unsigned int j, n = gimple_call_num_args (stmt);
1945 : 539 : for (j = 0; j < n; j++)
1946 : : {
1947 : 282 : op = gimple_call_arg (stmt, j);
1948 : 282 : if (DECL_P (op)
1949 : 282 : || (REFERENCE_CLASS_P (op)
1950 : 0 : && get_base_address (op)))
1951 : : break;
1952 : : }
1953 : 129 : op = gimple_call_lhs (stmt);
1954 : : /* Ignore #pragma omp declare simd functions
1955 : : if they don't have data references in the
1956 : : call stmt itself. */
1957 : 257 : if (j == n
1958 : 129 : && !(op
1959 : 118 : && (DECL_P (op)
1960 : 118 : || (REFERENCE_CLASS_P (op)
1961 : 0 : && get_base_address (op)))))
1962 : 128 : continue;
1963 : : }
1964 : : }
1965 : : }
1966 : 44879 : return res;
1967 : : }
1968 : : /* If dependence analysis will give up due to the limit on the
1969 : : number of datarefs stop here and fail fatally. */
1970 : 3622759 : if (datarefs->length ()
1971 : 1559386 : > (unsigned)param_loop_max_datarefs_for_datadeps)
1972 : 0 : return opt_result::failure_at (stmt, "exceeded param "
1973 : : "loop-max-datarefs-for-datadeps\n");
1974 : : }
1975 : 177949 : return opt_result::success ();
1976 : : }
1977 : :
1978 : : /* Determine if operating on full vectors for LOOP_VINFO might leave
1979 : : some scalar iterations still to do. If so, decide how we should
1980 : : handle those scalar iterations. The possibilities are:
1981 : :
1982 : : (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
1983 : : In this case:
1984 : :
1985 : : LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
1986 : : LOOP_VINFO_PEELING_FOR_NITER == false
1987 : :
1988 : : (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
1989 : : to handle the remaining scalar iterations. In this case:
1990 : :
1991 : : LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
1992 : : LOOP_VINFO_PEELING_FOR_NITER == true
1993 : :
1994 : : The MASKED_P argument specifies to what extent
1995 : : param_vect_partial_vector_usage is to be honored. For MASKED_P == 0
1996 : : no partial vectors are to be used, for MASKED_P == -1 it's
1997 : : param_vect_partial_vector_usage that gets to decide whether we may
1998 : : consider partial vector usage. For MASKED_P == 1 partial vectors
1999 : : may be used if possible.
2000 : :
2001 : : */
2002 : :
2003 : : static opt_result
2004 : 113213 : vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
2005 : : int masked_p)
2006 : : {
2007 : : /* Determine whether there would be any scalar iterations left over. */
2008 : 113213 : bool need_peeling_or_partial_vectors_p
2009 : 113213 : = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2010 : :
2011 : : /* Decide whether to vectorize the loop with partial vectors. */
2012 : 113213 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2013 : 113213 : if (masked_p == 0
2014 : 113213 : || (masked_p == -1 && param_vect_partial_vector_usage == 0))
2015 : : /* If requested explicitly do not use partial vectors. */
2016 : : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2017 : 121 : else if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2018 : 42 : && LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo))
2019 : 0 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2020 : 121 : else if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2021 : 42 : && need_peeling_or_partial_vectors_p)
2022 : : {
2023 : : /* For partial-vector-usage=1, try to push the handling of partial
2024 : : vectors to the epilogue, with the main loop continuing to operate
2025 : : on full vectors.
2026 : :
2027 : : If we are unrolling we also do not want to use partial vectors. This
2028 : : is to avoid the overhead of generating multiple masks and also to
2029 : : avoid having to execute entire iterations of FALSE masked instructions
2030 : : when dealing with one or less full iterations.
2031 : :
2032 : : ??? We could then end up failing to use partial vectors if we
2033 : : decide to peel iterations into a prologue, and if the main loop
2034 : : then ends up processing fewer than VF iterations. */
2035 : 34 : if ((param_vect_partial_vector_usage == 1
2036 : 8 : || loop_vinfo->suggested_unroll_factor > 1)
2037 : 26 : && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2038 : 52 : && !vect_known_niters_smaller_than_vf (loop_vinfo))
2039 : : ;
2040 : : else
2041 : 26 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2042 : : }
2043 : :
2044 : 113213 : if (LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo)
2045 : 0 : && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2046 : 0 : return opt_result::failure_at (vect_location,
2047 : : "not vectorized: loop needs but cannot "
2048 : : "use partial vectors\n");
2049 : :
2050 : 113213 : if (dump_enabled_p ())
2051 : 11542 : dump_printf_loc (MSG_NOTE, vect_location,
2052 : : "operating on %s vectors%s.\n",
2053 : 11542 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2054 : : ? "partial" : "full",
2055 : 11542 : LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2056 : : ? " for epilogue loop" : "");
2057 : :
2058 : 113213 : LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2059 : 226426 : = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2060 : 113213 : && need_peeling_or_partial_vectors_p);
2061 : :
2062 : 113213 : return opt_result::success ();
2063 : : }
2064 : :
2065 : : /* Function vect_analyze_loop_2.
2066 : :
2067 : : Apply a set of analyses on LOOP specified by LOOP_VINFO, the different
2068 : : analyses will record information in some members of LOOP_VINFO. FATAL
2069 : : indicates if some analysis meets fatal error. If one non-NULL pointer
2070 : : SUGGESTED_UNROLL_FACTOR is provided, it's intent to be filled with one
2071 : : worked out suggested unroll factor, while one NULL pointer shows it's
2072 : : going to apply the suggested unroll factor.
2073 : : SINGLE_LANE_SLP_DONE_FOR_SUGGESTED_UF is to hold whether single-lane
2074 : : slp was forced when the suggested unroll factor was worked out. */
2075 : : static opt_result
2076 : 416625 : vect_analyze_loop_2 (loop_vec_info loop_vinfo, int masked_p, bool &fatal,
2077 : : unsigned *suggested_unroll_factor,
2078 : : bool& single_lane_slp_done_for_suggested_uf)
2079 : : {
2080 : 416625 : opt_result ok = opt_result::success ();
2081 : 416625 : int res;
2082 : 416625 : unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2083 : 416625 : loop_vec_info orig_loop_vinfo = NULL;
2084 : :
2085 : : /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2086 : : loop_vec_info of the first vectorized loop. */
2087 : 416625 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2088 : 17729 : orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2089 : : else
2090 : : orig_loop_vinfo = loop_vinfo;
2091 : 17729 : gcc_assert (orig_loop_vinfo);
2092 : :
2093 : : /* The first group of checks is independent of the vector size. */
2094 : 416625 : fatal = true;
2095 : :
2096 : 416625 : if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2097 : 416625 : && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2098 : 5 : return opt_result::failure_at (vect_location,
2099 : : "not vectorized: simd if(0)\n");
2100 : :
2101 : : /* Find all data references in the loop (which correspond to vdefs/vuses)
2102 : : and analyze their evolution in the loop. */
2103 : :
2104 : 416620 : loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2105 : :
2106 : : /* Gather the data references and count stmts in the loop. */
2107 : 416620 : if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2108 : : {
2109 : 222828 : opt_result res
2110 : 222828 : = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2111 : : &LOOP_VINFO_DATAREFS (loop_vinfo));
2112 : 222828 : if (!res)
2113 : : {
2114 : 44879 : if (dump_enabled_p ())
2115 : 1561 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2116 : : "not vectorized: loop contains function "
2117 : : "calls or data references that cannot "
2118 : : "be analyzed\n");
2119 : 44879 : return res;
2120 : : }
2121 : 177949 : loop_vinfo->shared->save_datarefs ();
2122 : : }
2123 : : else
2124 : 193792 : loop_vinfo->shared->check_datarefs ();
2125 : :
2126 : : /* Analyze the data references and also adjust the minimal
2127 : : vectorization factor according to the loads and stores. */
2128 : :
2129 : 371741 : ok = vect_analyze_data_refs (loop_vinfo, &fatal);
2130 : 371741 : if (!ok)
2131 : : {
2132 : 50105 : if (dump_enabled_p ())
2133 : 982 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2134 : : "bad data references.\n");
2135 : 50105 : return ok;
2136 : : }
2137 : :
2138 : : /* Check if we are applying unroll factor now. */
2139 : 321636 : bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2140 : 321636 : gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2141 : :
2142 : : /* When single-lane SLP was forced and we are applying suggested unroll
2143 : : factor, keep that decision here. */
2144 : 643272 : bool force_single_lane = (applying_suggested_uf
2145 : 321636 : && single_lane_slp_done_for_suggested_uf);
2146 : :
2147 : : /* Classify all cross-iteration scalar data-flow cycles.
2148 : : Cross-iteration cycles caused by virtual phis are analyzed separately. */
2149 : 321636 : vect_analyze_scalar_cycles (loop_vinfo);
2150 : :
2151 : 321636 : vect_pattern_recog (loop_vinfo);
2152 : :
2153 : : /* Analyze the access patterns of the data-refs in the loop (consecutive,
2154 : : complex, etc.). FORNOW: Only handle consecutive access pattern. */
2155 : :
2156 : 321636 : ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2157 : 321636 : if (!ok)
2158 : : {
2159 : 7082 : if (dump_enabled_p ())
2160 : 264 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2161 : : "bad data access.\n");
2162 : 7082 : return ok;
2163 : : }
2164 : :
2165 : : /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2166 : :
2167 : 314554 : ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2168 : 314554 : if (!ok)
2169 : : {
2170 : 13565 : if (dump_enabled_p ())
2171 : 298 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2172 : : "unexpected pattern.\n");
2173 : 13565 : return ok;
2174 : : }
2175 : :
2176 : : /* While the rest of the analysis below depends on it in some way. */
2177 : 300989 : fatal = false;
2178 : :
2179 : : /* Analyze data dependences between the data-refs in the loop
2180 : : and adjust the maximum vectorization factor according to
2181 : : the dependences.
2182 : : FORNOW: fail at the first data dependence that we encounter. */
2183 : :
2184 : 300989 : ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2185 : 300989 : if (!ok)
2186 : : {
2187 : 14493 : if (dump_enabled_p ())
2188 : 372 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2189 : : "bad data dependence.\n");
2190 : 14493 : return ok;
2191 : : }
2192 : 286496 : LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2193 : :
2194 : : /* Compute the scalar iteration cost. */
2195 : 286496 : vect_compute_single_scalar_iteration_cost (loop_vinfo);
2196 : :
2197 : 286496 : bool saved_can_use_partial_vectors_p
2198 : : = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2199 : :
2200 : : /* This is the point where we can re-start analysis with single-lane
2201 : : SLP forced. */
2202 : 409980 : start_over:
2203 : :
2204 : : /* Check the SLP opportunities in the loop, analyze and build
2205 : : SLP trees. */
2206 : 819960 : ok = vect_analyze_slp (loop_vinfo, loop_vinfo->stmt_vec_infos.length (),
2207 : : force_single_lane);
2208 : 409980 : if (!ok)
2209 : 18489 : return ok;
2210 : :
2211 : : /* If there are any SLP instances mark them as pure_slp and compute
2212 : : the overall vectorization factor. */
2213 : 391491 : if (!vect_make_slp_decision (loop_vinfo))
2214 : 45422 : return opt_result::failure_at (vect_location, "no stmts to vectorize.\n");
2215 : :
2216 : 346069 : if (dump_enabled_p ())
2217 : 17801 : dump_printf_loc (MSG_NOTE, vect_location, "Loop contains only SLP stmts\n");
2218 : :
2219 : : /* Dump the vectorization factor from the SLP decision. */
2220 : 346069 : if (dump_enabled_p ())
2221 : : {
2222 : 17801 : dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
2223 : 17801 : dump_dec (MSG_NOTE, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
2224 : 17801 : dump_printf (MSG_NOTE, "\n");
2225 : : }
2226 : :
2227 : : /* We don't expect to have to roll back to anything other than an empty
2228 : : set of rgroups. */
2229 : 346069 : gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2230 : :
2231 : : /* Apply the suggested unrolling factor, this was determined by the backend
2232 : : during finish_cost the first time we ran the analyzis for this
2233 : : vector mode. */
2234 : 346069 : if (applying_suggested_uf)
2235 : 247 : LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2236 : :
2237 : : /* Now the vectorization factor is final. */
2238 : 346069 : poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2239 : 346069 : gcc_assert (known_ne (vectorization_factor, 0U));
2240 : :
2241 : : /* Optimize the SLP graph with the vectorization factor fixed. */
2242 : 346069 : vect_optimize_slp (loop_vinfo);
2243 : :
2244 : : /* Gather the loads reachable from the SLP graph entries. */
2245 : 346069 : vect_gather_slp_loads (loop_vinfo);
2246 : :
2247 : 346069 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2248 : : {
2249 : 13405 : dump_printf_loc (MSG_NOTE, vect_location,
2250 : : "vectorization_factor = ");
2251 : 13405 : dump_dec (MSG_NOTE, vectorization_factor);
2252 : 13405 : dump_printf (MSG_NOTE, ", niters = %wd\n",
2253 : 13405 : LOOP_VINFO_INT_NITERS (loop_vinfo));
2254 : : }
2255 : :
2256 : 346069 : if (max_vf != MAX_VECTORIZATION_FACTOR
2257 : 346069 : && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2258 : 41 : return opt_result::failure_at (vect_location, "bad data dependence.\n");
2259 : :
2260 : 346028 : loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2261 : :
2262 : : /* Analyze the alignment of the data-refs in the loop.
2263 : : Fail if a data reference is found that cannot be vectorized. */
2264 : :
2265 : 346028 : ok = vect_analyze_data_refs_alignment (loop_vinfo);
2266 : 346028 : if (!ok)
2267 : : {
2268 : 0 : if (dump_enabled_p ())
2269 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2270 : : "bad data alignment.\n");
2271 : 0 : return ok;
2272 : : }
2273 : :
2274 : : /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2275 : : It is important to call pruning after vect_analyze_data_ref_accesses,
2276 : : since we use grouping information gathered by interleaving analysis. */
2277 : 346028 : ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2278 : 346028 : if (!ok)
2279 : 16797 : return ok;
2280 : :
2281 : : /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2282 : : vectorization, since we do not want to add extra peeling or
2283 : : add versioning for alignment. */
2284 : 329231 : if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2285 : : /* This pass will decide on using loop versioning and/or loop peeling in
2286 : : order to enhance the alignment of data references in the loop. */
2287 : 314373 : ok = vect_enhance_data_refs_alignment (loop_vinfo);
2288 : 329231 : if (!ok)
2289 : 0 : return ok;
2290 : :
2291 : : /* Analyze operations in the SLP instances. We can't simply
2292 : : remove unsupported SLP instances as this makes the above
2293 : : SLP kind detection invalid and might also affect the VF. */
2294 : 329231 : if (! vect_slp_analyze_operations (loop_vinfo))
2295 : : {
2296 : 216018 : ok = opt_result::failure_at (vect_location,
2297 : : "unsupported SLP instances\n");
2298 : 216018 : goto again;
2299 : : }
2300 : :
2301 : : /* For now, we don't expect to mix both masking and length approaches for one
2302 : : loop, disable it if both are recorded. */
2303 : 113213 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2304 : 16826 : && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2305 : 130033 : && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2306 : : {
2307 : 0 : if (dump_enabled_p ())
2308 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2309 : : "can't vectorize a loop with partial vectors"
2310 : : " because we don't expect to mix different"
2311 : : " approaches with partial vectors for the"
2312 : : " same loop.\n");
2313 : 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2314 : : }
2315 : :
2316 : : /* If we still have the option of using partial vectors,
2317 : : check whether we can generate the necessary loop controls. */
2318 : 113213 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
2319 : : {
2320 : 16826 : if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
2321 : : {
2322 : 16820 : if (!vect_verify_full_masking (loop_vinfo)
2323 : 16820 : && !vect_verify_full_masking_avx512 (loop_vinfo))
2324 : 3681 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2325 : : }
2326 : : else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
2327 : 6 : if (!vect_verify_loop_lens (loop_vinfo))
2328 : 6 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2329 : : }
2330 : :
2331 : : /* Decide whether this loop_vinfo should use partial vectors or peeling,
2332 : : assuming that the loop will be used as a main loop. We will redo
2333 : : this analysis later if we instead decide to use the loop as an
2334 : : epilogue loop. */
2335 : 113213 : ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, masked_p);
2336 : 113213 : if (!ok)
2337 : 0 : return ok;
2338 : :
2339 : : /* If we're vectorizing a loop that uses length "controls" and
2340 : : can iterate more than once, we apply decrementing IV approach
2341 : : in loop control. */
2342 : 113213 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2343 : 26 : && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
2344 : 0 : && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
2345 : 113213 : && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2346 : 0 : && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
2347 : : LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
2348 : 0 : LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
2349 : :
2350 : : /* If a loop uses length controls and has a decrementing loop control IV,
2351 : : we will normally pass that IV through a MIN_EXPR to calcaluate the
2352 : : basis for the length controls. E.g. in a loop that processes one
2353 : : element per scalar iteration, the number of elements would be
2354 : : MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
2355 : :
2356 : : This MIN_EXPR approach allows us to use pointer IVs with an invariant
2357 : : step, since only the final iteration of the vector loop can have
2358 : : inactive lanes.
2359 : :
2360 : : However, some targets have a dedicated instruction for calculating the
2361 : : preferred length, given the total number of elements that still need to
2362 : : be processed. This is encapsulated in the SELECT_VL internal function.
2363 : :
2364 : : If the target supports SELECT_VL, we can use it instead of MIN_EXPR
2365 : : to determine the basis for the length controls. However, unlike the
2366 : : MIN_EXPR calculation, the SELECT_VL calculation can decide to make
2367 : : lanes inactive in any iteration of the vector loop, not just the last
2368 : : iteration. This SELECT_VL approach therefore requires us to use pointer
2369 : : IVs with variable steps.
2370 : :
2371 : : Once we've decided how many elements should be processed by one
2372 : : iteration of the vector loop, we need to populate the rgroup controls.
2373 : : If a loop has multiple rgroups, we need to make sure that those rgroups
2374 : : "line up" (that is, they must be consistent about which elements are
2375 : : active and which aren't). This is done by vect_adjust_loop_lens_control.
2376 : :
2377 : : In principle, it would be possible to use vect_adjust_loop_lens_control
2378 : : on either the result of a MIN_EXPR or the result of a SELECT_VL.
2379 : : However:
2380 : :
2381 : : (1) In practice, it only makes sense to use SELECT_VL when a vector
2382 : : operation will be controlled directly by the result. It is not
2383 : : worth using SELECT_VL if it would only be the input to other
2384 : : calculations.
2385 : :
2386 : : (2) If we use SELECT_VL for an rgroup that has N controls, each associated
2387 : : pointer IV will need N updates by a variable amount (N-1 updates
2388 : : within the iteration and 1 update to move to the next iteration).
2389 : :
2390 : : Because of this, we prefer to use the MIN_EXPR approach whenever there
2391 : : is more than one length control.
2392 : :
2393 : : In addition, SELECT_VL always operates to a granularity of 1 unit.
2394 : : If we wanted to use it to control an SLP operation on N consecutive
2395 : : elements, we would need to make the SELECT_VL inputs measure scalar
2396 : : iterations (rather than elements) and then multiply the SELECT_VL
2397 : : result by N. But using SELECT_VL this way is inefficient because
2398 : : of (1) above.
2399 : :
2400 : : 2. We don't apply SELECT_VL on single-rgroup when both (1) and (2) are
2401 : : satisfied:
2402 : :
2403 : : (1). LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
2404 : : (2). LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
2405 : :
2406 : : Since SELECT_VL (variable step) will make SCEV analysis failed and then
2407 : : we will fail to gain benefits of following unroll optimizations. We prefer
2408 : : using the MIN_EXPR approach in this situation. */
2409 : 113213 : if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
2410 : : {
2411 : 0 : tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
2412 : 0 : if (LOOP_VINFO_LENS (loop_vinfo).length () == 1
2413 : 0 : && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1
2414 : 0 : && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2415 : : || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
2416 : 0 : LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
2417 : :
2418 : 0 : if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
2419 : 0 : for (auto rgc : LOOP_VINFO_LENS (loop_vinfo))
2420 : 0 : if (rgc.type
2421 : 0 : && !direct_internal_fn_supported_p (IFN_SELECT_VL,
2422 : : rgc.type, iv_type,
2423 : : OPTIMIZE_FOR_SPEED))
2424 : : {
2425 : 0 : LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
2426 : 0 : break;
2427 : : }
2428 : :
2429 : : /* If any of the SLP instances cover more than a single lane
2430 : : we cannot use .SELECT_VL at the moment, even if the number
2431 : : of lanes is uniform throughout the SLP graph. */
2432 : 0 : if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
2433 : 0 : for (slp_instance inst : LOOP_VINFO_SLP_INSTANCES (loop_vinfo))
2434 : 0 : if (SLP_TREE_LANES (SLP_INSTANCE_TREE (inst)) != 1
2435 : 0 : && !(SLP_INSTANCE_KIND (inst) == slp_inst_kind_store
2436 : 0 : && SLP_INSTANCE_TREE (inst)->ldst_lanes))
2437 : : {
2438 : 0 : LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
2439 : 0 : break;
2440 : : }
2441 : : }
2442 : :
2443 : : /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2444 : : to be able to handle fewer than VF scalars, or needs to have a lower VF
2445 : : than the main loop. */
2446 : 113213 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2447 : 12458 : && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2448 : : {
2449 : 12448 : poly_uint64 unscaled_vf
2450 : 12448 : = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
2451 : : orig_loop_vinfo->suggested_unroll_factor);
2452 : 12448 : if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
2453 : 285 : return opt_result::failure_at (vect_location,
2454 : : "Vectorization factor too high for"
2455 : : " epilogue loop.\n");
2456 : : }
2457 : :
2458 : : /* If the epilogue needs peeling for gaps but the main loop doesn't give
2459 : : up on the epilogue. */
2460 : 112928 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2461 : 12173 : && LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2462 : 67 : && (LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo)
2463 : : != LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)))
2464 : 4 : return opt_result::failure_at (vect_location,
2465 : : "Epilogue loop requires peeling for gaps "
2466 : : "but main loop does not.\n");
2467 : :
2468 : : /* If an epilogue loop is required make sure we can create one. */
2469 : 112924 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2470 : 111690 : || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2471 : 32403 : || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
2472 : : {
2473 : 81555 : if (dump_enabled_p ())
2474 : 5146 : dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2475 : 81555 : if (!vect_can_advance_ivs_p (loop_vinfo)
2476 : 162612 : || !slpeel_can_duplicate_loop_p (loop,
2477 : : LOOP_VINFO_IV_EXIT (loop_vinfo),
2478 : 81057 : LOOP_VINFO_IV_EXIT (loop_vinfo)))
2479 : : {
2480 : 498 : ok = opt_result::failure_at (vect_location,
2481 : : "not vectorized: can't create required "
2482 : : "epilog loop\n");
2483 : 498 : goto again;
2484 : : }
2485 : : }
2486 : :
2487 : : /* Check the costings of the loop make vectorizing worthwhile. */
2488 : 112426 : res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
2489 : 112426 : if (res < 0)
2490 : : {
2491 : 30378 : ok = opt_result::failure_at (vect_location,
2492 : : "Loop costings may not be worthwhile.\n");
2493 : 30378 : goto again;
2494 : : }
2495 : 82048 : if (!res)
2496 : 21371 : return opt_result::failure_at (vect_location,
2497 : : "Loop costings not worthwhile.\n");
2498 : :
2499 : : /* During peeling, we need to check if number of loop iterations is
2500 : : enough for both peeled prolog loop and vector loop. This check
2501 : : can be merged along with threshold check of loop versioning, so
2502 : : increase threshold for this case if necessary.
2503 : :
2504 : : If we are analyzing an epilogue we still want to check what its
2505 : : versioning threshold would be. If we decide to vectorize the epilogues we
2506 : : will want to use the lowest versioning threshold of all epilogues and main
2507 : : loop. This will enable us to enter a vectorized epilogue even when
2508 : : versioning the loop. We can't simply check whether the epilogue requires
2509 : : versioning though since we may have skipped some versioning checks when
2510 : : analyzing the epilogue. For instance, checks for alias versioning will be
2511 : : skipped when dealing with epilogues as we assume we already checked them
2512 : : for the main loop. So instead we always check the 'orig_loop_vinfo'. */
2513 : 60677 : if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2514 : : {
2515 : 5653 : poly_uint64 niters_th = 0;
2516 : 5653 : unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2517 : :
2518 : 5653 : if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2519 : : {
2520 : : /* Niters for peeled prolog loop. */
2521 : 5653 : if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2522 : : {
2523 : 126 : dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2524 : 126 : tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2525 : 126 : niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2526 : : }
2527 : : else
2528 : 5527 : niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2529 : : }
2530 : :
2531 : : /* Niters for at least one iteration of vectorized loop. */
2532 : 5653 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2533 : 5649 : niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2534 : : /* One additional iteration because of peeling for gap. */
2535 : 5653 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2536 : 59 : niters_th += 1;
2537 : :
2538 : : /* Use the same condition as vect_transform_loop to decide when to use
2539 : : the cost to determine a versioning threshold. */
2540 : 5653 : if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2541 : 5653 : && ordered_p (th, niters_th))
2542 : 3841 : niters_th = ordered_max (poly_uint64 (th), niters_th);
2543 : :
2544 : 5653 : LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2545 : : }
2546 : :
2547 : 60677 : gcc_assert (known_eq (vectorization_factor,
2548 : : LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2549 : :
2550 : 60677 : single_lane_slp_done_for_suggested_uf = force_single_lane;
2551 : :
2552 : : /* Ok to vectorize! */
2553 : 60677 : LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2554 : 60677 : return opt_result::success ();
2555 : :
2556 : 246894 : again:
2557 : : /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2558 : 246894 : gcc_assert (!ok);
2559 : :
2560 : : /* Try again with single-lane SLP. */
2561 : 246894 : if (force_single_lane)
2562 : 122341 : return ok;
2563 : :
2564 : : /* If we are applying suggested unroll factor, we don't need to
2565 : : re-try any more as we want to keep the SLP mode fixed. */
2566 : 124553 : if (applying_suggested_uf)
2567 : 6 : return ok;
2568 : :
2569 : : /* Likewise if the grouped loads or stores in the SLP cannot be handled
2570 : : via interleaving or lane instructions. */
2571 : : slp_instance instance;
2572 : : slp_tree node;
2573 : : unsigned i, j;
2574 : 330713 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2575 : : {
2576 : 207229 : if (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance)) != vect_internal_def)
2577 : 0 : continue;
2578 : :
2579 : 207229 : stmt_vec_info vinfo;
2580 : 207229 : vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2581 : 207229 : if (!vinfo || !STMT_VINFO_GROUPED_ACCESS (vinfo))
2582 : 204630 : continue;
2583 : 2599 : vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2584 : 2599 : unsigned int size = DR_GROUP_SIZE (vinfo);
2585 : 2599 : tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
2586 : 2599 : if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST
2587 : 4514 : && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2588 : 5192 : && ! vect_grouped_store_supported (vectype, size))
2589 : 678 : return opt_result::failure_at (vinfo->stmt,
2590 : : "unsupported grouped store\n");
2591 : 209444 : FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2592 : : {
2593 : 2079 : vinfo = SLP_TREE_REPRESENTATIVE (node);
2594 : 2079 : if (STMT_VINFO_GROUPED_ACCESS (vinfo))
2595 : : {
2596 : 1820 : vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2597 : 1820 : bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2598 : 1820 : size = DR_GROUP_SIZE (vinfo);
2599 : 1820 : vectype = SLP_TREE_VECTYPE (node);
2600 : 1820 : if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
2601 : 1820 : && ! vect_grouped_load_supported (vectype, single_element_p,
2602 : : size))
2603 : 385 : return opt_result::failure_at (vinfo->stmt,
2604 : : "unsupported grouped load\n");
2605 : : }
2606 : : }
2607 : : }
2608 : :
2609 : : /* Roll back state appropriately. Force single-lane SLP this time. */
2610 : 123484 : force_single_lane = true;
2611 : 123484 : if (dump_enabled_p ())
2612 : 3219 : dump_printf_loc (MSG_NOTE, vect_location,
2613 : : "re-trying with single-lane SLP\n");
2614 : :
2615 : : /* Reset the vectorization factor. */
2616 : 123484 : LOOP_VINFO_VECT_FACTOR (loop_vinfo) = 0;
2617 : : /* Free the SLP instances. */
2618 : 329643 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2619 : 206159 : vect_free_slp_instance (instance);
2620 : 123484 : LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2621 : : /* Reset SLP type to loop_vect on all stmts. */
2622 : 479977 : for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2623 : : {
2624 : 356493 : basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2625 : 356493 : for (gimple_stmt_iterator si = gsi_start_phis (bb);
2626 : 636689 : !gsi_end_p (si); gsi_next (&si))
2627 : : {
2628 : 280196 : stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2629 : 280196 : STMT_SLP_TYPE (stmt_info) = not_vect;
2630 : 280196 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2631 : 280196 : || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2632 : : {
2633 : : /* vectorizable_reduction adjusts reduction stmt def-types,
2634 : : restore them to that of the PHI. */
2635 : 19852 : STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2636 : 19852 : = STMT_VINFO_DEF_TYPE (stmt_info);
2637 : 19852 : STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2638 : : (STMT_VINFO_REDUC_DEF (stmt_info)))
2639 : 19852 : = STMT_VINFO_DEF_TYPE (stmt_info);
2640 : : }
2641 : : }
2642 : 712986 : for (gimple_stmt_iterator si = gsi_start_bb (bb);
2643 : 2186064 : !gsi_end_p (si); gsi_next (&si))
2644 : : {
2645 : 1829571 : if (is_gimple_debug (gsi_stmt (si)))
2646 : 676364 : continue;
2647 : 1153207 : stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2648 : 1153207 : STMT_SLP_TYPE (stmt_info) = not_vect;
2649 : 1153207 : if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2650 : : {
2651 : 214127 : stmt_vec_info pattern_stmt_info
2652 : : = STMT_VINFO_RELATED_STMT (stmt_info);
2653 : 214127 : if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
2654 : 0 : STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
2655 : :
2656 : 214127 : gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2657 : 214127 : STMT_SLP_TYPE (pattern_stmt_info) = not_vect;
2658 : 214127 : for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2659 : 435145 : !gsi_end_p (pi); gsi_next (&pi))
2660 : 221018 : STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2661 : 221018 : = not_vect;
2662 : : }
2663 : : }
2664 : : }
2665 : : /* Free optimized alias test DDRS. */
2666 : 123484 : LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2667 : 123484 : LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2668 : 123484 : LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2669 : : /* Reset target cost data. */
2670 : 123484 : delete loop_vinfo->vector_costs;
2671 : 123484 : loop_vinfo->vector_costs = nullptr;
2672 : : /* Reset accumulated rgroup information. */
2673 : 123484 : LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
2674 : 123484 : release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
2675 : 123484 : release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
2676 : : /* Reset assorted flags. */
2677 : 123484 : LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2678 : 123484 : LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2679 : 123484 : LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2680 : 123484 : LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2681 : 123484 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2682 : 123484 : = saved_can_use_partial_vectors_p;
2683 : 123484 : LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2684 : 123484 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2685 : 123484 : if (loop_vinfo->scan_map)
2686 : 122 : loop_vinfo->scan_map->empty ();
2687 : :
2688 : 123484 : goto start_over;
2689 : : }
2690 : :
2691 : : /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2692 : : to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2693 : : OLD_LOOP_VINFO is better unless something specifically indicates
2694 : : otherwise.
2695 : :
2696 : : Note that this deliberately isn't a partial order. */
2697 : :
2698 : : static bool
2699 : 0 : vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2700 : : loop_vec_info old_loop_vinfo)
2701 : : {
2702 : 0 : struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2703 : 0 : gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2704 : :
2705 : 0 : poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2706 : 0 : poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2707 : :
2708 : : /* Always prefer a VF of loop->simdlen over any other VF. */
2709 : 0 : if (loop->simdlen)
2710 : : {
2711 : 0 : bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2712 : 0 : bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2713 : 0 : if (new_simdlen_p != old_simdlen_p)
2714 : : return new_simdlen_p;
2715 : : }
2716 : :
2717 : 0 : const auto *old_costs = old_loop_vinfo->vector_costs;
2718 : 0 : const auto *new_costs = new_loop_vinfo->vector_costs;
2719 : 0 : if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
2720 : 0 : return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
2721 : :
2722 : 0 : return new_costs->better_main_loop_than_p (old_costs);
2723 : : }
2724 : :
2725 : : /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2726 : : true if we should. */
2727 : :
2728 : : static bool
2729 : 0 : vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2730 : : loop_vec_info old_loop_vinfo)
2731 : : {
2732 : 0 : if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2733 : : return false;
2734 : :
2735 : 0 : if (dump_enabled_p ())
2736 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
2737 : : "***** Preferring vector mode %s to vector mode %s\n",
2738 : 0 : GET_MODE_NAME (new_loop_vinfo->vector_mode),
2739 : 0 : GET_MODE_NAME (old_loop_vinfo->vector_mode));
2740 : : return true;
2741 : : }
2742 : :
2743 : : /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if ORIG_LOOP_VINFO is
2744 : : not NULL. When MASKED_P is not -1 override the default
2745 : : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P with it.
2746 : : Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance MODE_I to the next
2747 : : mode useful to analyze.
2748 : : Return the loop_vinfo on success and wrapped null on failure. */
2749 : :
2750 : : static opt_loop_vec_info
2751 : 416378 : vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
2752 : : const vect_loop_form_info *loop_form_info,
2753 : : loop_vec_info orig_loop_vinfo,
2754 : : const vector_modes &vector_modes, unsigned &mode_i,
2755 : : int masked_p,
2756 : : machine_mode &autodetected_vector_mode,
2757 : : bool &fatal)
2758 : : {
2759 : 416378 : loop_vec_info loop_vinfo
2760 : 416378 : = vect_create_loop_vinfo (loop, shared, loop_form_info, orig_loop_vinfo);
2761 : :
2762 : 416378 : machine_mode vector_mode = vector_modes[mode_i];
2763 : 416378 : loop_vinfo->vector_mode = vector_mode;
2764 : 416378 : unsigned int suggested_unroll_factor = 1;
2765 : 416378 : bool single_lane_slp_done_for_suggested_uf = false;
2766 : :
2767 : : /* Run the main analysis. */
2768 : 416378 : opt_result res = vect_analyze_loop_2 (loop_vinfo, masked_p, fatal,
2769 : : &suggested_unroll_factor,
2770 : : single_lane_slp_done_for_suggested_uf);
2771 : 416378 : if (dump_enabled_p ())
2772 : 19370 : dump_printf_loc (MSG_NOTE, vect_location,
2773 : : "***** Analysis %s with vector mode %s\n",
2774 : 19370 : res ? "succeeded" : "failed",
2775 : 19370 : GET_MODE_NAME (loop_vinfo->vector_mode));
2776 : :
2777 : 416378 : auto user_unroll = LOOP_VINFO_LOOP (loop_vinfo)->unroll;
2778 : 416378 : if (res && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2779 : : /* Check to see if the user wants to unroll or if the target wants to. */
2780 : 470035 : && (suggested_unroll_factor > 1 || user_unroll > 1))
2781 : : {
2782 : 261 : if (suggested_unroll_factor == 1)
2783 : : {
2784 : 44 : int assumed_vf = vect_vf_for_cost (loop_vinfo);
2785 : 44 : suggested_unroll_factor = user_unroll / assumed_vf;
2786 : 44 : if (suggested_unroll_factor > 1)
2787 : : {
2788 : 30 : if (dump_enabled_p ())
2789 : 20 : dump_printf_loc (MSG_NOTE, vect_location,
2790 : : "setting unroll factor to %d based on user requested "
2791 : : "unroll factor %d and suggested vectorization "
2792 : : "factor: %d\n",
2793 : : suggested_unroll_factor, user_unroll, assumed_vf);
2794 : : }
2795 : : }
2796 : :
2797 : 261 : if (suggested_unroll_factor > 1)
2798 : : {
2799 : 247 : if (dump_enabled_p ())
2800 : 44 : dump_printf_loc (MSG_NOTE, vect_location,
2801 : : "***** Re-trying analysis for unrolling"
2802 : : " with unroll factor %d and %s slp.\n",
2803 : : suggested_unroll_factor,
2804 : : single_lane_slp_done_for_suggested_uf
2805 : : ? "single-lane" : "");
2806 : 247 : loop_vec_info unroll_vinfo
2807 : 247 : = vect_create_loop_vinfo (loop, shared, loop_form_info, NULL);
2808 : 247 : unroll_vinfo->vector_mode = vector_mode;
2809 : 247 : unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
2810 : 247 : opt_result new_res
2811 : 247 : = vect_analyze_loop_2 (unroll_vinfo, masked_p, fatal, NULL,
2812 : : single_lane_slp_done_for_suggested_uf);
2813 : 247 : if (new_res)
2814 : : {
2815 : 201 : delete loop_vinfo;
2816 : 201 : loop_vinfo = unroll_vinfo;
2817 : : }
2818 : : else
2819 : 46 : delete unroll_vinfo;
2820 : : }
2821 : :
2822 : : /* Record that we have honored a user unroll factor. */
2823 : 261 : LOOP_VINFO_USER_UNROLL (loop_vinfo) = user_unroll > 1;
2824 : : }
2825 : :
2826 : : /* Remember the autodetected vector mode. */
2827 : 416378 : if (vector_mode == VOIDmode)
2828 : 214905 : autodetected_vector_mode = loop_vinfo->vector_mode;
2829 : :
2830 : : /* Advance mode_i, first skipping modes that would result in the
2831 : : same analysis result. */
2832 : 1886302 : while (mode_i + 1 < vector_modes.length ()
2833 : 1321829 : && vect_chooses_same_modes_p (loop_vinfo,
2834 : 586867 : vector_modes[mode_i + 1]))
2835 : : {
2836 : 318584 : if (dump_enabled_p ())
2837 : 16094 : dump_printf_loc (MSG_NOTE, vect_location,
2838 : : "***** The result for vector mode %s would"
2839 : : " be the same\n",
2840 : 16094 : GET_MODE_NAME (vector_modes[mode_i + 1]));
2841 : 318584 : mode_i += 1;
2842 : : }
2843 : 416378 : if (mode_i + 1 < vector_modes.length ()
2844 : 684661 : && vect_chooses_same_modes_p (autodetected_vector_mode,
2845 : 268283 : vector_modes[mode_i + 1]))
2846 : : {
2847 : 349 : if (dump_enabled_p ())
2848 : 10 : dump_printf_loc (MSG_NOTE, vect_location,
2849 : : "***** Skipping vector mode %s, which would"
2850 : : " repeat the analysis for %s\n",
2851 : 10 : GET_MODE_NAME (vector_modes[mode_i + 1]),
2852 : 10 : GET_MODE_NAME (autodetected_vector_mode));
2853 : 349 : mode_i += 1;
2854 : : }
2855 : 416378 : mode_i++;
2856 : :
2857 : 416378 : if (!res)
2858 : : {
2859 : 355902 : delete loop_vinfo;
2860 : 355902 : if (fatal)
2861 : 65430 : gcc_checking_assert (orig_loop_vinfo == NULL);
2862 : 355902 : return opt_loop_vec_info::propagate_failure (res);
2863 : : }
2864 : :
2865 : 60476 : return opt_loop_vec_info::success (loop_vinfo);
2866 : : }
2867 : :
2868 : : /* Function vect_analyze_loop.
2869 : :
2870 : : Apply a set of analyses on LOOP, and create a loop_vec_info struct
2871 : : for it. The different analyses will record information in the
2872 : : loop_vec_info struct. */
2873 : : opt_loop_vec_info
2874 : 481474 : vect_analyze_loop (class loop *loop, gimple *loop_vectorized_call,
2875 : : vec_info_shared *shared)
2876 : : {
2877 : 481474 : DUMP_VECT_SCOPE ("analyze_loop_nest");
2878 : :
2879 : 481474 : if (loop_outer (loop)
2880 : 481474 : && loop_vec_info_for_loop (loop_outer (loop))
2881 : 481975 : && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2882 : 501 : return opt_loop_vec_info::failure_at (vect_location,
2883 : : "outer-loop already vectorized.\n");
2884 : :
2885 : 480973 : if (!find_loop_nest (loop, &shared->loop_nest))
2886 : 24698 : return opt_loop_vec_info::failure_at
2887 : 24698 : (vect_location,
2888 : : "not vectorized: loop nest containing two or more consecutive inner"
2889 : : " loops cannot be vectorized\n");
2890 : :
2891 : : /* Analyze the loop form. */
2892 : 456275 : vect_loop_form_info loop_form_info;
2893 : 456275 : opt_result res = vect_analyze_loop_form (loop, loop_vectorized_call,
2894 : : &loop_form_info);
2895 : 456275 : if (!res)
2896 : : {
2897 : 241370 : if (dump_enabled_p ())
2898 : 1644 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2899 : : "bad loop form.\n");
2900 : 241370 : return opt_loop_vec_info::propagate_failure (res);
2901 : : }
2902 : 214905 : if (!integer_onep (loop_form_info.assumptions))
2903 : : {
2904 : : /* We consider to vectorize this loop by versioning it under
2905 : : some assumptions. In order to do this, we need to clear
2906 : : existing information computed by scev and niter analyzer. */
2907 : 10076 : scev_reset_htab ();
2908 : 10076 : free_numbers_of_iterations_estimates (loop);
2909 : : /* Also set flag for this loop so that following scev and niter
2910 : : analysis are done under the assumptions. */
2911 : 10076 : loop_constraint_set (loop, LOOP_C_FINITE);
2912 : : }
2913 : : else
2914 : : /* Clear the existing niter information to make sure the nonwrapping flag
2915 : : will be calculated and set propriately. */
2916 : 204829 : free_numbers_of_iterations_estimates (loop);
2917 : :
2918 : 214905 : auto_vector_modes vector_modes;
2919 : : /* Autodetect first vector size we try. */
2920 : 214905 : vector_modes.safe_push (VOIDmode);
2921 : 214905 : unsigned int autovec_flags
2922 : 429810 : = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2923 : 214905 : loop->simdlen != 0);
2924 : 214905 : bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2925 : 214905 : && !unlimited_cost_model (loop));
2926 : 214905 : machine_mode autodetected_vector_mode = VOIDmode;
2927 : 214905 : opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2928 : 214905 : unsigned int mode_i = 0;
2929 : 214905 : unsigned HOST_WIDE_INT simdlen = loop->simdlen;
2930 : :
2931 : : /* Keep track of the VF for each mode. Initialize all to 0 which indicates
2932 : : a mode has not been analyzed. */
2933 : 214905 : auto_vec<poly_uint64, 8> cached_vf_per_mode;
2934 : 2169142 : for (unsigned i = 0; i < vector_modes.length (); ++i)
2935 : 869666 : cached_vf_per_mode.safe_push (0);
2936 : :
2937 : : /* First determine the main loop vectorization mode, either the first
2938 : : one that works, starting with auto-detecting the vector mode and then
2939 : : following the targets order of preference, or the one with the
2940 : : lowest cost if pick_lowest_cost_p. */
2941 : 582393 : while (1)
2942 : : {
2943 : 398649 : bool fatal;
2944 : 398649 : unsigned int last_mode_i = mode_i;
2945 : : /* Set cached VF to -1 prior to analysis, which indicates a mode has
2946 : : failed. */
2947 : 398649 : cached_vf_per_mode[last_mode_i] = -1;
2948 : 398649 : opt_loop_vec_info loop_vinfo
2949 : 398649 : = vect_analyze_loop_1 (loop, shared, &loop_form_info,
2950 : : NULL, vector_modes, mode_i, -1,
2951 : : autodetected_vector_mode, fatal);
2952 : 398649 : if (fatal)
2953 : : break;
2954 : :
2955 : 333219 : if (loop_vinfo)
2956 : : {
2957 : : /* Analyzis has been successful so update the VF value. The
2958 : : VF should always be a multiple of unroll_factor and we want to
2959 : : capture the original VF here. */
2960 : 53657 : cached_vf_per_mode[last_mode_i]
2961 : 53657 : = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2962 : 53657 : loop_vinfo->suggested_unroll_factor);
2963 : : /* Once we hit the desired simdlen for the first time,
2964 : : discard any previous attempts. */
2965 : 53657 : if (simdlen
2966 : 53657 : && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
2967 : : {
2968 : 47 : delete first_loop_vinfo;
2969 : : first_loop_vinfo = opt_loop_vec_info::success (NULL);
2970 : : simdlen = 0;
2971 : : }
2972 : 53610 : else if (pick_lowest_cost_p
2973 : 0 : && first_loop_vinfo
2974 : 53610 : && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
2975 : : {
2976 : : /* Pick loop_vinfo over first_loop_vinfo. */
2977 : 0 : delete first_loop_vinfo;
2978 : 0 : first_loop_vinfo = opt_loop_vec_info::success (NULL);
2979 : : }
2980 : 53657 : if (first_loop_vinfo == NULL)
2981 : : first_loop_vinfo = loop_vinfo;
2982 : : else
2983 : : {
2984 : 2 : delete loop_vinfo;
2985 : 2 : loop_vinfo = opt_loop_vec_info::success (NULL);
2986 : : }
2987 : :
2988 : : /* Commit to first_loop_vinfo if we have no reason to try
2989 : : alternatives. */
2990 : 53657 : if (!simdlen && !pick_lowest_cost_p)
2991 : : break;
2992 : : }
2993 : 279571 : if (mode_i == vector_modes.length ()
2994 : 279571 : || autodetected_vector_mode == VOIDmode)
2995 : : break;
2996 : :
2997 : : /* Try the next biggest vector size. */
2998 : 183744 : if (dump_enabled_p ())
2999 : 3730 : dump_printf_loc (MSG_NOTE, vect_location,
3000 : : "***** Re-trying analysis with vector mode %s\n",
3001 : 3730 : GET_MODE_NAME (vector_modes[mode_i]));
3002 : 183744 : }
3003 : 214905 : if (!first_loop_vinfo)
3004 : 161255 : return opt_loop_vec_info::propagate_failure (res);
3005 : :
3006 : 53650 : if (dump_enabled_p ())
3007 : 9095 : dump_printf_loc (MSG_NOTE, vect_location,
3008 : : "***** Choosing vector mode %s\n",
3009 : 9095 : GET_MODE_NAME (first_loop_vinfo->vector_mode));
3010 : :
3011 : : /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3012 : : enabled, SIMDUID is not set, it is the innermost loop and we have
3013 : : either already found the loop's SIMDLEN or there was no SIMDLEN to
3014 : : begin with.
3015 : : TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3016 : 53650 : bool vect_epilogues = (!simdlen
3017 : 53648 : && loop->inner == NULL
3018 : 53133 : && param_vect_epilogues_nomask
3019 : 52093 : && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3020 : : /* No code motion support for multiple epilogues so for now
3021 : : not supported when multiple exits. */
3022 : 25550 : && !LOOP_VINFO_EARLY_BREAKS (first_loop_vinfo)
3023 : 25094 : && !loop->simduid
3024 : 77331 : && loop_cost_model (loop) > VECT_COST_MODEL_VERY_CHEAP);
3025 : 53650 : if (!vect_epilogues)
3026 : 41036 : return first_loop_vinfo;
3027 : :
3028 : : /* Now analyze first_loop_vinfo for epilogue vectorization. */
3029 : :
3030 : : /* For epilogues start the analysis from the first mode. The motivation
3031 : : behind starting from the beginning comes from cases where the VECTOR_MODES
3032 : : array may contain length-agnostic and length-specific modes. Their
3033 : : ordering is not guaranteed, so we could end up picking a mode for the main
3034 : : loop that is after the epilogue's optimal mode. */
3035 : 12614 : int masked_p = -1;
3036 : 12614 : if (!unlimited_cost_model (loop)
3037 : 12614 : && (first_loop_vinfo->vector_costs->suggested_epilogue_mode (masked_p)
3038 : : != VOIDmode))
3039 : : {
3040 : 4 : vector_modes[0]
3041 : 4 : = first_loop_vinfo->vector_costs->suggested_epilogue_mode (masked_p);
3042 : 4 : cached_vf_per_mode[0] = 0;
3043 : : }
3044 : : else
3045 : 12610 : vector_modes[0] = autodetected_vector_mode;
3046 : 12614 : mode_i = 0;
3047 : :
3048 : 12650 : bool supports_partial_vectors = (param_vect_partial_vector_usage != 0
3049 : 12614 : || masked_p == 1);
3050 : : if (supports_partial_vectors
3051 : 36 : && !partial_vectors_supported_p ()
3052 : 36 : && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (first_loop_vinfo))
3053 : : supports_partial_vectors = false;
3054 : 12614 : poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3055 : :
3056 : 12614 : loop_vec_info orig_loop_vinfo = first_loop_vinfo;
3057 : 12802 : do
3058 : : {
3059 : : /* Let the user override what the target suggests. */
3060 : 12708 : if (OPTION_SET_P (param_vect_partial_vector_usage))
3061 : 43 : masked_p = -1;
3062 : :
3063 : 43591 : while (1)
3064 : : {
3065 : : /* If the target does not support partial vectors we can shorten the
3066 : : number of modes to analyze for the epilogue as we know we can't
3067 : : pick a mode that would lead to a VF at least as big as the
3068 : : FIRST_VINFO_VF. */
3069 : 57056 : if (!supports_partial_vectors
3070 : 43591 : && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3071 : : {
3072 : 13490 : mode_i++;
3073 : 26980 : if (mode_i == vector_modes.length ())
3074 : : break;
3075 : 25837 : continue;
3076 : : }
3077 : : /* We would need an exhaustive search to find all modes we
3078 : : skipped but that would lead to the same result as the
3079 : : analysis it was skipped for and where we'd could check
3080 : : cached_vf_per_mode against.
3081 : : Check for the autodetected mode, which is the common
3082 : : situation on x86 which does not perform cost comparison. */
3083 : 42473 : if (!supports_partial_vectors
3084 : 30091 : && maybe_ge (cached_vf_per_mode[0], first_vinfo_vf)
3085 : 59653 : && vect_chooses_same_modes_p (autodetected_vector_mode,
3086 : 29552 : vector_modes[mode_i]))
3087 : : {
3088 : 12372 : mode_i++;
3089 : 24744 : if (mode_i == vector_modes.length ())
3090 : : break;
3091 : 12372 : continue;
3092 : : }
3093 : :
3094 : 17729 : if (dump_enabled_p ())
3095 : 3173 : dump_printf_loc (MSG_NOTE, vect_location,
3096 : : "***** Re-trying epilogue analysis with vector "
3097 : 3173 : "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3098 : :
3099 : 17729 : bool fatal;
3100 : 17729 : opt_loop_vec_info loop_vinfo
3101 : 17729 : = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3102 : : orig_loop_vinfo,
3103 : : vector_modes, mode_i, masked_p,
3104 : : autodetected_vector_mode, fatal);
3105 : 17729 : if (fatal)
3106 : : break;
3107 : :
3108 : 17729 : if (loop_vinfo)
3109 : : {
3110 : 6819 : if (pick_lowest_cost_p
3111 : 0 : && orig_loop_vinfo->epilogue_vinfo
3112 : 6819 : && vect_joust_loop_vinfos (loop_vinfo,
3113 : 0 : orig_loop_vinfo->epilogue_vinfo))
3114 : : {
3115 : 0 : gcc_assert (vect_epilogues);
3116 : 0 : delete orig_loop_vinfo->epilogue_vinfo;
3117 : 0 : orig_loop_vinfo->epilogue_vinfo = nullptr;
3118 : : }
3119 : 6819 : if (!orig_loop_vinfo->epilogue_vinfo)
3120 : 6819 : orig_loop_vinfo->epilogue_vinfo = loop_vinfo;
3121 : : else
3122 : : {
3123 : 0 : delete loop_vinfo;
3124 : 0 : loop_vinfo = opt_loop_vec_info::success (NULL);
3125 : : }
3126 : :
3127 : : /* For now only allow one epilogue loop, but allow
3128 : : pick_lowest_cost_p to replace it, so commit to the
3129 : : first epilogue if we have no reason to try alternatives. */
3130 : 6819 : if (!pick_lowest_cost_p)
3131 : : break;
3132 : : }
3133 : :
3134 : : /* Revert back to the default from the suggested prefered
3135 : : epilogue vectorization mode. */
3136 : 10910 : masked_p = -1;
3137 : 21820 : if (mode_i == vector_modes.length ())
3138 : : break;
3139 : : }
3140 : :
3141 : 12708 : orig_loop_vinfo = orig_loop_vinfo->epilogue_vinfo;
3142 : 12708 : if (!orig_loop_vinfo)
3143 : : break;
3144 : :
3145 : : /* When we selected a first vectorized epilogue, see if the target
3146 : : suggests to have another one. */
3147 : 6819 : masked_p = -1;
3148 : 6819 : if (!unlimited_cost_model (loop)
3149 : 3946 : && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (orig_loop_vinfo)
3150 : 10758 : && (orig_loop_vinfo->vector_costs->suggested_epilogue_mode (masked_p)
3151 : : != VOIDmode))
3152 : : {
3153 : 188 : vector_modes[0]
3154 : 94 : = orig_loop_vinfo->vector_costs->suggested_epilogue_mode (masked_p);
3155 : 94 : cached_vf_per_mode[0] = 0;
3156 : 94 : mode_i = 0;
3157 : : }
3158 : : else
3159 : : break;
3160 : 94 : }
3161 : : while (1);
3162 : :
3163 : 12614 : if (first_loop_vinfo->epilogue_vinfo)
3164 : : {
3165 : 6730 : poly_uint64 lowest_th
3166 : 6730 : = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3167 : 6730 : loop_vec_info epilog_vinfo = first_loop_vinfo->epilogue_vinfo;
3168 : 6819 : do
3169 : : {
3170 : 6819 : poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (epilog_vinfo);
3171 : 6819 : gcc_assert (!LOOP_REQUIRES_VERSIONING (epilog_vinfo)
3172 : : || maybe_ne (lowest_th, 0U));
3173 : : /* Keep track of the known smallest versioning threshold. */
3174 : 6819 : if (ordered_p (lowest_th, th))
3175 : 6819 : lowest_th = ordered_min (lowest_th, th);
3176 : 6819 : epilog_vinfo = epilog_vinfo->epilogue_vinfo;
3177 : : }
3178 : 6819 : while (epilog_vinfo);
3179 : 6730 : LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3180 : 6730 : if (dump_enabled_p ())
3181 : 1375 : dump_printf_loc (MSG_NOTE, vect_location,
3182 : : "***** Choosing epilogue vector mode %s\n",
3183 : 1375 : GET_MODE_NAME
3184 : : (first_loop_vinfo->epilogue_vinfo->vector_mode));
3185 : : }
3186 : :
3187 : 12614 : return first_loop_vinfo;
3188 : 671180 : }
3189 : :
3190 : : /* Return true if there is an in-order reduction function for CODE, storing
3191 : : it in *REDUC_FN if so. */
3192 : :
3193 : : static bool
3194 : 4716 : fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3195 : : {
3196 : : /* We support MINUS_EXPR by negating the operand. This also preserves an
3197 : : initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
3198 : : (-0.0) = -0.0. */
3199 : 4716 : if (code == PLUS_EXPR || code == MINUS_EXPR)
3200 : : {
3201 : 4040 : *reduc_fn = IFN_FOLD_LEFT_PLUS;
3202 : 0 : return true;
3203 : : }
3204 : : return false;
3205 : : }
3206 : :
3207 : : /* Function reduction_fn_for_scalar_code
3208 : :
3209 : : Input:
3210 : : CODE - tree_code of a reduction operations.
3211 : :
3212 : : Output:
3213 : : REDUC_FN - the corresponding internal function to be used to reduce the
3214 : : vector of partial results into a single scalar result, or IFN_LAST
3215 : : if the operation is a supported reduction operation, but does not have
3216 : : such an internal function.
3217 : :
3218 : : Return FALSE if CODE currently cannot be vectorized as reduction. */
3219 : :
3220 : : bool
3221 : 2026039 : reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3222 : : {
3223 : 2026039 : if (code.is_tree_code ())
3224 : 2025985 : switch (tree_code (code))
3225 : : {
3226 : 14202 : case MAX_EXPR:
3227 : 14202 : *reduc_fn = IFN_REDUC_MAX;
3228 : 14202 : return true;
3229 : :
3230 : 50687 : case MIN_EXPR:
3231 : 50687 : *reduc_fn = IFN_REDUC_MIN;
3232 : 50687 : return true;
3233 : :
3234 : 1094362 : case PLUS_EXPR:
3235 : 1094362 : *reduc_fn = IFN_REDUC_PLUS;
3236 : 1094362 : return true;
3237 : :
3238 : 257180 : case BIT_AND_EXPR:
3239 : 257180 : *reduc_fn = IFN_REDUC_AND;
3240 : 257180 : return true;
3241 : :
3242 : 286329 : case BIT_IOR_EXPR:
3243 : 286329 : *reduc_fn = IFN_REDUC_IOR;
3244 : 286329 : return true;
3245 : :
3246 : 43715 : case BIT_XOR_EXPR:
3247 : 43715 : *reduc_fn = IFN_REDUC_XOR;
3248 : 43715 : return true;
3249 : :
3250 : 279510 : case MULT_EXPR:
3251 : 279510 : case MINUS_EXPR:
3252 : 279510 : *reduc_fn = IFN_LAST;
3253 : 279510 : return true;
3254 : :
3255 : : default:
3256 : : return false;
3257 : : }
3258 : : else
3259 : 54 : switch (combined_fn (code))
3260 : : {
3261 : 30 : CASE_CFN_FMAX:
3262 : 30 : *reduc_fn = IFN_REDUC_FMAX;
3263 : 30 : return true;
3264 : :
3265 : 24 : CASE_CFN_FMIN:
3266 : 24 : *reduc_fn = IFN_REDUC_FMIN;
3267 : 24 : return true;
3268 : :
3269 : : default:
3270 : : return false;
3271 : : }
3272 : : }
3273 : :
3274 : : /* Set *SBOOL_FN to the corresponding function working on vector masks
3275 : : for REDUC_FN. Return true if that exists, false otherwise. */
3276 : :
3277 : : static bool
3278 : 0 : sbool_reduction_fn_for_fn (internal_fn reduc_fn, internal_fn *sbool_fn)
3279 : : {
3280 : 0 : switch (reduc_fn)
3281 : : {
3282 : 0 : case IFN_REDUC_AND:
3283 : 0 : *sbool_fn = IFN_REDUC_SBOOL_AND;
3284 : 0 : return true;
3285 : 0 : case IFN_REDUC_IOR:
3286 : 0 : *sbool_fn = IFN_REDUC_SBOOL_IOR;
3287 : 0 : return true;
3288 : 0 : case IFN_REDUC_XOR:
3289 : 0 : *sbool_fn = IFN_REDUC_SBOOL_XOR;
3290 : 0 : return true;
3291 : : default:
3292 : : return false;
3293 : : }
3294 : : }
3295 : :
3296 : : /* If there is a neutral value X such that a reduction would not be affected
3297 : : by the introduction of additional X elements, return that X, otherwise
3298 : : return null. CODE is the code of the reduction and SCALAR_TYPE is type
3299 : : of the scalar elements. If the reduction has just a single initial value
3300 : : then INITIAL_VALUE is that value, otherwise it is null.
3301 : : If AS_INITIAL is TRUE the value is supposed to be used as initial value.
3302 : : In that case no signed zero is returned. */
3303 : :
3304 : : tree
3305 : 77128 : neutral_op_for_reduction (tree scalar_type, code_helper code,
3306 : : tree initial_value, bool as_initial)
3307 : : {
3308 : 77128 : if (code.is_tree_code ())
3309 : 77074 : switch (tree_code (code))
3310 : : {
3311 : 11573 : case DOT_PROD_EXPR:
3312 : 11573 : case SAD_EXPR:
3313 : 11573 : case MINUS_EXPR:
3314 : 11573 : case BIT_IOR_EXPR:
3315 : 11573 : case BIT_XOR_EXPR:
3316 : 11573 : return build_zero_cst (scalar_type);
3317 : 59528 : case WIDEN_SUM_EXPR:
3318 : 59528 : case PLUS_EXPR:
3319 : 59528 : if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
3320 : 56 : return build_real (scalar_type, dconstm0);
3321 : : else
3322 : 59472 : return build_zero_cst (scalar_type);
3323 : :
3324 : 2009 : case MULT_EXPR:
3325 : 2009 : return build_one_cst (scalar_type);
3326 : :
3327 : 1446 : case BIT_AND_EXPR:
3328 : 1446 : return build_all_ones_cst (scalar_type);
3329 : :
3330 : : case MAX_EXPR:
3331 : : case MIN_EXPR:
3332 : : return initial_value;
3333 : :
3334 : 436 : default:
3335 : 436 : return NULL_TREE;
3336 : : }
3337 : : else
3338 : 54 : switch (combined_fn (code))
3339 : : {
3340 : : CASE_CFN_FMIN:
3341 : : CASE_CFN_FMAX:
3342 : : return initial_value;
3343 : :
3344 : 0 : default:
3345 : 0 : return NULL_TREE;
3346 : : }
3347 : : }
3348 : :
3349 : : /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3350 : : STMT is printed with a message MSG. */
3351 : :
3352 : : static void
3353 : 489 : report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3354 : : {
3355 : 489 : dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3356 : 489 : }
3357 : :
3358 : : /* Return true if we need an in-order reduction for operation CODE
3359 : : on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
3360 : : overflow must wrap. */
3361 : :
3362 : : bool
3363 : 6493757 : needs_fold_left_reduction_p (tree type, code_helper code)
3364 : : {
3365 : : /* CHECKME: check for !flag_finite_math_only too? */
3366 : 6493757 : if (SCALAR_FLOAT_TYPE_P (type))
3367 : : {
3368 : 546992 : if (code.is_tree_code ())
3369 : 546942 : switch (tree_code (code))
3370 : : {
3371 : : case MIN_EXPR:
3372 : : case MAX_EXPR:
3373 : : return false;
3374 : :
3375 : 545303 : default:
3376 : 545303 : return !flag_associative_math;
3377 : : }
3378 : : else
3379 : 50 : switch (combined_fn (code))
3380 : : {
3381 : : CASE_CFN_FMIN:
3382 : : CASE_CFN_FMAX:
3383 : : return false;
3384 : :
3385 : 2 : default:
3386 : 2 : return !flag_associative_math;
3387 : : }
3388 : : }
3389 : :
3390 : 5946765 : if (INTEGRAL_TYPE_P (type))
3391 : 5945936 : return (!code.is_tree_code ()
3392 : 5945936 : || !operation_no_trapping_overflow (type, tree_code (code)));
3393 : :
3394 : 829 : if (SAT_FIXED_POINT_TYPE_P (type))
3395 : : return true;
3396 : :
3397 : : return false;
3398 : : }
3399 : :
3400 : : /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
3401 : : has a handled computation expression. Store the main reduction
3402 : : operation in *CODE. */
3403 : :
3404 : : static bool
3405 : 64499 : check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3406 : : tree loop_arg, code_helper *code,
3407 : : vec<std::pair<ssa_op_iter, use_operand_p> > &path,
3408 : : bool inner_loop_of_double_reduc)
3409 : : {
3410 : 64499 : auto_bitmap visited;
3411 : 64499 : tree lookfor = PHI_RESULT (phi);
3412 : 64499 : ssa_op_iter curri;
3413 : 64499 : use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3414 : 135501 : while (USE_FROM_PTR (curr) != loop_arg)
3415 : 6503 : curr = op_iter_next_use (&curri);
3416 : 64499 : curri.i = curri.numops;
3417 : 623667 : do
3418 : : {
3419 : 623667 : path.safe_push (std::make_pair (curri, curr));
3420 : 623667 : tree use = USE_FROM_PTR (curr);
3421 : 623667 : if (use == lookfor)
3422 : : break;
3423 : 559459 : gimple *def = SSA_NAME_DEF_STMT (use);
3424 : 559459 : if (gimple_nop_p (def)
3425 : 559459 : || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3426 : : {
3427 : 473253 : pop:
3428 : 473253 : do
3429 : : {
3430 : 473253 : std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3431 : 473253 : curri = x.first;
3432 : 473253 : curr = x.second;
3433 : 519715 : do
3434 : 519715 : curr = op_iter_next_use (&curri);
3435 : : /* Skip already visited or non-SSA operands (from iterating
3436 : : over PHI args). */
3437 : : while (curr != NULL_USE_OPERAND_P
3438 : 1039430 : && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3439 : 178978 : || ! bitmap_set_bit (visited,
3440 : 178978 : SSA_NAME_VERSION
3441 : : (USE_FROM_PTR (curr)))));
3442 : : }
3443 : 946506 : while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3444 : 159046 : if (curr == NULL_USE_OPERAND_P)
3445 : : break;
3446 : : }
3447 : : else
3448 : : {
3449 : 469973 : if (gimple_code (def) == GIMPLE_PHI)
3450 : 48800 : curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3451 : : else
3452 : 421173 : curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3453 : : while (curr != NULL_USE_OPERAND_P
3454 : 565511 : && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3455 : 492127 : || ! bitmap_set_bit (visited,
3456 : 492127 : SSA_NAME_VERSION
3457 : : (USE_FROM_PTR (curr)))))
3458 : 95538 : curr = op_iter_next_use (&curri);
3459 : 469973 : if (curr == NULL_USE_OPERAND_P)
3460 : 69560 : goto pop;
3461 : : }
3462 : : }
3463 : : while (1);
3464 : 64499 : if (dump_file && (dump_flags & TDF_DETAILS))
3465 : : {
3466 : 3762 : dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3467 : 3762 : unsigned i;
3468 : 3762 : std::pair<ssa_op_iter, use_operand_p> *x;
3469 : 12848 : FOR_EACH_VEC_ELT (path, i, x)
3470 : 9086 : dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3471 : 3762 : dump_printf (MSG_NOTE, "\n");
3472 : : }
3473 : :
3474 : : /* Check whether the reduction path detected is valid. */
3475 : 64499 : bool fail = path.length () == 0;
3476 : 64499 : bool neg = false;
3477 : 64499 : int sign = -1;
3478 : 64499 : *code = ERROR_MARK;
3479 : 141703 : for (unsigned i = 1; i < path.length (); ++i)
3480 : : {
3481 : 80142 : gimple *use_stmt = USE_STMT (path[i].second);
3482 : 80142 : gimple_match_op op;
3483 : 80142 : if (!gimple_extract_op (use_stmt, &op))
3484 : : {
3485 : : fail = true;
3486 : 2938 : break;
3487 : : }
3488 : 79583 : unsigned int opi = op.num_ops;
3489 : 79583 : if (gassign *assign = dyn_cast<gassign *> (use_stmt))
3490 : : {
3491 : : /* The following make sure we can compute the operand index
3492 : : easily plus it mostly disallows chaining via COND_EXPR condition
3493 : : operands. */
3494 : 126200 : for (opi = 0; opi < op.num_ops; ++opi)
3495 : 125252 : if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
3496 : : break;
3497 : : }
3498 : 3506 : else if (gcall *call = dyn_cast<gcall *> (use_stmt))
3499 : : {
3500 : 7032 : for (opi = 0; opi < op.num_ops; ++opi)
3501 : 7032 : if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
3502 : : break;
3503 : : }
3504 : 79583 : if (opi == op.num_ops)
3505 : : {
3506 : : fail = true;
3507 : : break;
3508 : : }
3509 : 78635 : op.code = canonicalize_code (op.code, op.type);
3510 : 78635 : if (op.code == MINUS_EXPR)
3511 : : {
3512 : 3844 : op.code = PLUS_EXPR;
3513 : : /* Track whether we negate the reduction value each iteration. */
3514 : 3844 : if (op.ops[1] == op.ops[opi])
3515 : 32 : neg = ! neg;
3516 : : }
3517 : 74791 : else if (op.code == IFN_COND_SUB)
3518 : : {
3519 : 2 : op.code = IFN_COND_ADD;
3520 : : /* Track whether we negate the reduction value each iteration. */
3521 : 2 : if (op.ops[2] == op.ops[opi])
3522 : 0 : neg = ! neg;
3523 : : }
3524 : : /* For an FMA the reduction code is the PLUS if the addition chain
3525 : : is the reduction. */
3526 : 74789 : else if (op.code == IFN_FMA && opi == 2)
3527 : 28 : op.code = PLUS_EXPR;
3528 : 78635 : if (CONVERT_EXPR_CODE_P (op.code)
3529 : 78635 : && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
3530 : : ;
3531 : 75201 : else if (*code == ERROR_MARK)
3532 : : {
3533 : 62784 : *code = op.code;
3534 : 62784 : sign = TYPE_SIGN (op.type);
3535 : : }
3536 : 12417 : else if (op.code != *code)
3537 : : {
3538 : : fail = true;
3539 : : break;
3540 : : }
3541 : 11193 : else if ((op.code == MIN_EXPR
3542 : 11109 : || op.code == MAX_EXPR)
3543 : 11207 : && sign != TYPE_SIGN (op.type))
3544 : : {
3545 : : fail = true;
3546 : : break;
3547 : : }
3548 : : /* Check there's only a single stmt the op is used on. For the
3549 : : not value-changing tail and the last stmt allow out-of-loop uses,
3550 : : but not when this is the inner loop of a double reduction.
3551 : : ??? We could relax this and handle arbitrary live stmts by
3552 : : forcing a scalar epilogue for example. */
3553 : 77408 : imm_use_iterator imm_iter;
3554 : 77408 : use_operand_p use_p;
3555 : 77408 : gimple *op_use_stmt;
3556 : 77408 : unsigned cnt = 0;
3557 : 80884 : bool cond_fn_p = op.code.is_internal_fn ()
3558 : 3476 : && (conditional_internal_fn_code (internal_fn (op.code))
3559 : 77408 : != ERROR_MARK);
3560 : :
3561 : 257411 : FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
3562 : : {
3563 : : /* In case of a COND_OP (mask, op1, op2, op1) reduction we should
3564 : : have op1 twice (once as definition, once as else) in the same
3565 : : operation. Enforce this. */
3566 : 102595 : if (cond_fn_p && op_use_stmt == use_stmt)
3567 : : {
3568 : 3420 : gcall *call = as_a<gcall *> (use_stmt);
3569 : 3420 : unsigned else_pos
3570 : 3420 : = internal_fn_else_index (internal_fn (op.code));
3571 : 3420 : if (gimple_call_arg (call, else_pos) != op.ops[opi])
3572 : : {
3573 : : fail = true;
3574 : : break;
3575 : : }
3576 : 17100 : for (unsigned int j = 0; j < gimple_call_num_args (call); ++j)
3577 : : {
3578 : 13680 : if (j == else_pos)
3579 : 3420 : continue;
3580 : 10260 : if (gimple_call_arg (call, j) == op.ops[opi])
3581 : 3420 : cnt++;
3582 : : }
3583 : : }
3584 : 99175 : else if (!is_gimple_debug (op_use_stmt)
3585 : 99175 : && ((*code != ERROR_MARK || inner_loop_of_double_reduc)
3586 : 1771 : || flow_bb_inside_loop_p (loop,
3587 : 1771 : gimple_bb (op_use_stmt))))
3588 : 148531 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3589 : 74270 : cnt++;
3590 : 77408 : }
3591 : :
3592 : 77408 : if (cnt != 1)
3593 : : {
3594 : : fail = true;
3595 : : break;
3596 : : }
3597 : : }
3598 : 67737 : return ! fail && ! neg && *code != ERROR_MARK;
3599 : 64499 : }
3600 : :
3601 : : bool
3602 : 21 : check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3603 : : tree loop_arg, enum tree_code code)
3604 : : {
3605 : 21 : auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3606 : 21 : code_helper code_;
3607 : 21 : return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path, false)
3608 : 21 : && code_ == code);
3609 : 21 : }
3610 : :
3611 : :
3612 : :
3613 : : /* Function vect_is_simple_reduction
3614 : :
3615 : : (1) Detect a cross-iteration def-use cycle that represents a simple
3616 : : reduction computation. We look for the following pattern:
3617 : :
3618 : : loop_header:
3619 : : a1 = phi < a0, a2 >
3620 : : a3 = ...
3621 : : a2 = operation (a3, a1)
3622 : :
3623 : : or
3624 : :
3625 : : a3 = ...
3626 : : loop_header:
3627 : : a1 = phi < a0, a2 >
3628 : : a2 = operation (a3, a1)
3629 : :
3630 : : such that:
3631 : : 1. operation is commutative and associative and it is safe to
3632 : : change the order of the computation
3633 : : 2. no uses for a2 in the loop (a2 is used out of the loop)
3634 : : 3. no uses of a1 in the loop besides the reduction operation
3635 : : 4. no uses of a1 outside the loop.
3636 : :
3637 : : Conditions 1,4 are tested here.
3638 : : Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3639 : :
3640 : : (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3641 : : nested cycles.
3642 : :
3643 : : (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3644 : : reductions:
3645 : :
3646 : : a1 = phi < a0, a2 >
3647 : : inner loop (def of a3)
3648 : : a2 = phi < a3 >
3649 : :
3650 : : (4) Detect condition expressions, ie:
3651 : : for (int i = 0; i < N; i++)
3652 : : if (a[i] < val)
3653 : : ret_val = a[i];
3654 : :
3655 : : */
3656 : :
3657 : : static stmt_vec_info
3658 : 85696 : vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3659 : : gphi **double_reduc)
3660 : : {
3661 : 85696 : gphi *phi = as_a <gphi *> (phi_info->stmt);
3662 : 85696 : gimple *phi_use_stmt = NULL;
3663 : 85696 : imm_use_iterator imm_iter;
3664 : 85696 : use_operand_p use_p;
3665 : :
3666 : : /* When double_reduc is NULL we are testing the inner loop of a
3667 : : double reduction. */
3668 : 85696 : bool inner_loop_of_double_reduc = double_reduc == NULL;
3669 : 85696 : if (double_reduc)
3670 : 84670 : *double_reduc = NULL;
3671 : 85696 : STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3672 : :
3673 : 85696 : tree phi_name = PHI_RESULT (phi);
3674 : : /* ??? If there are no uses of the PHI result the inner loop reduction
3675 : : won't be detected as possibly double-reduction by vectorizable_reduction
3676 : : because that tries to walk the PHI arg from the preheader edge which
3677 : : can be constant. See PR60382. */
3678 : 85696 : if (has_zero_uses (phi_name))
3679 : : return NULL;
3680 : 85521 : class loop *loop = (gimple_bb (phi))->loop_father;
3681 : 85521 : unsigned nphi_def_loop_uses = 0;
3682 : 297923 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3683 : : {
3684 : 130690 : gimple *use_stmt = USE_STMT (use_p);
3685 : 130690 : if (is_gimple_debug (use_stmt))
3686 : 31433 : continue;
3687 : :
3688 : 99257 : if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3689 : : {
3690 : 3809 : if (dump_enabled_p ())
3691 : 30 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3692 : : "intermediate value used outside loop.\n");
3693 : :
3694 : 3809 : return NULL;
3695 : : }
3696 : :
3697 : : /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
3698 : : op1 twice (once as definition, once as else) in the same operation.
3699 : : Only count it as one. */
3700 : 95448 : if (use_stmt != phi_use_stmt)
3701 : : {
3702 : 91767 : nphi_def_loop_uses++;
3703 : 91767 : phi_use_stmt = use_stmt;
3704 : : }
3705 : 3809 : }
3706 : :
3707 : 81712 : tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3708 : 81712 : if (TREE_CODE (latch_def) != SSA_NAME)
3709 : : {
3710 : 1233 : if (dump_enabled_p ())
3711 : 2 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3712 : : "reduction: not ssa_name: %T\n", latch_def);
3713 : 1233 : return NULL;
3714 : : }
3715 : :
3716 : 80479 : stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3717 : 80479 : if (!def_stmt_info
3718 : 80479 : || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3719 : 134 : return NULL;
3720 : :
3721 : 80345 : bool nested_in_vect_loop
3722 : 80345 : = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3723 : 80345 : unsigned nlatch_def_loop_uses = 0;
3724 : 80345 : auto_vec<gphi *, 3> lcphis;
3725 : 382924 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3726 : : {
3727 : 222234 : gimple *use_stmt = USE_STMT (use_p);
3728 : 222234 : if (is_gimple_debug (use_stmt))
3729 : 61333 : continue;
3730 : 160901 : if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3731 : 89027 : nlatch_def_loop_uses++;
3732 : : else
3733 : : /* We can have more than one loop-closed PHI. */
3734 : 71874 : lcphis.safe_push (as_a <gphi *> (use_stmt));
3735 : 80345 : }
3736 : :
3737 : : /* If we are vectorizing an inner reduction we are executing that
3738 : : in the original order only in case we are not dealing with a
3739 : : double reduction. */
3740 : 80345 : if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3741 : : {
3742 : 2192 : if (dump_enabled_p ())
3743 : 357 : report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3744 : : "detected nested cycle: ");
3745 : 2192 : return def_stmt_info;
3746 : : }
3747 : :
3748 : : /* When the inner loop of a double reduction ends up with more than
3749 : : one loop-closed PHI we have failed to classify alternate such
3750 : : PHIs as double reduction, leading to wrong code. See PR103237. */
3751 : 79167 : if (inner_loop_of_double_reduc && lcphis.length () != 1)
3752 : : {
3753 : 1 : if (dump_enabled_p ())
3754 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3755 : : "unhandle double reduction\n");
3756 : 1 : return NULL;
3757 : : }
3758 : :
3759 : : /* If this isn't a nested cycle or if the nested cycle reduction value
3760 : : is used ouside of the inner loop we cannot handle uses of the reduction
3761 : : value. */
3762 : 78152 : if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3763 : : {
3764 : 12480 : if (dump_enabled_p ())
3765 : 314 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3766 : : "reduction used in loop.\n");
3767 : 12480 : return NULL;
3768 : : }
3769 : :
3770 : : /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3771 : : defined in the inner loop. */
3772 : 65672 : if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3773 : : {
3774 : 1194 : tree op1 = PHI_ARG_DEF (def_stmt, 0);
3775 : 1194 : if (gimple_phi_num_args (def_stmt) != 1
3776 : 1194 : || TREE_CODE (op1) != SSA_NAME)
3777 : : {
3778 : 38 : if (dump_enabled_p ())
3779 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3780 : : "unsupported phi node definition.\n");
3781 : :
3782 : 38 : return NULL;
3783 : : }
3784 : :
3785 : : /* Verify there is an inner cycle composed of the PHI phi_use_stmt
3786 : : and the latch definition op1. */
3787 : 1156 : gimple *def1 = SSA_NAME_DEF_STMT (op1);
3788 : 1156 : if (gimple_bb (def1)
3789 : 1156 : && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3790 : 1156 : && loop->inner
3791 : 1132 : && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3792 : 1132 : && (is_gimple_assign (def1) || is_gimple_call (def1))
3793 : 1123 : && is_a <gphi *> (phi_use_stmt)
3794 : 1112 : && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
3795 : 1112 : && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
3796 : : loop_latch_edge (loop->inner)))
3797 : 2266 : && lcphis.length () == 1)
3798 : : {
3799 : 1026 : if (dump_enabled_p ())
3800 : 132 : report_vect_op (MSG_NOTE, def_stmt,
3801 : : "detected double reduction: ");
3802 : :
3803 : 1026 : *double_reduc = as_a <gphi *> (phi_use_stmt);
3804 : 1026 : return def_stmt_info;
3805 : : }
3806 : :
3807 : 130 : return NULL;
3808 : : }
3809 : :
3810 : : /* Look for the expression computing latch_def from then loop PHI result. */
3811 : 64478 : auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3812 : 64478 : code_helper code;
3813 : 64478 : if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3814 : : path, inner_loop_of_double_reduc))
3815 : : {
3816 : 61240 : STMT_VINFO_REDUC_CODE (phi_info) = code;
3817 : 61240 : if (code == COND_EXPR && !nested_in_vect_loop)
3818 : 4170 : STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3819 : :
3820 : : /* Fill in STMT_VINFO_REDUC_IDX. */
3821 : 61240 : unsigned i;
3822 : 198180 : for (i = path.length () - 1; i >= 1; --i)
3823 : : {
3824 : 75700 : gimple *stmt = USE_STMT (path[i].second);
3825 : 75700 : stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3826 : 75700 : gimple_match_op op;
3827 : 75700 : if (!gimple_extract_op (stmt, &op))
3828 : 0 : gcc_unreachable ();
3829 : 75700 : if (gassign *assign = dyn_cast<gassign *> (stmt))
3830 : 72214 : STMT_VINFO_REDUC_IDX (stmt_info)
3831 : 72214 : = path[i].second->use - gimple_assign_rhs1_ptr (assign);
3832 : : else
3833 : : {
3834 : 3486 : gcall *call = as_a<gcall *> (stmt);
3835 : 3486 : STMT_VINFO_REDUC_IDX (stmt_info)
3836 : 3486 : = path[i].second->use - gimple_call_arg_ptr (call, 0);
3837 : : }
3838 : : }
3839 : 61240 : if (dump_enabled_p ())
3840 : 3706 : dump_printf_loc (MSG_NOTE, vect_location,
3841 : : "reduction: detected reduction\n");
3842 : :
3843 : 61240 : return def_stmt_info;
3844 : : }
3845 : :
3846 : 3238 : if (dump_enabled_p ())
3847 : 80 : dump_printf_loc (MSG_NOTE, vect_location,
3848 : : "reduction: unknown pattern\n");
3849 : :
3850 : : return NULL;
3851 : 144823 : }
3852 : :
3853 : : /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3854 : : PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3855 : : or -1 if not known. */
3856 : :
3857 : : static int
3858 : 352512 : vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
3859 : : {
3860 : 352512 : int assumed_vf = vect_vf_for_cost (loop_vinfo);
3861 : 352512 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
3862 : : {
3863 : 140443 : if (dump_enabled_p ())
3864 : 2813 : dump_printf_loc (MSG_NOTE, vect_location,
3865 : : "cost model: epilogue peel iters set to vf/2 "
3866 : : "because loop iterations are unknown .\n");
3867 : 140443 : return assumed_vf / 2;
3868 : : }
3869 : : else
3870 : : {
3871 : 212069 : int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3872 : 212069 : peel_iters_prologue = MIN (niters, peel_iters_prologue);
3873 : 212069 : int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3874 : : /* If we need to peel for gaps, but no peeling is required, we have to
3875 : : peel VF iterations. */
3876 : 212069 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
3877 : 212069 : peel_iters_epilogue = assumed_vf;
3878 : 212069 : return peel_iters_epilogue;
3879 : : }
3880 : : }
3881 : :
3882 : : /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3883 : : int
3884 : 269163 : vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3885 : : int *peel_iters_epilogue,
3886 : : stmt_vector_for_cost *scalar_cost_vec,
3887 : : stmt_vector_for_cost *prologue_cost_vec,
3888 : : stmt_vector_for_cost *epilogue_cost_vec)
3889 : : {
3890 : 269163 : int retval = 0;
3891 : :
3892 : 269163 : *peel_iters_epilogue
3893 : 269163 : = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
3894 : :
3895 : 269163 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3896 : : {
3897 : : /* If peeled iterations are known but number of scalar loop
3898 : : iterations are unknown, count a taken branch per peeled loop. */
3899 : 90195 : if (peel_iters_prologue > 0)
3900 : 54307 : retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3901 : : vect_prologue);
3902 : 90195 : if (*peel_iters_epilogue > 0)
3903 : 90120 : retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3904 : : vect_epilogue);
3905 : : }
3906 : :
3907 : 269163 : stmt_info_for_cost *si;
3908 : 269163 : int j;
3909 : 269163 : if (peel_iters_prologue)
3910 : 571409 : FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3911 : 456360 : retval += record_stmt_cost (prologue_cost_vec,
3912 : 456360 : si->count * peel_iters_prologue,
3913 : : si->kind, si->stmt_info, si->misalign,
3914 : : vect_prologue);
3915 : 269163 : if (*peel_iters_epilogue)
3916 : 905098 : FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3917 : 725091 : retval += record_stmt_cost (epilogue_cost_vec,
3918 : 725091 : si->count * *peel_iters_epilogue,
3919 : : si->kind, si->stmt_info, si->misalign,
3920 : : vect_epilogue);
3921 : :
3922 : 269163 : return retval;
3923 : : }
3924 : :
3925 : : /* Function vect_estimate_min_profitable_iters
3926 : :
3927 : : Return the number of iterations required for the vector version of the
3928 : : loop to be profitable relative to the cost of the scalar version of the
3929 : : loop.
3930 : :
3931 : : *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3932 : : of iterations for vectorization. -1 value means loop vectorization
3933 : : is not profitable. This returned value may be used for dynamic
3934 : : profitability check.
3935 : :
3936 : : *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3937 : : for static check against estimated number of iterations. */
3938 : :
3939 : : static void
3940 : 99930 : vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3941 : : int *ret_min_profitable_niters,
3942 : : int *ret_min_profitable_estimate,
3943 : : unsigned *suggested_unroll_factor)
3944 : : {
3945 : 99930 : int min_profitable_iters;
3946 : 99930 : int min_profitable_estimate;
3947 : 99930 : int peel_iters_prologue;
3948 : 99930 : int peel_iters_epilogue;
3949 : 99930 : unsigned vec_inside_cost = 0;
3950 : 99930 : int vec_outside_cost = 0;
3951 : 99930 : unsigned vec_prologue_cost = 0;
3952 : 99930 : unsigned vec_epilogue_cost = 0;
3953 : 99930 : int scalar_single_iter_cost = 0;
3954 : 99930 : int scalar_outside_cost = 0;
3955 : 99930 : int assumed_vf = vect_vf_for_cost (loop_vinfo);
3956 : 99930 : int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3957 : 99930 : vector_costs *target_cost_data = loop_vinfo->vector_costs;
3958 : :
3959 : : /* Cost model disabled. */
3960 : 99930 : if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3961 : : {
3962 : 16376 : if (dump_enabled_p ())
3963 : 10097 : dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3964 : 16376 : *ret_min_profitable_niters = 0;
3965 : 16376 : *ret_min_profitable_estimate = 0;
3966 : 16376 : return;
3967 : : }
3968 : :
3969 : : /* Requires loop versioning tests to handle misalignment. */
3970 : 83554 : if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3971 : : {
3972 : : /* FIXME: Make cost depend on complexity of individual check. */
3973 : 28 : unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3974 : 28 : (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
3975 : 28 : if (dump_enabled_p ())
3976 : 1 : dump_printf (MSG_NOTE,
3977 : : "cost model: Adding cost of checks for loop "
3978 : : "versioning to treat misalignment.\n");
3979 : : }
3980 : :
3981 : : /* Requires loop versioning with alias checks. */
3982 : 83554 : if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3983 : : {
3984 : : /* FIXME: Make cost depend on complexity of individual check. */
3985 : 4041 : unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3986 : 4041 : (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
3987 : 4041 : len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3988 : 0 : if (len)
3989 : : /* Count LEN - 1 ANDs and LEN comparisons. */
3990 : 0 : (void) add_stmt_cost (target_cost_data, len * 2 - 1,
3991 : : scalar_stmt, vect_prologue);
3992 : 4041 : len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3993 : 1090 : if (len)
3994 : : {
3995 : : /* Count LEN - 1 ANDs and LEN comparisons. */
3996 : 1090 : unsigned int nstmts = len * 2 - 1;
3997 : : /* +1 for each bias that needs adding. */
3998 : 2180 : for (unsigned int i = 0; i < len; ++i)
3999 : 1090 : if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4000 : 123 : nstmts += 1;
4001 : 1090 : (void) add_stmt_cost (target_cost_data, nstmts,
4002 : : scalar_stmt, vect_prologue);
4003 : : }
4004 : 4041 : if (dump_enabled_p ())
4005 : 16 : dump_printf (MSG_NOTE,
4006 : : "cost model: Adding cost of checks for loop "
4007 : : "versioning aliasing.\n");
4008 : : }
4009 : :
4010 : : /* Requires loop versioning with niter checks. */
4011 : 83554 : if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4012 : : {
4013 : : /* FIXME: Make cost depend on complexity of individual check. */
4014 : 664 : (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4015 : : NULL, NULL, NULL_TREE, 0, vect_prologue);
4016 : 664 : if (dump_enabled_p ())
4017 : 1 : dump_printf (MSG_NOTE,
4018 : : "cost model: Adding cost of checks for loop "
4019 : : "versioning niters.\n");
4020 : : }
4021 : :
4022 : 83554 : if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4023 : 4719 : (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4024 : : vect_prologue);
4025 : :
4026 : : /* Count statements in scalar loop. Using this as scalar cost for a single
4027 : : iteration for now.
4028 : :
4029 : : TODO: Add outer loop support.
4030 : :
4031 : : TODO: Consider assigning different costs to different scalar
4032 : : statements. */
4033 : :
4034 : 83554 : scalar_single_iter_cost = (loop_vinfo->scalar_costs->total_cost ()
4035 : 83554 : * param_vect_scalar_cost_multiplier) / 100;
4036 : :
4037 : : /* Add additional cost for the peeled instructions in prologue and epilogue
4038 : : loop. (For fully-masked loops there will be no peeling.)
4039 : :
4040 : : FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4041 : : at compile-time - we assume it's vf/2 (the worst would be vf-1).
4042 : :
4043 : : TODO: Build an expression that represents peel_iters for prologue and
4044 : : epilogue to be used in a run-time test. */
4045 : :
4046 : 83554 : bool prologue_need_br_taken_cost = false;
4047 : 83554 : bool prologue_need_br_not_taken_cost = false;
4048 : :
4049 : : /* Calculate peel_iters_prologue. */
4050 : 83554 : if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4051 : : peel_iters_prologue = 0;
4052 : 83554 : else if (npeel < 0)
4053 : : {
4054 : 183 : peel_iters_prologue = assumed_vf / 2;
4055 : 183 : if (dump_enabled_p ())
4056 : 4 : dump_printf (MSG_NOTE, "cost model: "
4057 : : "prologue peel iters set to vf/2.\n");
4058 : :
4059 : : /* If peeled iterations are unknown, count a taken branch and a not taken
4060 : : branch per peeled loop. Even if scalar loop iterations are known,
4061 : : vector iterations are not known since peeled prologue iterations are
4062 : : not known. Hence guards remain the same. */
4063 : : prologue_need_br_taken_cost = true;
4064 : : prologue_need_br_not_taken_cost = true;
4065 : : }
4066 : : else
4067 : : {
4068 : 83371 : peel_iters_prologue = npeel;
4069 : 83371 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4070 : : /* If peeled iterations are known but number of scalar loop
4071 : : iterations are unknown, count a taken branch per peeled loop. */
4072 : 83554 : prologue_need_br_taken_cost = true;
4073 : : }
4074 : :
4075 : 83554 : bool epilogue_need_br_taken_cost = false;
4076 : 83554 : bool epilogue_need_br_not_taken_cost = false;
4077 : :
4078 : : /* Calculate peel_iters_epilogue. */
4079 : 83554 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4080 : : /* We need to peel exactly one iteration for gaps. */
4081 : 22 : peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4082 : 83532 : else if (npeel < 0)
4083 : : {
4084 : : /* If peeling for alignment is unknown, loop bound of main loop
4085 : : becomes unknown. */
4086 : 183 : peel_iters_epilogue = assumed_vf / 2;
4087 : 183 : if (dump_enabled_p ())
4088 : 4 : dump_printf (MSG_NOTE, "cost model: "
4089 : : "epilogue peel iters set to vf/2 because "
4090 : : "peeling for alignment is unknown.\n");
4091 : :
4092 : : /* See the same reason above in peel_iters_prologue calculation. */
4093 : : epilogue_need_br_taken_cost = true;
4094 : : epilogue_need_br_not_taken_cost = true;
4095 : : }
4096 : : else
4097 : : {
4098 : 83349 : peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4099 : 83349 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4100 : : /* If peeled iterations are known but number of scalar loop
4101 : : iterations are unknown, count a taken branch per peeled loop. */
4102 : 83554 : epilogue_need_br_taken_cost = true;
4103 : : }
4104 : :
4105 : 83554 : stmt_info_for_cost *si;
4106 : 83554 : int j;
4107 : : /* Add costs associated with peel_iters_prologue. */
4108 : 83554 : if (peel_iters_prologue)
4109 : 676 : FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4110 : : {
4111 : 483 : (void) add_stmt_cost (target_cost_data,
4112 : 483 : si->count * peel_iters_prologue, si->kind,
4113 : : si->stmt_info, si->node, si->vectype,
4114 : : si->misalign, vect_prologue);
4115 : : }
4116 : :
4117 : : /* Add costs associated with peel_iters_epilogue. */
4118 : 83554 : if (peel_iters_epilogue)
4119 : 283948 : FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4120 : : {
4121 : 224658 : (void) add_stmt_cost (target_cost_data,
4122 : 224658 : si->count * peel_iters_epilogue, si->kind,
4123 : : si->stmt_info, si->node, si->vectype,
4124 : : si->misalign, vect_epilogue);
4125 : : }
4126 : :
4127 : : /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4128 : :
4129 : 83554 : if (prologue_need_br_taken_cost)
4130 : 184 : (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4131 : : vect_prologue);
4132 : :
4133 : 83554 : if (prologue_need_br_not_taken_cost)
4134 : 183 : (void) add_stmt_cost (target_cost_data, 1,
4135 : : cond_branch_not_taken, vect_prologue);
4136 : :
4137 : 83554 : if (epilogue_need_br_taken_cost)
4138 : 49832 : (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4139 : : vect_epilogue);
4140 : :
4141 : 83554 : if (epilogue_need_br_not_taken_cost)
4142 : 183 : (void) add_stmt_cost (target_cost_data, 1,
4143 : : cond_branch_not_taken, vect_epilogue);
4144 : :
4145 : : /* Take care of special costs for rgroup controls of partial vectors. */
4146 : 22 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4147 : 83576 : && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4148 : : == vect_partial_vectors_avx512))
4149 : : {
4150 : : /* Calculate how many masks we need to generate. */
4151 : 22 : unsigned int num_masks = 0;
4152 : 22 : bool need_saturation = false;
4153 : 90 : for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
4154 : 24 : if (rgm.type)
4155 : : {
4156 : 22 : unsigned nvectors = rgm.factor;
4157 : 22 : num_masks += nvectors;
4158 : 22 : if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
4159 : 22 : < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
4160 : 7 : need_saturation = true;
4161 : : }
4162 : :
4163 : : /* ??? The target isn't able to identify the costs below as
4164 : : producing masks so it cannot penaltize cases where we'd run
4165 : : out of mask registers for example. */
4166 : :
4167 : : /* ??? We are also failing to account for smaller vector masks
4168 : : we generate by splitting larger masks in vect_get_loop_mask. */
4169 : :
4170 : : /* In the worst case, we need to generate each mask in the prologue
4171 : : and in the loop body. We need one splat per group and one
4172 : : compare per mask.
4173 : :
4174 : : Sometimes the prologue mask will fold to a constant,
4175 : : so the actual prologue cost might be smaller. However, it's
4176 : : simpler and safer to use the worst-case cost; if this ends up
4177 : : being the tie-breaker between vectorizing or not, then it's
4178 : : probably better not to vectorize. */
4179 : 22 : (void) add_stmt_cost (target_cost_data,
4180 : : num_masks
4181 : 22 : + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4182 : : vector_stmt, NULL, NULL, NULL_TREE, 0,
4183 : : vect_prologue);
4184 : 44 : (void) add_stmt_cost (target_cost_data,
4185 : : num_masks
4186 : 44 : + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4187 : : vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);
4188 : :
4189 : : /* When we need saturation we need it both in the prologue and
4190 : : the epilogue. */
4191 : 22 : if (need_saturation)
4192 : : {
4193 : 7 : (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4194 : : NULL, NULL, NULL_TREE, 0, vect_prologue);
4195 : 7 : (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4196 : : NULL, NULL, NULL_TREE, 0, vect_body);
4197 : : }
4198 : : }
4199 : 0 : else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4200 : 83532 : && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4201 : : == vect_partial_vectors_while_ult))
4202 : : {
4203 : : /* Calculate how many masks we need to generate. */
4204 : : unsigned int num_masks = 0;
4205 : : rgroup_controls *rgm;
4206 : : unsigned int num_vectors_m1;
4207 : 0 : FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
4208 : : num_vectors_m1, rgm)
4209 : 0 : if (rgm->type)
4210 : 0 : num_masks += num_vectors_m1 + 1;
4211 : 0 : gcc_assert (num_masks > 0);
4212 : :
4213 : : /* In the worst case, we need to generate each mask in the prologue
4214 : : and in the loop body. One of the loop body mask instructions
4215 : : replaces the comparison in the scalar loop, and since we don't
4216 : : count the scalar comparison against the scalar body, we shouldn't
4217 : : count that vector instruction against the vector body either.
4218 : :
4219 : : Sometimes we can use unpacks instead of generating prologue
4220 : : masks and sometimes the prologue mask will fold to a constant,
4221 : : so the actual prologue cost might be smaller. However, it's
4222 : : simpler and safer to use the worst-case cost; if this ends up
4223 : : being the tie-breaker between vectorizing or not, then it's
4224 : : probably better not to vectorize. */
4225 : 0 : (void) add_stmt_cost (target_cost_data, num_masks,
4226 : : vector_stmt, NULL, NULL, NULL_TREE, 0,
4227 : : vect_prologue);
4228 : 0 : (void) add_stmt_cost (target_cost_data, num_masks - 1,
4229 : : vector_stmt, NULL, NULL, NULL_TREE, 0,
4230 : : vect_body);
4231 : : }
4232 : 83532 : else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4233 : : {
4234 : : /* Referring to the functions vect_set_loop_condition_partial_vectors
4235 : : and vect_set_loop_controls_directly, we need to generate each
4236 : : length in the prologue and in the loop body if required. Although
4237 : : there are some possible optimizations, we consider the worst case
4238 : : here. */
4239 : :
4240 : 0 : bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4241 : 0 : signed char partial_load_store_bias
4242 : : = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4243 : 0 : bool need_iterate_p
4244 : 0 : = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4245 : 0 : && !vect_known_niters_smaller_than_vf (loop_vinfo));
4246 : :
4247 : : /* Calculate how many statements to be added. */
4248 : 0 : unsigned int prologue_stmts = 0;
4249 : 0 : unsigned int body_stmts = 0;
4250 : :
4251 : 0 : rgroup_controls *rgc;
4252 : 0 : unsigned int num_vectors_m1;
4253 : 0 : FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4254 : 0 : if (rgc->type)
4255 : : {
4256 : : /* May need one SHIFT for nitems_total computation. */
4257 : 0 : unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4258 : 0 : if (nitems != 1 && !niters_known_p)
4259 : 0 : prologue_stmts += 1;
4260 : :
4261 : : /* May need one MAX and one MINUS for wrap around. */
4262 : 0 : if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4263 : 0 : prologue_stmts += 2;
4264 : :
4265 : : /* Need one MAX and one MINUS for each batch limit excepting for
4266 : : the 1st one. */
4267 : 0 : prologue_stmts += num_vectors_m1 * 2;
4268 : :
4269 : 0 : unsigned int num_vectors = num_vectors_m1 + 1;
4270 : :
4271 : : /* Need to set up lengths in prologue, only one MIN required
4272 : : for each since start index is zero. */
4273 : 0 : prologue_stmts += num_vectors;
4274 : :
4275 : : /* If we have a non-zero partial load bias, we need one PLUS
4276 : : to adjust the load length. */
4277 : 0 : if (partial_load_store_bias != 0)
4278 : 0 : body_stmts += 1;
4279 : :
4280 : 0 : unsigned int length_update_cost = 0;
4281 : 0 : if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
4282 : : /* For decrement IV style, Each only need a single SELECT_VL
4283 : : or MIN since beginning to calculate the number of elements
4284 : : need to be processed in current iteration. */
4285 : : length_update_cost = 1;
4286 : : else
4287 : : /* For increment IV stype, Each may need two MINs and one MINUS to
4288 : : update lengths in body for next iteration. */
4289 : 0 : length_update_cost = 3;
4290 : :
4291 : 0 : if (need_iterate_p)
4292 : 0 : body_stmts += length_update_cost * num_vectors;
4293 : : }
4294 : :
4295 : 0 : (void) add_stmt_cost (target_cost_data, prologue_stmts,
4296 : : scalar_stmt, vect_prologue);
4297 : 0 : (void) add_stmt_cost (target_cost_data, body_stmts,
4298 : : scalar_stmt, vect_body);
4299 : : }
4300 : :
4301 : : /* FORNOW: The scalar outside cost is incremented in one of the
4302 : : following ways:
4303 : :
4304 : : 1. The vectorizer checks for alignment and aliasing and generates
4305 : : a condition that allows dynamic vectorization. A cost model
4306 : : check is ANDED with the versioning condition. Hence scalar code
4307 : : path now has the added cost of the versioning check.
4308 : :
4309 : : if (cost > th & versioning_check)
4310 : : jmp to vector code
4311 : :
4312 : : Hence run-time scalar is incremented by not-taken branch cost.
4313 : :
4314 : : 2. The vectorizer then checks if a prologue is required. If the
4315 : : cost model check was not done before during versioning, it has to
4316 : : be done before the prologue check.
4317 : :
4318 : : if (cost <= th)
4319 : : prologue = scalar_iters
4320 : : if (prologue == 0)
4321 : : jmp to vector code
4322 : : else
4323 : : execute prologue
4324 : : if (prologue == num_iters)
4325 : : go to exit
4326 : :
4327 : : Hence the run-time scalar cost is incremented by a taken branch,
4328 : : plus a not-taken branch, plus a taken branch cost.
4329 : :
4330 : : 3. The vectorizer then checks if an epilogue is required. If the
4331 : : cost model check was not done before during prologue check, it
4332 : : has to be done with the epilogue check.
4333 : :
4334 : : if (prologue == 0)
4335 : : jmp to vector code
4336 : : else
4337 : : execute prologue
4338 : : if (prologue == num_iters)
4339 : : go to exit
4340 : : vector code:
4341 : : if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4342 : : jmp to epilogue
4343 : :
4344 : : Hence the run-time scalar cost should be incremented by 2 taken
4345 : : branches.
4346 : :
4347 : : TODO: The back end may reorder the BBS's differently and reverse
4348 : : conditions/branch directions. Change the estimates below to
4349 : : something more reasonable. */
4350 : :
4351 : : /* If the number of iterations is known and we do not do versioning, we can
4352 : : decide whether to vectorize at compile time. Hence the scalar version
4353 : : do not carry cost model guard costs. */
4354 : 33145 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4355 : 116699 : || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4356 : : {
4357 : : /* Cost model check occurs at versioning. */
4358 : 50997 : if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4359 : 4719 : scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4360 : : else
4361 : : {
4362 : : /* Cost model check occurs at prologue generation. */
4363 : 46278 : if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4364 : 38 : scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4365 : 38 : + vect_get_stmt_cost (cond_branch_not_taken);
4366 : : /* Cost model check occurs at epilogue generation. */
4367 : : else
4368 : 46240 : scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4369 : : }
4370 : : }
4371 : :
4372 : : /* Complete the target-specific cost calculations. */
4373 : 83554 : loop_vinfo->vector_costs->finish_cost (loop_vinfo->scalar_costs);
4374 : 83554 : vec_prologue_cost = loop_vinfo->vector_costs->prologue_cost ();
4375 : 83554 : vec_inside_cost = loop_vinfo->vector_costs->body_cost ();
4376 : 83554 : vec_epilogue_cost = loop_vinfo->vector_costs->epilogue_cost ();
4377 : 83554 : if (suggested_unroll_factor)
4378 : 83367 : *suggested_unroll_factor
4379 : 83367 : = loop_vinfo->vector_costs->suggested_unroll_factor ();
4380 : :
4381 : 83367 : if (suggested_unroll_factor && *suggested_unroll_factor > 1
4382 : 233 : && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
4383 : 0 : && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
4384 : : *suggested_unroll_factor,
4385 : : LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
4386 : : {
4387 : 0 : if (dump_enabled_p ())
4388 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4389 : : "can't unroll as unrolled vectorization factor larger"
4390 : : " than maximum vectorization factor: "
4391 : : HOST_WIDE_INT_PRINT_UNSIGNED "\n",
4392 : : LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
4393 : 0 : *suggested_unroll_factor = 1;
4394 : : }
4395 : :
4396 : 83554 : vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4397 : :
4398 : 83554 : if (dump_enabled_p ())
4399 : : {
4400 : 627 : dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4401 : 627 : dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4402 : : vec_inside_cost);
4403 : 627 : dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4404 : : vec_prologue_cost);
4405 : 627 : dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4406 : : vec_epilogue_cost);
4407 : 627 : dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4408 : : scalar_single_iter_cost);
4409 : 627 : dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4410 : : scalar_outside_cost);
4411 : 627 : dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4412 : : vec_outside_cost);
4413 : 627 : dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4414 : : peel_iters_prologue);
4415 : 627 : dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4416 : : peel_iters_epilogue);
4417 : : }
4418 : :
4419 : : /* Calculate number of iterations required to make the vector version
4420 : : profitable, relative to the loop bodies only. The following condition
4421 : : must hold true:
4422 : : SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4423 : : where
4424 : : SIC = scalar iteration cost, VIC = vector iteration cost,
4425 : : VOC = vector outside cost, VF = vectorization factor,
4426 : : NPEEL = prologue iterations + epilogue iterations,
4427 : : SOC = scalar outside cost for run time cost model check. */
4428 : :
4429 : 83554 : int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4430 : 83554 : - vec_inside_cost);
4431 : 83554 : if (saving_per_viter <= 0)
4432 : : {
4433 : 26011 : if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4434 : 0 : warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4435 : : "vectorization did not happen for a simd loop");
4436 : :
4437 : 26011 : if (dump_enabled_p ())
4438 : 22 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4439 : : "cost model: the vector iteration cost = %d "
4440 : : "divided by the scalar iteration cost = %d "
4441 : : "is greater or equal to the vectorization factor = %d"
4442 : : ".\n",
4443 : : vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4444 : 26011 : *ret_min_profitable_niters = -1;
4445 : 26011 : *ret_min_profitable_estimate = -1;
4446 : 26011 : return;
4447 : : }
4448 : :
4449 : : /* ??? The "if" arm is written to handle all cases; see below for what
4450 : : we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4451 : 57543 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4452 : : {
4453 : : /* Rewriting the condition above in terms of the number of
4454 : : vector iterations (vniters) rather than the number of
4455 : : scalar iterations (niters) gives:
4456 : :
4457 : : SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4458 : :
4459 : : <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4460 : :
4461 : : For integer N, X and Y when X > 0:
4462 : :
4463 : : N * X > Y <==> N >= (Y /[floor] X) + 1. */
4464 : 14 : int outside_overhead = (vec_outside_cost
4465 : 14 : - scalar_single_iter_cost * peel_iters_prologue
4466 : 14 : - scalar_single_iter_cost * peel_iters_epilogue
4467 : : - scalar_outside_cost);
4468 : : /* We're only interested in cases that require at least one
4469 : : vector iteration. */
4470 : 14 : int min_vec_niters = 1;
4471 : 14 : if (outside_overhead > 0)
4472 : 11 : min_vec_niters = outside_overhead / saving_per_viter + 1;
4473 : :
4474 : 14 : if (dump_enabled_p ())
4475 : 6 : dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
4476 : : min_vec_niters);
4477 : :
4478 : 14 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4479 : : {
4480 : : /* Now that we know the minimum number of vector iterations,
4481 : : find the minimum niters for which the scalar cost is larger:
4482 : :
4483 : : SIC * niters > VIC * vniters + VOC - SOC
4484 : :
4485 : : We know that the minimum niters is no more than
4486 : : vniters * VF + NPEEL, but it might be (and often is) less
4487 : : than that if a partial vector iteration is cheaper than the
4488 : : equivalent scalar code. */
4489 : 14 : int threshold = (vec_inside_cost * min_vec_niters
4490 : 14 : + vec_outside_cost
4491 : 14 : - scalar_outside_cost);
4492 : 14 : if (threshold <= 0)
4493 : : min_profitable_iters = 1;
4494 : : else
4495 : 14 : min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4496 : : }
4497 : : else
4498 : : /* Convert the number of vector iterations into a number of
4499 : : scalar iterations. */
4500 : 0 : min_profitable_iters = (min_vec_niters * assumed_vf
4501 : 0 : + peel_iters_prologue
4502 : : + peel_iters_epilogue);
4503 : : }
4504 : : else
4505 : : {
4506 : 57529 : min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4507 : 57529 : * assumed_vf
4508 : 57529 : - vec_inside_cost * peel_iters_prologue
4509 : 57529 : - vec_inside_cost * peel_iters_epilogue);
4510 : 57529 : if (min_profitable_iters <= 0)
4511 : : min_profitable_iters = 0;
4512 : : else
4513 : : {
4514 : 48462 : min_profitable_iters /= saving_per_viter;
4515 : :
4516 : 48462 : if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4517 : 48462 : <= (((int) vec_inside_cost * min_profitable_iters)
4518 : 48462 : + (((int) vec_outside_cost - scalar_outside_cost)
4519 : : * assumed_vf)))
4520 : 48462 : min_profitable_iters++;
4521 : : }
4522 : : }
4523 : :
4524 : 57543 : if (dump_enabled_p ())
4525 : 605 : dump_printf (MSG_NOTE,
4526 : : " Calculated minimum iters for profitability: %d\n",
4527 : : min_profitable_iters);
4528 : :
4529 : 57543 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4530 : 57529 : && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4531 : : /* We want the vectorized loop to execute at least once. */
4532 : : min_profitable_iters = assumed_vf + peel_iters_prologue;
4533 : 10537 : else if (min_profitable_iters < peel_iters_prologue)
4534 : : /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4535 : : vectorized loop executes at least once. */
4536 : : min_profitable_iters = peel_iters_prologue;
4537 : :
4538 : 57543 : if (dump_enabled_p ())
4539 : 605 : dump_printf_loc (MSG_NOTE, vect_location,
4540 : : " Runtime profitability threshold = %d\n",
4541 : : min_profitable_iters);
4542 : :
4543 : 57543 : *ret_min_profitable_niters = min_profitable_iters;
4544 : :
4545 : : /* Calculate number of iterations required to make the vector version
4546 : : profitable, relative to the loop bodies only.
4547 : :
4548 : : Non-vectorized variant is SIC * niters and it must win over vector
4549 : : variant on the expected loop trip count. The following condition must hold true:
4550 : : SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
4551 : :
4552 : 57543 : if (vec_outside_cost <= 0)
4553 : : min_profitable_estimate = 0;
4554 : : /* ??? This "else if" arm is written to handle all cases; see below for
4555 : : what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4556 : 52188 : else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4557 : : {
4558 : : /* This is a repeat of the code above, but with + SOC rather
4559 : : than - SOC. */
4560 : 14 : int outside_overhead = (vec_outside_cost
4561 : 14 : - scalar_single_iter_cost * peel_iters_prologue
4562 : 14 : - scalar_single_iter_cost * peel_iters_epilogue
4563 : : + scalar_outside_cost);
4564 : 14 : int min_vec_niters = 1;
4565 : 14 : if (outside_overhead > 0)
4566 : 14 : min_vec_niters = outside_overhead / saving_per_viter + 1;
4567 : :
4568 : 14 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4569 : : {
4570 : 14 : int threshold = (vec_inside_cost * min_vec_niters
4571 : 14 : + vec_outside_cost
4572 : 14 : + scalar_outside_cost);
4573 : 14 : min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4574 : : }
4575 : : else
4576 : : min_profitable_estimate = (min_vec_niters * assumed_vf
4577 : : + peel_iters_prologue
4578 : : + peel_iters_epilogue);
4579 : : }
4580 : : else
4581 : : {
4582 : 52174 : min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4583 : 52174 : * assumed_vf
4584 : 52174 : - vec_inside_cost * peel_iters_prologue
4585 : 52174 : - vec_inside_cost * peel_iters_epilogue)
4586 : 52174 : / ((scalar_single_iter_cost * assumed_vf)
4587 : : - vec_inside_cost);
4588 : : }
4589 : 57543 : min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4590 : 57543 : if (dump_enabled_p ())
4591 : 605 : dump_printf_loc (MSG_NOTE, vect_location,
4592 : : " Static estimate profitability threshold = %d\n",
4593 : : min_profitable_estimate);
4594 : :
4595 : 57543 : *ret_min_profitable_estimate = min_profitable_estimate;
4596 : : }
4597 : :
4598 : : /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4599 : : vector elements (not bits) for a vector with NELT elements. */
4600 : : static void
4601 : 2144 : calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4602 : : vec_perm_builder *sel)
4603 : : {
4604 : : /* The encoding is a single stepped pattern. Any wrap-around is handled
4605 : : by vec_perm_indices. */
4606 : 2144 : sel->new_vector (nelt, 1, 3);
4607 : 8576 : for (unsigned int i = 0; i < 3; i++)
4608 : 6432 : sel->quick_push (i + offset);
4609 : 2144 : }
4610 : :
4611 : : /* Checks whether the target supports whole-vector shifts for vectors of mode
4612 : : MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4613 : : it supports vec_perm_const with masks for all necessary shift amounts. */
4614 : : static bool
4615 : 7513 : have_whole_vector_shift (machine_mode mode)
4616 : : {
4617 : 7513 : if (can_implement_p (vec_shr_optab, mode))
4618 : : return true;
4619 : :
4620 : : /* Variable-length vectors should be handled via the optab. */
4621 : 61 : unsigned int nelt;
4622 : 122 : if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4623 : : return false;
4624 : :
4625 : 61 : vec_perm_builder sel;
4626 : 61 : vec_perm_indices indices;
4627 : 307 : for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4628 : : {
4629 : 246 : calc_vec_perm_mask_for_shift (i, nelt, &sel);
4630 : 246 : indices.new_vector (sel, 2, nelt);
4631 : 246 : if (!can_vec_perm_const_p (mode, mode, indices, false))
4632 : : return false;
4633 : : }
4634 : : return true;
4635 : 61 : }
4636 : :
4637 : : /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
4638 : : multiplication operands have differing signs and (b) we intend
4639 : : to emulate the operation using a series of signed DOT_PROD_EXPRs.
4640 : : See vect_emulate_mixed_dot_prod for the actual sequence used. */
4641 : :
4642 : : static bool
4643 : 2178 : vect_is_emulated_mixed_dot_prod (slp_tree slp_node)
4644 : : {
4645 : 2178 : stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (slp_node);
4646 : 2178 : gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
4647 : 1725 : if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
4648 : : return false;
4649 : :
4650 : 580 : tree rhs1 = gimple_assign_rhs1 (assign);
4651 : 580 : tree rhs2 = gimple_assign_rhs2 (assign);
4652 : 580 : if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
4653 : : return false;
4654 : :
4655 : 435 : return !directly_supported_p (DOT_PROD_EXPR,
4656 : : SLP_TREE_VECTYPE (slp_node),
4657 : 145 : SLP_TREE_VECTYPE
4658 : : (SLP_TREE_CHILDREN (slp_node)[0]),
4659 : 145 : optab_vector_mixed_sign);
4660 : : }
4661 : :
4662 : : /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4663 : : functions. Design better to avoid maintenance issues. */
4664 : :
4665 : : /* Function vect_model_reduction_cost.
4666 : :
4667 : : Models cost for a reduction operation, including the vector ops
4668 : : generated within the strip-mine loop in some cases, the initial
4669 : : definition before the loop, and the epilogue code that must be generated. */
4670 : :
4671 : : static void
4672 : 46244 : vect_model_reduction_cost (loop_vec_info loop_vinfo,
4673 : : slp_tree node, internal_fn reduc_fn,
4674 : : vect_reduction_type reduction_type,
4675 : : int ncopies, stmt_vector_for_cost *cost_vec)
4676 : : {
4677 : 46244 : int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
4678 : 46244 : tree vectype;
4679 : 46244 : machine_mode mode;
4680 : 46244 : class loop *loop = NULL;
4681 : :
4682 : 46244 : if (loop_vinfo)
4683 : 46244 : loop = LOOP_VINFO_LOOP (loop_vinfo);
4684 : :
4685 : : /* Condition reductions generate two reductions in the loop. */
4686 : 46244 : if (reduction_type == COND_REDUCTION)
4687 : 279 : ncopies *= 2;
4688 : :
4689 : 46244 : vectype = SLP_TREE_VECTYPE (node);
4690 : 46244 : mode = TYPE_MODE (vectype);
4691 : 46244 : stmt_vec_info orig_stmt_info
4692 : 46244 : = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
4693 : :
4694 : 46244 : gimple_match_op op;
4695 : 46244 : if (!gimple_extract_op (orig_stmt_info->stmt, &op))
4696 : 0 : gcc_unreachable ();
4697 : :
4698 : 46244 : if (reduction_type == EXTRACT_LAST_REDUCTION)
4699 : : /* No extra instructions are needed in the prologue. The loop body
4700 : : operations are costed in vectorizable_condition. */
4701 : : inside_cost = 0;
4702 : 46244 : else if (reduction_type == FOLD_LEFT_REDUCTION)
4703 : : {
4704 : : /* No extra instructions needed in the prologue. */
4705 : 3927 : prologue_cost = 0;
4706 : :
4707 : 3927 : if (reduc_fn != IFN_LAST)
4708 : : /* Count one reduction-like operation per vector. */
4709 : 0 : inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4710 : : node, 0, vect_body);
4711 : : else
4712 : : {
4713 : : /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4714 : 3927 : unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4715 : 3927 : inside_cost = record_stmt_cost (cost_vec, nelements,
4716 : : vec_to_scalar, node, 0,
4717 : : vect_body);
4718 : 3927 : inside_cost += record_stmt_cost (cost_vec, nelements,
4719 : : scalar_stmt, node, 0,
4720 : : vect_body);
4721 : : }
4722 : : }
4723 : : else
4724 : : {
4725 : : /* Add in the cost of the initial definitions. */
4726 : 42317 : int prologue_stmts;
4727 : 42317 : if (reduction_type == COND_REDUCTION)
4728 : : /* For cond reductions we have four vectors: initial index, step,
4729 : : initial result of the data reduction, initial value of the index
4730 : : reduction. */
4731 : : prologue_stmts = 4;
4732 : : else
4733 : : /* We need the initial reduction value. */
4734 : 42038 : prologue_stmts = 1;
4735 : 42317 : prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4736 : : scalar_to_vec, node, 0,
4737 : : vect_prologue);
4738 : : }
4739 : :
4740 : : /* Determine cost of epilogue code.
4741 : :
4742 : : We have a reduction operator that will reduce the vector in one statement.
4743 : : Also requires scalar extract. */
4744 : :
4745 : 46244 : if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4746 : : {
4747 : 46106 : if (reduc_fn != IFN_LAST)
4748 : : {
4749 : 34862 : if (reduction_type == COND_REDUCTION)
4750 : : {
4751 : : /* An EQ stmt and an COND_EXPR stmt. */
4752 : 8 : epilogue_cost += record_stmt_cost (cost_vec, 2,
4753 : : vector_stmt, node, 0,
4754 : : vect_epilogue);
4755 : : /* Reduction of the max index and a reduction of the found
4756 : : values. */
4757 : 8 : epilogue_cost += record_stmt_cost (cost_vec, 2,
4758 : : vec_to_scalar, node, 0,
4759 : : vect_epilogue);
4760 : : /* A broadcast of the max value. */
4761 : 8 : epilogue_cost += record_stmt_cost (cost_vec, 1,
4762 : : scalar_to_vec, node, 0,
4763 : : vect_epilogue);
4764 : : }
4765 : : else
4766 : : {
4767 : 34854 : epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4768 : : node, 0, vect_epilogue);
4769 : 34854 : epilogue_cost += record_stmt_cost (cost_vec, 1,
4770 : : vec_to_scalar, node, 0,
4771 : : vect_epilogue);
4772 : : }
4773 : : }
4774 : 11244 : else if (reduction_type == COND_REDUCTION)
4775 : : {
4776 : 271 : unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4777 : : /* Extraction of scalar elements. */
4778 : 542 : epilogue_cost += record_stmt_cost (cost_vec,
4779 : 271 : 2 * estimated_nunits,
4780 : : vec_to_scalar, node, 0,
4781 : : vect_epilogue);
4782 : : /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4783 : 271 : epilogue_cost += record_stmt_cost (cost_vec,
4784 : 271 : 2 * estimated_nunits - 3,
4785 : : scalar_stmt, node, 0,
4786 : : vect_epilogue);
4787 : : }
4788 : 10973 : else if (reduction_type == EXTRACT_LAST_REDUCTION
4789 : 10973 : || reduction_type == FOLD_LEFT_REDUCTION)
4790 : : /* No extra instructions need in the epilogue. */
4791 : : ;
4792 : : else
4793 : : {
4794 : 7046 : int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4795 : 7046 : tree bitsize = TYPE_SIZE (op.type);
4796 : 7046 : int element_bitsize = tree_to_uhwi (bitsize);
4797 : 7046 : int nelements = vec_size_in_bits / element_bitsize;
4798 : :
4799 : 7046 : if (op.code == COND_EXPR)
4800 : 28 : op.code = MAX_EXPR;
4801 : :
4802 : : /* We have a whole vector shift available. */
4803 : 968 : if (VECTOR_MODE_P (mode)
4804 : 7046 : && directly_supported_p (op.code, vectype)
4805 : 12713 : && have_whole_vector_shift (mode))
4806 : : {
4807 : : /* Final reduction via vector shifts and the reduction operator.
4808 : : Also requires scalar extract. */
4809 : 17001 : epilogue_cost += record_stmt_cost (cost_vec,
4810 : 11334 : exact_log2 (nelements) * 2,
4811 : : vector_stmt, node, 0,
4812 : : vect_epilogue);
4813 : 5667 : epilogue_cost += record_stmt_cost (cost_vec, 1,
4814 : : vec_to_scalar, node, 0,
4815 : : vect_epilogue);
4816 : : }
4817 : : else
4818 : : /* Use extracts and reduction op for final reduction. For N
4819 : : elements, we have N extracts and N-1 reduction ops. */
4820 : 1379 : epilogue_cost += record_stmt_cost (cost_vec,
4821 : 1379 : nelements + nelements - 1,
4822 : : vector_stmt, node, 0,
4823 : : vect_epilogue);
4824 : : }
4825 : : }
4826 : :
4827 : 46244 : if (dump_enabled_p ())
4828 : 2687 : dump_printf (MSG_NOTE,
4829 : : "vect_model_reduction_cost: inside_cost = %d, "
4830 : : "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4831 : : prologue_cost, epilogue_cost);
4832 : 46244 : }
4833 : :
4834 : : /* SEQ is a sequence of instructions that initialize the reduction
4835 : : described by REDUC_INFO. Emit them in the appropriate place. */
4836 : :
4837 : : static void
4838 : 440 : vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
4839 : : vect_reduc_info reduc_info, gimple *seq)
4840 : : {
4841 : 440 : if (VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info))
4842 : : {
4843 : : /* When reusing an accumulator from the main loop, we only need
4844 : : initialization instructions if the main loop can be skipped.
4845 : : In that case, emit the initialization instructions at the end
4846 : : of the guard block that does the skip. */
4847 : 25 : edge skip_edge = loop_vinfo->skip_main_loop_edge;
4848 : 25 : gcc_assert (skip_edge);
4849 : 25 : gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
4850 : 25 : gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
4851 : : }
4852 : : else
4853 : : {
4854 : : /* The normal case: emit the initialization instructions on the
4855 : : preheader edge. */
4856 : 415 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4857 : 415 : gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
4858 : : }
4859 : 440 : }
4860 : :
4861 : : /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
4862 : : which performs a reduction involving GROUP_SIZE scalar statements.
4863 : : NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
4864 : : is nonnull, introducing extra elements of that value will not change the
4865 : : result. */
4866 : :
4867 : : static void
4868 : 21661 : get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
4869 : : vect_reduc_info reduc_info,
4870 : : tree vector_type,
4871 : : vec<tree> *vec_oprnds,
4872 : : unsigned int number_of_vectors,
4873 : : unsigned int group_size, tree neutral_op)
4874 : : {
4875 : 21661 : vec<tree> &initial_values = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info);
4876 : 21661 : unsigned HOST_WIDE_INT nunits;
4877 : 21661 : unsigned j, number_of_places_left_in_vector;
4878 : 21661 : unsigned int i;
4879 : :
4880 : 43322 : gcc_assert (group_size == initial_values.length () || neutral_op);
4881 : :
4882 : : /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4883 : : created vectors. It is greater than 1 if unrolling is performed.
4884 : :
4885 : : For example, we have two scalar operands, s1 and s2 (e.g., group of
4886 : : strided accesses of size two), while NUNITS is four (i.e., four scalars
4887 : : of this type can be packed in a vector). The output vector will contain
4888 : : two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4889 : : will be 2).
4890 : :
4891 : : If GROUP_SIZE > NUNITS, the scalars will be split into several
4892 : : vectors containing the operands.
4893 : :
4894 : : For example, NUNITS is four as before, and the group size is 8
4895 : : (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4896 : : {s5, s6, s7, s8}. */
4897 : :
4898 : 21661 : if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4899 : : nunits = group_size;
4900 : :
4901 : 21661 : tree vector_elt_type = TREE_TYPE (vector_type);
4902 : 21661 : number_of_places_left_in_vector = nunits;
4903 : 21661 : bool constant_p = true;
4904 : 21661 : tree_vector_builder elts (vector_type, nunits, 1);
4905 : 21661 : elts.quick_grow (nunits);
4906 : 21661 : gimple_seq ctor_seq = NULL;
4907 : 21661 : if (neutral_op
4908 : 43235 : && !useless_type_conversion_p (vector_elt_type,
4909 : 21574 : TREE_TYPE (neutral_op)))
4910 : 1 : neutral_op = gimple_convert (&ctor_seq, vector_elt_type, neutral_op);
4911 : 208769 : for (j = 0; j < nunits * number_of_vectors; ++j)
4912 : : {
4913 : 187108 : tree op;
4914 : 187108 : i = j % group_size;
4915 : :
4916 : : /* Get the def before the loop. In reduction chain we have only
4917 : : one initial value. Else we have as many as PHIs in the group. */
4918 : 187108 : if (i >= initial_values.length () || (j > i && neutral_op))
4919 : : op = neutral_op;
4920 : : else
4921 : : {
4922 : 44964 : if (!useless_type_conversion_p (vector_elt_type,
4923 : 22482 : TREE_TYPE (initial_values[i])))
4924 : : {
4925 : 140 : if (VECTOR_BOOLEAN_TYPE_P (vector_type))
4926 : 236 : initial_values[i] = gimple_build (&ctor_seq, COND_EXPR,
4927 : : vector_elt_type,
4928 : 118 : initial_values[i],
4929 : : build_all_ones_cst
4930 : : (vector_elt_type),
4931 : : build_zero_cst
4932 : : (vector_elt_type));
4933 : : else
4934 : 44 : initial_values[i] = gimple_convert (&ctor_seq,
4935 : : vector_elt_type,
4936 : 22 : initial_values[i]);
4937 : : }
4938 : 22482 : op = initial_values[i];
4939 : : }
4940 : :
4941 : : /* Create 'vect_ = {op0,op1,...,opn}'. */
4942 : 187108 : number_of_places_left_in_vector--;
4943 : 187108 : elts[nunits - number_of_places_left_in_vector - 1] = op;
4944 : 187108 : if (!CONSTANT_CLASS_P (op))
4945 : 2327 : constant_p = false;
4946 : :
4947 : 187108 : if (number_of_places_left_in_vector == 0)
4948 : : {
4949 : 23118 : tree init;
4950 : 46236 : if (constant_p && !neutral_op
4951 : 46178 : ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4952 : 23118 : : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4953 : : /* Build the vector directly from ELTS. */
4954 : 23118 : init = gimple_build_vector (&ctor_seq, &elts);
4955 : 0 : else if (neutral_op)
4956 : : {
4957 : : /* Build a vector of the neutral value and shift the
4958 : : other elements into place. */
4959 : 0 : init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4960 : : neutral_op);
4961 : 0 : int k = nunits;
4962 : 0 : while (k > 0 && operand_equal_p (elts[k - 1], neutral_op))
4963 : : k -= 1;
4964 : 0 : while (k > 0)
4965 : : {
4966 : 0 : k -= 1;
4967 : 0 : init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4968 : 0 : vector_type, init, elts[k]);
4969 : : }
4970 : : }
4971 : : else
4972 : : {
4973 : : /* First time round, duplicate ELTS to fill the
4974 : : required number of vectors. */
4975 : 0 : duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
4976 : : elts, number_of_vectors, *vec_oprnds);
4977 : 0 : break;
4978 : : }
4979 : 23118 : vec_oprnds->quick_push (init);
4980 : :
4981 : 23118 : number_of_places_left_in_vector = nunits;
4982 : 23118 : elts.new_vector (vector_type, nunits, 1);
4983 : 23118 : elts.quick_grow (nunits);
4984 : 23118 : constant_p = true;
4985 : : }
4986 : : }
4987 : 21661 : if (ctor_seq != NULL)
4988 : 440 : vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
4989 : 21661 : }
4990 : :
4991 : : vect_reduc_info
4992 : 131636 : info_for_reduction (loop_vec_info loop_vinfo, slp_tree node)
4993 : : {
4994 : 131636 : if (node->cycle_info.id == -1)
4995 : : return NULL;
4996 : 129846 : return loop_vinfo->reduc_infos[node->cycle_info.id];
4997 : : }
4998 : :
4999 : : /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5000 : : REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
5001 : : return false. */
5002 : :
5003 : : static bool
5004 : 21297 : vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5005 : : vect_reduc_info reduc_info, tree vectype)
5006 : : {
5007 : 21297 : loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5008 : 21297 : if (!main_loop_vinfo)
5009 : : return false;
5010 : :
5011 : 4826 : if (VECT_REDUC_INFO_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5012 : : return false;
5013 : :
5014 : : /* We are not set up to handle vector bools when they are not mapped
5015 : : to vector integer data types. */
5016 : 4811 : if (VECTOR_BOOLEAN_TYPE_P (vectype)
5017 : 4881 : && GET_MODE_CLASS (TYPE_MODE (vectype)) != MODE_VECTOR_INT)
5018 : : return false;
5019 : :
5020 : 4809 : unsigned int num_phis = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info).length ();
5021 : 4809 : auto_vec<tree, 16> main_loop_results (num_phis);
5022 : 4809 : auto_vec<tree, 16> initial_values (num_phis);
5023 : 4809 : if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5024 : : {
5025 : : /* The epilogue loop can be entered either from the main loop or
5026 : : from an earlier guard block. */
5027 : 4594 : edge skip_edge = loop_vinfo->skip_main_loop_edge;
5028 : 18400 : for (tree incoming_value : VECT_REDUC_INFO_INITIAL_VALUES (reduc_info))
5029 : : {
5030 : : /* Look for:
5031 : :
5032 : : INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5033 : : INITIAL_VALUE(guard block)>. */
5034 : 4618 : gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5035 : :
5036 : 4618 : gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5037 : 4618 : gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5038 : :
5039 : 4618 : tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5040 : 4618 : tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5041 : :
5042 : 4618 : main_loop_results.quick_push (from_main_loop);
5043 : 4618 : initial_values.quick_push (from_skip);
5044 : : }
5045 : : }
5046 : : else
5047 : : /* The main loop dominates the epilogue loop. */
5048 : 215 : main_loop_results.splice (VECT_REDUC_INFO_INITIAL_VALUES (reduc_info));
5049 : :
5050 : : /* See if the main loop has the kind of accumulator we need. */
5051 : 4809 : vect_reusable_accumulator *accumulator
5052 : 4809 : = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5053 : 4809 : if (!accumulator
5054 : 9602 : || num_phis != VECT_REDUC_INFO_SCALAR_RESULTS (accumulator->reduc_info).length ()
5055 : 14407 : || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5056 : : VECT_REDUC_INFO_SCALAR_RESULTS (accumulator->reduc_info).begin ()))
5057 : : return false;
5058 : :
5059 : : /* Handle the case where we can reduce wider vectors to narrower ones. */
5060 : 4799 : tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5061 : 4799 : unsigned HOST_WIDE_INT m;
5062 : 4799 : if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5063 : 4799 : TYPE_VECTOR_SUBPARTS (vectype), &m))
5064 : 0 : return false;
5065 : : /* Check the intermediate vector types and operations are available. */
5066 : 4799 : tree prev_vectype = old_vectype;
5067 : 4799 : poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5068 : 13874 : while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5069 : : {
5070 : 4798 : intermediate_nunits = exact_div (intermediate_nunits, 2);
5071 : 4798 : tree intermediate_vectype = get_related_vectype_for_scalar_type
5072 : 4798 : (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5073 : 4798 : if (!intermediate_vectype
5074 : 4798 : || !directly_supported_p (VECT_REDUC_INFO_CODE (reduc_info),
5075 : : intermediate_vectype)
5076 : 9076 : || !can_vec_extract (TYPE_MODE (prev_vectype),
5077 : 4278 : TYPE_MODE (intermediate_vectype)))
5078 : : return false;
5079 : : prev_vectype = intermediate_vectype;
5080 : : }
5081 : :
5082 : : /* Non-SLP reductions might apply an adjustment after the reduction
5083 : : operation, in order to simplify the initialization of the accumulator.
5084 : : If the epilogue loop carries on from where the main loop left off,
5085 : : it should apply the same adjustment to the final reduction result.
5086 : :
5087 : : If the epilogue loop can also be entered directly (rather than via
5088 : : the main loop), we need to be able to handle that case in the same way,
5089 : : with the same adjustment. (In principle we could add a PHI node
5090 : : to select the correct adjustment, but in practice that shouldn't be
5091 : : necessary.) */
5092 : 4277 : tree main_adjustment
5093 : 4277 : = VECT_REDUC_INFO_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5094 : 4277 : if (loop_vinfo->main_loop_edge && main_adjustment)
5095 : : {
5096 : 3638 : gcc_assert (num_phis == 1);
5097 : 3638 : tree initial_value = initial_values[0];
5098 : : /* Check that we can use INITIAL_VALUE as the adjustment and
5099 : : initialize the accumulator with a neutral value instead. */
5100 : 3638 : if (!operand_equal_p (initial_value, main_adjustment))
5101 : 106 : return false;
5102 : 3532 : code_helper code = VECT_REDUC_INFO_CODE (reduc_info);
5103 : 3532 : initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5104 : : code, initial_value);
5105 : : }
5106 : 4171 : VECT_REDUC_INFO_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5107 : 4171 : VECT_REDUC_INFO_INITIAL_VALUES (reduc_info).truncate (0);
5108 : 4171 : VECT_REDUC_INFO_INITIAL_VALUES (reduc_info).splice (initial_values);
5109 : 4171 : VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info) = accumulator;
5110 : 4171 : return true;
5111 : 4809 : }
5112 : :
5113 : : /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5114 : : CODE emitting stmts before GSI. Returns a vector def of VECTYPE. */
5115 : :
5116 : : static tree
5117 : 4215 : vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5118 : : gimple_seq *seq)
5119 : : {
5120 : 4215 : gcc_assert (!VECTOR_BOOLEAN_TYPE_P (TREE_TYPE (vec_def))
5121 : : || (GET_MODE_CLASS (TYPE_MODE (TREE_TYPE (vec_def)))
5122 : : == MODE_VECTOR_INT));
5123 : 4215 : unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5124 : 4215 : unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5125 : 4215 : tree stype = TREE_TYPE (vectype);
5126 : 4215 : tree new_temp = vec_def;
5127 : 8422 : while (nunits > nunits1)
5128 : : {
5129 : 4207 : nunits /= 2;
5130 : 4207 : tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5131 : 4207 : stype, nunits);
5132 : 4207 : unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5133 : :
5134 : : /* The target has to make sure we support lowpart/highpart
5135 : : extraction, either via direct vector extract or through
5136 : : an integer mode punning. */
5137 : 4207 : tree dst1, dst2;
5138 : 4207 : gimple *epilog_stmt;
5139 : 4207 : if (convert_optab_handler (vec_extract_optab,
5140 : 4207 : TYPE_MODE (TREE_TYPE (new_temp)),
5141 : 4207 : TYPE_MODE (vectype1))
5142 : : != CODE_FOR_nothing)
5143 : : {
5144 : : /* Extract sub-vectors directly once vec_extract becomes
5145 : : a conversion optab. */
5146 : 2684 : dst1 = make_ssa_name (vectype1);
5147 : 2684 : epilog_stmt
5148 : 5368 : = gimple_build_assign (dst1, BIT_FIELD_REF,
5149 : : build3 (BIT_FIELD_REF, vectype1,
5150 : 2684 : new_temp, TYPE_SIZE (vectype1),
5151 : : bitsize_int (0)));
5152 : 2684 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5153 : 2684 : dst2 = make_ssa_name (vectype1);
5154 : 2684 : epilog_stmt
5155 : 2684 : = gimple_build_assign (dst2, BIT_FIELD_REF,
5156 : : build3 (BIT_FIELD_REF, vectype1,
5157 : 2684 : new_temp, TYPE_SIZE (vectype1),
5158 : 2684 : bitsize_int (bitsize)));
5159 : 2684 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5160 : : }
5161 : : else
5162 : : {
5163 : : /* Extract via punning to appropriately sized integer mode
5164 : : vector. */
5165 : 1523 : tree eltype = build_nonstandard_integer_type (bitsize, 1);
5166 : 1523 : tree etype = build_vector_type (eltype, 2);
5167 : 3046 : gcc_assert (convert_optab_handler (vec_extract_optab,
5168 : : TYPE_MODE (etype),
5169 : : TYPE_MODE (eltype))
5170 : : != CODE_FOR_nothing);
5171 : 1523 : tree tem = make_ssa_name (etype);
5172 : 1523 : epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5173 : : build1 (VIEW_CONVERT_EXPR,
5174 : : etype, new_temp));
5175 : 1523 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5176 : 1523 : new_temp = tem;
5177 : 1523 : tem = make_ssa_name (eltype);
5178 : 1523 : epilog_stmt
5179 : 3046 : = gimple_build_assign (tem, BIT_FIELD_REF,
5180 : : build3 (BIT_FIELD_REF, eltype,
5181 : 1523 : new_temp, TYPE_SIZE (eltype),
5182 : : bitsize_int (0)));
5183 : 1523 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5184 : 1523 : dst1 = make_ssa_name (vectype1);
5185 : 1523 : epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5186 : : build1 (VIEW_CONVERT_EXPR,
5187 : : vectype1, tem));
5188 : 1523 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5189 : 1523 : tem = make_ssa_name (eltype);
5190 : 1523 : epilog_stmt
5191 : 1523 : = gimple_build_assign (tem, BIT_FIELD_REF,
5192 : : build3 (BIT_FIELD_REF, eltype,
5193 : 1523 : new_temp, TYPE_SIZE (eltype),
5194 : 1523 : bitsize_int (bitsize)));
5195 : 1523 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5196 : 1523 : dst2 = make_ssa_name (vectype1);
5197 : 1523 : epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5198 : : build1 (VIEW_CONVERT_EXPR,
5199 : : vectype1, tem));
5200 : 1523 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5201 : : }
5202 : :
5203 : 4207 : new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5204 : : }
5205 : 4215 : if (!useless_type_conversion_p (vectype, TREE_TYPE (new_temp)))
5206 : : {
5207 : 66 : tree dst3 = make_ssa_name (vectype);
5208 : 66 : gimple *epilog_stmt = gimple_build_assign (dst3, VIEW_CONVERT_EXPR,
5209 : : build1 (VIEW_CONVERT_EXPR,
5210 : : vectype, new_temp));
5211 : 66 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5212 : 66 : new_temp = dst3;
5213 : : }
5214 : :
5215 : 4215 : return new_temp;
5216 : : }
5217 : :
5218 : : /* Function vect_create_epilog_for_reduction
5219 : :
5220 : : Create code at the loop-epilog to finalize the result of a reduction
5221 : : computation.
5222 : :
5223 : : STMT_INFO is the scalar reduction stmt that is being vectorized.
5224 : : SLP_NODE is an SLP node containing a group of reduction statements. The
5225 : : first one in this group is STMT_INFO.
5226 : : SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5227 : : REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5228 : : (counting from 0)
5229 : : LOOP_EXIT is the edge to update in the merge block. In the case of a single
5230 : : exit this edge is always the main loop exit.
5231 : :
5232 : : This function:
5233 : : 1. Completes the reduction def-use cycles.
5234 : : 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5235 : : by calling the function specified by REDUC_FN if available, or by
5236 : : other means (whole-vector shifts or a scalar loop).
5237 : : The function also creates a new phi node at the loop exit to preserve
5238 : : loop-closed form, as illustrated below.
5239 : :
5240 : : The flow at the entry to this function:
5241 : :
5242 : : loop:
5243 : : vec_def = phi <vec_init, null> # REDUCTION_PHI
5244 : : VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5245 : : s_loop = scalar_stmt # (scalar) STMT_INFO
5246 : : loop_exit:
5247 : : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5248 : : use <s_out0>
5249 : : use <s_out0>
5250 : :
5251 : : The above is transformed by this function into:
5252 : :
5253 : : loop:
5254 : : vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5255 : : VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5256 : : s_loop = scalar_stmt # (scalar) STMT_INFO
5257 : : loop_exit:
5258 : : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5259 : : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5260 : : v_out2 = reduce <v_out1>
5261 : : s_out3 = extract_field <v_out2, 0>
5262 : : s_out4 = adjust_result <s_out3>
5263 : : use <s_out4>
5264 : : use <s_out4>
5265 : : */
5266 : :
5267 : : static void
5268 : 22000 : vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5269 : : stmt_vec_info stmt_info,
5270 : : slp_tree slp_node,
5271 : : slp_instance slp_node_instance,
5272 : : edge loop_exit)
5273 : : {
5274 : 22000 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
5275 : 22000 : code_helper code = VECT_REDUC_INFO_CODE (reduc_info);
5276 : 22000 : internal_fn reduc_fn = VECT_REDUC_INFO_FN (reduc_info);
5277 : 22000 : tree vectype;
5278 : 22000 : machine_mode mode;
5279 : 22000 : basic_block exit_bb;
5280 : 22000 : gimple *new_phi = NULL, *phi = NULL;
5281 : 22000 : gimple_stmt_iterator exit_gsi;
5282 : 22000 : tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5283 : 22000 : gimple *epilog_stmt = NULL;
5284 : 22000 : gimple *exit_phi;
5285 : 22000 : tree def;
5286 : 22000 : tree orig_name, scalar_result;
5287 : 22000 : imm_use_iterator imm_iter;
5288 : 22000 : use_operand_p use_p;
5289 : 22000 : gimple *use_stmt;
5290 : 22000 : auto_vec<tree> reduc_inputs;
5291 : 22000 : int j, i;
5292 : 22000 : vec<tree> &scalar_results = VECT_REDUC_INFO_SCALAR_RESULTS (reduc_info);
5293 : 22000 : unsigned int k;
5294 : : /* SLP reduction without reduction chain, e.g.,
5295 : : # a1 = phi <a2, a0>
5296 : : # b1 = phi <b2, b0>
5297 : : a2 = operation (a1)
5298 : : b2 = operation (b1) */
5299 : 22000 : const bool slp_reduc = !reduc_info->is_reduc_chain;
5300 : 22000 : tree induction_index = NULL_TREE;
5301 : :
5302 : 22000 : unsigned int group_size = SLP_TREE_LANES (slp_node);
5303 : :
5304 : 22000 : bool double_reduc = false;
5305 : 22000 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5306 : 22000 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5307 : : {
5308 : 0 : double_reduc = true;
5309 : 0 : gcc_assert (slp_reduc);
5310 : : }
5311 : :
5312 : 22000 : vectype = VECT_REDUC_INFO_VECTYPE (reduc_info);
5313 : 22000 : gcc_assert (vectype);
5314 : 22000 : mode = TYPE_MODE (vectype);
5315 : :
5316 : 22000 : tree induc_val = NULL_TREE;
5317 : 22000 : tree adjustment_def = NULL;
5318 : : /* Optimize: for induction condition reduction, if we can't use zero
5319 : : for induc_val, use initial_def. */
5320 : 22000 : if (VECT_REDUC_INFO_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5321 : 62 : induc_val = VECT_REDUC_INFO_INDUC_COND_INITIAL_VAL (reduc_info);
5322 : 21938 : else if (double_reduc)
5323 : : ;
5324 : : else
5325 : 21938 : adjustment_def = VECT_REDUC_INFO_EPILOGUE_ADJUSTMENT (reduc_info);
5326 : :
5327 : 22000 : stmt_vec_info single_live_out_stmt[] = { stmt_info };
5328 : 22000 : array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5329 : 22000 : if (slp_reduc)
5330 : : /* All statements produce live-out values. */
5331 : 43610 : live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5332 : :
5333 : 22000 : unsigned vec_num
5334 : 22000 : = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length ();
5335 : :
5336 : : /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5337 : : which is updated with the current index of the loop for every match of
5338 : : the original loop's cond_expr (VEC_STMT). This results in a vector
5339 : : containing the last time the condition passed for that vector lane.
5340 : : The first match will be a 1 to allow 0 to be used for non-matching
5341 : : indexes. If there are no matches at all then the vector will be all
5342 : : zeroes.
5343 : :
5344 : : PR92772: This algorithm is broken for architectures that support
5345 : : masked vectors, but do not provide fold_extract_last. */
5346 : 22000 : if (VECT_REDUC_INFO_TYPE (reduc_info) == COND_REDUCTION)
5347 : : {
5348 : 67 : gcc_assert (!double_reduc);
5349 : 67 : auto_vec<std::pair<tree, bool>, 2> ccompares;
5350 : 67 : slp_tree cond_node = slp_node_instance->root;
5351 : 143 : while (cond_node != slp_node_instance->reduc_phis)
5352 : : {
5353 : 76 : stmt_vec_info cond_info = SLP_TREE_REPRESENTATIVE (cond_node);
5354 : 76 : if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5355 : : {
5356 : 76 : gimple *vec_stmt
5357 : 76 : = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (cond_node)[0]);
5358 : 76 : gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5359 : 76 : ccompares.safe_push
5360 : 76 : (std::make_pair (gimple_assign_rhs1 (vec_stmt),
5361 : 76 : SLP_TREE_REDUC_IDX (cond_node) == 2));
5362 : : }
5363 : 76 : int slp_reduc_idx = SLP_TREE_REDUC_IDX (cond_node);
5364 : 76 : cond_node = SLP_TREE_CHILDREN (cond_node)[slp_reduc_idx];
5365 : : }
5366 : 67 : gcc_assert (ccompares.length () != 0);
5367 : :
5368 : 67 : tree indx_before_incr, indx_after_incr;
5369 : 67 : poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5370 : 67 : int scalar_precision
5371 : 67 : = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5372 : 67 : tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5373 : 67 : tree cr_index_vector_type = get_related_vectype_for_scalar_type
5374 : 67 : (TYPE_MODE (vectype), cr_index_scalar_type,
5375 : : TYPE_VECTOR_SUBPARTS (vectype));
5376 : :
5377 : : /* First we create a simple vector induction variable which starts
5378 : : with the values {1,2,3,...} (SERIES_VECT) and increments by the
5379 : : vector size (STEP). */
5380 : :
5381 : : /* Create a {1,2,3,...} vector. */
5382 : 67 : tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5383 : :
5384 : : /* Create a vector of the step value. */
5385 : 67 : tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5386 : 67 : tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5387 : :
5388 : : /* Create an induction variable. */
5389 : 67 : gimple_stmt_iterator incr_gsi;
5390 : 67 : bool insert_after;
5391 : 67 : vect_iv_increment_position (LOOP_VINFO_IV_EXIT (loop_vinfo),
5392 : : &incr_gsi, &insert_after);
5393 : 67 : create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
5394 : : insert_after, &indx_before_incr, &indx_after_incr);
5395 : :
5396 : : /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5397 : : filled with zeros (VEC_ZERO). */
5398 : :
5399 : : /* Create a vector of 0s. */
5400 : 67 : tree zero = build_zero_cst (cr_index_scalar_type);
5401 : 67 : tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5402 : :
5403 : : /* Create a vector phi node. */
5404 : 67 : tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5405 : 67 : new_phi = create_phi_node (new_phi_tree, loop->header);
5406 : 67 : add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5407 : : loop_preheader_edge (loop), UNKNOWN_LOCATION);
5408 : :
5409 : : /* Now take the condition from the loops original cond_exprs
5410 : : and produce a new cond_exprs (INDEX_COND_EXPR) which for
5411 : : every match uses values from the induction variable
5412 : : (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5413 : : (NEW_PHI_TREE).
5414 : : Finally, we update the phi (NEW_PHI_TREE) to take the value of
5415 : : the new cond_expr (INDEX_COND_EXPR). */
5416 : 67 : gimple_seq stmts = NULL;
5417 : 210 : for (int i = ccompares.length () - 1; i != -1; --i)
5418 : : {
5419 : 76 : tree ccompare = ccompares[i].first;
5420 : 76 : if (ccompares[i].second)
5421 : 69 : new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5422 : : cr_index_vector_type,
5423 : : ccompare,
5424 : : indx_before_incr, new_phi_tree);
5425 : : else
5426 : 7 : new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5427 : : cr_index_vector_type,
5428 : : ccompare,
5429 : : new_phi_tree, indx_before_incr);
5430 : : }
5431 : 67 : gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5432 : :
5433 : : /* Update the phi with the vec cond. */
5434 : 67 : induction_index = new_phi_tree;
5435 : 67 : add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5436 : : loop_latch_edge (loop), UNKNOWN_LOCATION);
5437 : 67 : }
5438 : :
5439 : : /* 2. Create epilog code.
5440 : : The reduction epilog code operates across the elements of the vector
5441 : : of partial results computed by the vectorized loop.
5442 : : The reduction epilog code consists of:
5443 : :
5444 : : step 1: compute the scalar result in a vector (v_out2)
5445 : : step 2: extract the scalar result (s_out3) from the vector (v_out2)
5446 : : step 3: adjust the scalar result (s_out3) if needed.
5447 : :
5448 : : Step 1 can be accomplished using one the following three schemes:
5449 : : (scheme 1) using reduc_fn, if available.
5450 : : (scheme 2) using whole-vector shifts, if available.
5451 : : (scheme 3) using a scalar loop. In this case steps 1+2 above are
5452 : : combined.
5453 : :
5454 : : The overall epilog code looks like this:
5455 : :
5456 : : s_out0 = phi <s_loop> # original EXIT_PHI
5457 : : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5458 : : v_out2 = reduce <v_out1> # step 1
5459 : : s_out3 = extract_field <v_out2, 0> # step 2
5460 : : s_out4 = adjust_result <s_out3> # step 3
5461 : :
5462 : : (step 3 is optional, and steps 1 and 2 may be combined).
5463 : : Lastly, the uses of s_out0 are replaced by s_out4. */
5464 : :
5465 : :
5466 : : /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5467 : : v_out1 = phi <VECT_DEF>
5468 : : Store them in NEW_PHIS. */
5469 : : /* We need to reduce values in all exits. */
5470 : 22000 : exit_bb = loop_exit->dest;
5471 : 22000 : exit_gsi = gsi_after_labels (exit_bb);
5472 : 22000 : reduc_inputs.create (vec_num);
5473 : 45467 : for (unsigned i = 0; i < vec_num; i++)
5474 : : {
5475 : 23467 : gimple_seq stmts = NULL;
5476 : 23467 : def = vect_get_slp_vect_def (slp_node, i);
5477 : 23467 : tree new_def = copy_ssa_name (def);
5478 : 23467 : phi = create_phi_node (new_def, exit_bb);
5479 : 23467 : if (LOOP_VINFO_IV_EXIT (loop_vinfo) == loop_exit)
5480 : 23440 : SET_PHI_ARG_DEF (phi, loop_exit->dest_idx, def);
5481 : : else
5482 : : {
5483 : 57 : for (unsigned k = 0; k < gimple_phi_num_args (phi); k++)
5484 : 30 : SET_PHI_ARG_DEF (phi, k, def);
5485 : : }
5486 : 23467 : new_def = gimple_convert (&stmts, vectype, new_def);
5487 : 23467 : reduc_inputs.quick_push (new_def);
5488 : 23467 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5489 : : }
5490 : :
5491 : : /* 2.2 Get the original scalar reduction variable as defined in the loop.
5492 : : In case STMT is a "pattern-stmt" (i.e. - it represents a reduction
5493 : : pattern), the scalar-def is taken from the original stmt that the
5494 : : pattern-stmt (STMT) replaces. */
5495 : :
5496 : 22817 : tree scalar_dest = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
5497 : 22000 : tree scalar_type = TREE_TYPE (scalar_dest);
5498 : 22000 : scalar_results.truncate (0);
5499 : 22000 : scalar_results.reserve_exact (group_size);
5500 : 22000 : new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5501 : :
5502 : : /* True if we should implement SLP_REDUC using native reduction operations
5503 : : instead of scalar operations. */
5504 : 22000 : const bool direct_slp_reduc
5505 : 22000 : = (reduc_fn != IFN_LAST
5506 : 22000 : && slp_reduc
5507 : 22000 : && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5508 : :
5509 : : /* If signed overflow is undefined we might need to perform reduction
5510 : : computations in an unsigned type. */
5511 : 22000 : tree compute_vectype = vectype;
5512 : 22000 : if (ANY_INTEGRAL_TYPE_P (vectype)
5513 : 14997 : && TYPE_OVERFLOW_UNDEFINED (vectype)
5514 : 5439 : && code.is_tree_code ()
5515 : 27439 : && arith_code_with_undefined_signed_overflow ((tree_code) code))
5516 : 3988 : compute_vectype = unsigned_type_for (vectype);
5517 : :
5518 : : /* In case of reduction chain, e.g.,
5519 : : # a1 = phi <a3, a0>
5520 : : a2 = operation (a1)
5521 : : a3 = operation (a2),
5522 : :
5523 : : we may end up with more than one vector result. Here we reduce them
5524 : : to one vector.
5525 : :
5526 : : The same is true for a SLP reduction, e.g.,
5527 : : # a1 = phi <a2, a0>
5528 : : # b1 = phi <b2, b0>
5529 : : a2 = operation (a1)
5530 : : b2 = operation (a2),
5531 : :
5532 : : where we can end up with more than one vector as well. We can
5533 : : easily accumulate vectors when the number of vector elements is
5534 : : a multiple of the SLP group size.
5535 : :
5536 : : The same is true if we couldn't use a single defuse cycle. */
5537 : 22000 : if ((!slp_reduc
5538 : : || direct_slp_reduc
5539 : : || (slp_reduc
5540 : 22000 : && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size)))
5541 : 44000 : && reduc_inputs.length () > 1)
5542 : : {
5543 : 537 : gimple_seq stmts = NULL;
5544 : 537 : tree single_input = reduc_inputs[0];
5545 : 537 : if (compute_vectype != vectype)
5546 : 152 : single_input = gimple_build (&stmts, VIEW_CONVERT_EXPR,
5547 : : compute_vectype, single_input);
5548 : 1839 : for (k = 1; k < reduc_inputs.length (); k++)
5549 : : {
5550 : 1302 : tree input = gimple_build (&stmts, VIEW_CONVERT_EXPR,
5551 : 1302 : compute_vectype, reduc_inputs[k]);
5552 : 1302 : single_input = gimple_build (&stmts, code, compute_vectype,
5553 : : single_input, input);
5554 : : }
5555 : 537 : if (compute_vectype != vectype)
5556 : 152 : single_input = gimple_build (&stmts, VIEW_CONVERT_EXPR,
5557 : : vectype, single_input);
5558 : 537 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5559 : :
5560 : 537 : reduc_inputs.truncate (0);
5561 : 537 : reduc_inputs.safe_push (single_input);
5562 : : }
5563 : :
5564 : 22000 : tree orig_reduc_input = reduc_inputs[0];
5565 : :
5566 : : /* If this loop is an epilogue loop that can be skipped after the
5567 : : main loop, we can only share a reduction operation between the
5568 : : main loop and the epilogue if we put it at the target of the
5569 : : skip edge.
5570 : :
5571 : : We can still reuse accumulators if this check fails. Doing so has
5572 : : the minor(?) benefit of making the epilogue loop's scalar result
5573 : : independent of the main loop's scalar result. */
5574 : 22000 : bool unify_with_main_loop_p = false;
5575 : 22000 : if (VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info)
5576 : 4171 : && loop_vinfo->skip_this_loop_edge
5577 : 3939 : && single_succ_p (exit_bb)
5578 : 22021 : && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
5579 : : {
5580 : 21 : unify_with_main_loop_p = true;
5581 : :
5582 : 21 : basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
5583 : 21 : reduc_inputs[0] = make_ssa_name (vectype);
5584 : 21 : gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
5585 : 21 : add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
5586 : : UNKNOWN_LOCATION);
5587 : 21 : add_phi_arg (new_phi,
5588 : 21 : VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info)->reduc_input,
5589 : : loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
5590 : 21 : exit_gsi = gsi_after_labels (reduc_block);
5591 : : }
5592 : :
5593 : : /* Shouldn't be used beyond this point. */
5594 : 22000 : exit_bb = nullptr;
5595 : :
5596 : : /* If we are operating on a mask vector and do not support direct mask
5597 : : reduction, work on a bool data vector instead of a mask vector. */
5598 : 22000 : if (VECTOR_BOOLEAN_TYPE_P (vectype)
5599 : 227 : && VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info)
5600 : 22192 : && vectype != VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info))
5601 : : {
5602 : 192 : compute_vectype = vectype = VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info);
5603 : 192 : gimple_seq stmts = NULL;
5604 : 392 : for (unsigned i = 0; i < reduc_inputs.length (); ++i)
5605 : 400 : reduc_inputs[i] = gimple_build (&stmts, VEC_COND_EXPR, vectype,
5606 : 200 : reduc_inputs[i],
5607 : : build_one_cst (vectype),
5608 : : build_zero_cst (vectype));
5609 : 192 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5610 : : }
5611 : :
5612 : 22000 : if (VECT_REDUC_INFO_TYPE (reduc_info) == COND_REDUCTION
5613 : 67 : && reduc_fn != IFN_LAST)
5614 : : {
5615 : : /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
5616 : : various data values where the condition matched and another vector
5617 : : (INDUCTION_INDEX) containing all the indexes of those matches. We
5618 : : need to extract the last matching index (which will be the index with
5619 : : highest value) and use this to index into the data vector.
5620 : : For the case where there were no matches, the data vector will contain
5621 : : all default values and the index vector will be all zeros. */
5622 : :
5623 : : /* Get various versions of the type of the vector of indexes. */
5624 : 4 : tree index_vec_type = TREE_TYPE (induction_index);
5625 : 4 : gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5626 : 4 : tree index_scalar_type = TREE_TYPE (index_vec_type);
5627 : 4 : tree index_vec_cmp_type = truth_type_for (index_vec_type);
5628 : :
5629 : : /* Get an unsigned integer version of the type of the data vector. */
5630 : 4 : int scalar_precision
5631 : 4 : = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5632 : 4 : tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5633 : 4 : tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
5634 : : vectype);
5635 : :
5636 : : /* First we need to create a vector (ZERO_VEC) of zeros and another
5637 : : vector (MAX_INDEX_VEC) filled with the last matching index, which we
5638 : : can create using a MAX reduction and then expanding.
5639 : : In the case where the loop never made any matches, the max index will
5640 : : be zero. */
5641 : :
5642 : : /* Vector of {0, 0, 0,...}. */
5643 : 4 : tree zero_vec = build_zero_cst (vectype);
5644 : :
5645 : : /* Find maximum value from the vector of found indexes. */
5646 : 4 : tree max_index = make_ssa_name (index_scalar_type);
5647 : 4 : gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5648 : : 1, induction_index);
5649 : 4 : gimple_call_set_lhs (max_index_stmt, max_index);
5650 : 4 : gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5651 : :
5652 : : /* Vector of {max_index, max_index, max_index,...}. */
5653 : 4 : tree max_index_vec = make_ssa_name (index_vec_type);
5654 : 4 : tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5655 : : max_index);
5656 : 4 : gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5657 : : max_index_vec_rhs);
5658 : 4 : gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5659 : :
5660 : : /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5661 : : with the vector (INDUCTION_INDEX) of found indexes, choosing values
5662 : : from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
5663 : : otherwise. Only one value should match, resulting in a vector
5664 : : (VEC_COND) with one data value and the rest zeros.
5665 : : In the case where the loop never made any matches, every index will
5666 : : match, resulting in a vector with all data values (which will all be
5667 : : the default value). */
5668 : :
5669 : : /* Compare the max index vector to the vector of found indexes to find
5670 : : the position of the max value. */
5671 : 4 : tree vec_compare = make_ssa_name (index_vec_cmp_type);
5672 : 4 : gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5673 : : induction_index,
5674 : : max_index_vec);
5675 : 4 : gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5676 : :
5677 : : /* Use the compare to choose either values from the data vector or
5678 : : zero. */
5679 : 4 : tree vec_cond = make_ssa_name (vectype);
5680 : 4 : gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5681 : : vec_compare,
5682 : 4 : reduc_inputs[0],
5683 : : zero_vec);
5684 : 4 : gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5685 : :
5686 : : /* Finally we need to extract the data value from the vector (VEC_COND)
5687 : : into a scalar (MATCHED_DATA_REDUC). Logically we want to do a OR
5688 : : reduction, but because this doesn't exist, we can use a MAX reduction
5689 : : instead. The data value might be signed or a float so we need to cast
5690 : : it first.
5691 : : In the case where the loop never made any matches, the data values are
5692 : : all identical, and so will reduce down correctly. */
5693 : :
5694 : : /* Make the matched data values unsigned. */
5695 : 4 : tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5696 : 4 : tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5697 : : vec_cond);
5698 : 4 : gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5699 : : VIEW_CONVERT_EXPR,
5700 : : vec_cond_cast_rhs);
5701 : 4 : gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5702 : :
5703 : : /* Reduce down to a scalar value. */
5704 : 4 : tree data_reduc = make_ssa_name (scalar_type_unsigned);
5705 : 4 : gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5706 : : 1, vec_cond_cast);
5707 : 4 : gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5708 : 4 : gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5709 : :
5710 : : /* Convert the reduced value back to the result type and set as the
5711 : : result. */
5712 : 4 : gimple_seq stmts = NULL;
5713 : 4 : new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5714 : : data_reduc);
5715 : 4 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5716 : 4 : scalar_results.safe_push (new_temp);
5717 : 4 : }
5718 : 21996 : else if (VECT_REDUC_INFO_TYPE (reduc_info) == COND_REDUCTION
5719 : 63 : && reduc_fn == IFN_LAST)
5720 : : {
5721 : : /* Condition reduction without supported IFN_REDUC_MAX. Generate
5722 : : idx = 0;
5723 : : idx_val = induction_index[0];
5724 : : val = data_reduc[0];
5725 : : for (idx = 0, val = init, i = 0; i < nelts; ++i)
5726 : : if (induction_index[i] > idx_val)
5727 : : val = data_reduc[i], idx_val = induction_index[i];
5728 : : return val; */
5729 : :
5730 : 63 : tree data_eltype = TREE_TYPE (vectype);
5731 : 63 : tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5732 : 63 : unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5733 : 63 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5734 : : /* Enforced by vectorizable_reduction, which ensures we have target
5735 : : support before allowing a conditional reduction on variable-length
5736 : : vectors. */
5737 : 63 : unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5738 : 63 : tree idx_val = NULL_TREE, val = NULL_TREE;
5739 : 419 : for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5740 : : {
5741 : 356 : tree old_idx_val = idx_val;
5742 : 356 : tree old_val = val;
5743 : 356 : idx_val = make_ssa_name (idx_eltype);
5744 : 356 : epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5745 : : build3 (BIT_FIELD_REF, idx_eltype,
5746 : : induction_index,
5747 : 356 : bitsize_int (el_size),
5748 : 356 : bitsize_int (off)));
5749 : 356 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5750 : 356 : val = make_ssa_name (data_eltype);
5751 : 712 : epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5752 : : build3 (BIT_FIELD_REF,
5753 : : data_eltype,
5754 : 356 : reduc_inputs[0],
5755 : 356 : bitsize_int (el_size),
5756 : 356 : bitsize_int (off)));
5757 : 356 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5758 : 356 : if (off != 0)
5759 : : {
5760 : 293 : tree new_idx_val = idx_val;
5761 : 293 : if (off != v_size - el_size)
5762 : : {
5763 : 230 : new_idx_val = make_ssa_name (idx_eltype);
5764 : 230 : epilog_stmt = gimple_build_assign (new_idx_val,
5765 : : MAX_EXPR, idx_val,
5766 : : old_idx_val);
5767 : 230 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5768 : : }
5769 : 293 : tree cond = make_ssa_name (boolean_type_node);
5770 : 293 : epilog_stmt = gimple_build_assign (cond, GT_EXPR,
5771 : : idx_val, old_idx_val);
5772 : 293 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5773 : 293 : tree new_val = make_ssa_name (data_eltype);
5774 : 293 : epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
5775 : : cond, val, old_val);
5776 : 293 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5777 : 293 : idx_val = new_idx_val;
5778 : 293 : val = new_val;
5779 : : }
5780 : : }
5781 : : /* Convert the reduced value back to the result type and set as the
5782 : : result. */
5783 : 63 : gimple_seq stmts = NULL;
5784 : 63 : val = gimple_convert (&stmts, scalar_type, val);
5785 : 63 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5786 : 63 : scalar_results.safe_push (val);
5787 : 63 : }
5788 : :
5789 : : /* 2.3 Create the reduction code, using one of the three schemes described
5790 : : above. In SLP we simply need to extract all the elements from the
5791 : : vector (without reducing them), so we use scalar shifts. */
5792 : 21933 : else if (reduc_fn != IFN_LAST && (!slp_reduc || group_size == 1))
5793 : : {
5794 : 20087 : tree tmp;
5795 : 20087 : tree vec_elem_type;
5796 : :
5797 : : /* Case 1: Create:
5798 : : v_out2 = reduc_expr <v_out1> */
5799 : :
5800 : 20087 : if (dump_enabled_p ())
5801 : 1408 : dump_printf_loc (MSG_NOTE, vect_location,
5802 : : "Reduce using direct vector reduction.\n");
5803 : :
5804 : 20087 : gimple_seq stmts = NULL;
5805 : 20087 : vec_elem_type = TREE_TYPE (vectype);
5806 : 20087 : new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5807 : 20087 : vec_elem_type, reduc_inputs[0]);
5808 : 20087 : new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5809 : 20087 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5810 : :
5811 : 20087 : if ((VECT_REDUC_INFO_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5812 : 62 : && induc_val)
5813 : : {
5814 : : /* Earlier we set the initial value to be a vector if induc_val
5815 : : values. Check the result and if it is induc_val then replace
5816 : : with the original initial value, unless induc_val is
5817 : : the same as initial_def already. */
5818 : 60 : tree zcompare = make_ssa_name (boolean_type_node);
5819 : 60 : epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
5820 : : new_temp, induc_val);
5821 : 60 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5822 : 60 : tree initial_def = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info)[0];
5823 : 60 : tmp = make_ssa_name (new_scalar_dest);
5824 : 60 : epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5825 : : initial_def, new_temp);
5826 : 60 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5827 : 60 : new_temp = tmp;
5828 : : }
5829 : :
5830 : 20087 : scalar_results.safe_push (new_temp);
5831 : 20087 : }
5832 : 1655 : else if (direct_slp_reduc)
5833 : : {
5834 : : /* Here we create one vector for each of the GROUP_SIZE results,
5835 : : with the elements for other SLP statements replaced with the
5836 : : neutral value. We can then do a normal reduction on each vector. */
5837 : :
5838 : : /* Enforced by vectorizable_reduction. */
5839 : : gcc_assert (reduc_inputs.length () == 1);
5840 : : gcc_assert (pow2p_hwi (group_size));
5841 : :
5842 : : gimple_seq seq = NULL;
5843 : :
5844 : : /* Build a vector {0, 1, 2, ...}, with the same number of elements
5845 : : and the same element size as VECTYPE. */
5846 : : tree index = build_index_vector (vectype, 0, 1);
5847 : : tree index_type = TREE_TYPE (index);
5848 : : tree index_elt_type = TREE_TYPE (index_type);
5849 : : tree mask_type = truth_type_for (index_type);
5850 : :
5851 : : /* Create a vector that, for each element, identifies which of
5852 : : the results should use it. */
5853 : : tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5854 : : index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5855 : : build_vector_from_val (index_type, index_mask));
5856 : :
5857 : : /* Get a neutral vector value. This is simply a splat of the neutral
5858 : : scalar value if we have one, otherwise the initial scalar value
5859 : : is itself a neutral value. */
5860 : : tree vector_identity = NULL_TREE;
5861 : : tree neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
5862 : : NULL_TREE, false);
5863 : : if (neutral_op)
5864 : : vector_identity = gimple_build_vector_from_val (&seq, vectype,
5865 : : neutral_op);
5866 : : for (unsigned int i = 0; i < group_size; ++i)
5867 : : {
5868 : : /* If there's no univeral neutral value, we can use the
5869 : : initial scalar value from the original PHI. This is used
5870 : : for MIN and MAX reduction, for example. */
5871 : : if (!neutral_op)
5872 : : {
5873 : : tree scalar_value
5874 : : = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info)[i];
5875 : : scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5876 : : scalar_value);
5877 : : vector_identity = gimple_build_vector_from_val (&seq, vectype,
5878 : : scalar_value);
5879 : : }
5880 : :
5881 : : /* Calculate the equivalent of:
5882 : :
5883 : : sel[j] = (index[j] == i);
5884 : :
5885 : : which selects the elements of REDUC_INPUTS[0] that should
5886 : : be included in the result. */
5887 : : tree compare_val = build_int_cst (index_elt_type, i);
5888 : : compare_val = build_vector_from_val (index_type, compare_val);
5889 : : tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5890 : : index, compare_val);
5891 : :
5892 : : /* Calculate the equivalent of:
5893 : :
5894 : : vec = seq ? reduc_inputs[0] : vector_identity;
5895 : :
5896 : : VEC is now suitable for a full vector reduction. */
5897 : : tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5898 : : sel, reduc_inputs[0], vector_identity);
5899 : :
5900 : : /* Do the reduction and convert it to the appropriate type. */
5901 : : tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5902 : : TREE_TYPE (vectype), vec);
5903 : : scalar = gimple_convert (&seq, scalar_type, scalar);
5904 : : scalar_results.safe_push (scalar);
5905 : : }
5906 : : gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5907 : : }
5908 : : else
5909 : : {
5910 : 1655 : bool reduce_with_shift;
5911 : 1655 : tree vec_temp;
5912 : :
5913 : 1655 : gcc_assert (slp_reduc || reduc_inputs.length () == 1);
5914 : :
5915 : : /* See if the target wants to do the final (shift) reduction
5916 : : in a vector mode of smaller size and first reduce upper/lower
5917 : : halves against each other. */
5918 : 1846 : enum machine_mode mode1 = mode;
5919 : 1846 : tree stype = TREE_TYPE (vectype);
5920 : 1846 : if (compute_vectype != vectype)
5921 : : {
5922 : 456 : stype = unsigned_type_for (stype);
5923 : 456 : gimple_seq stmts = NULL;
5924 : 982 : for (unsigned i = 0; i < reduc_inputs.length (); ++i)
5925 : : {
5926 : 526 : tree new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR,
5927 : 526 : compute_vectype, reduc_inputs[i]);
5928 : 526 : reduc_inputs[i] = new_temp;
5929 : : }
5930 : 456 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5931 : : }
5932 : 1846 : unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5933 : 1846 : unsigned nunits1 = nunits;
5934 : 1846 : if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5935 : 1846 : && reduc_inputs.length () == 1)
5936 : : {
5937 : 41 : nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5938 : : /* For SLP reductions we have to make sure lanes match up, but
5939 : : since we're doing individual element final reduction reducing
5940 : : vector width here is even more important.
5941 : : ??? We can also separate lanes with permutes, for the common
5942 : : case of power-of-two group-size odd/even extracts would work. */
5943 : 41 : if (slp_reduc && nunits != nunits1)
5944 : : {
5945 : 41 : nunits1 = least_common_multiple (nunits1, group_size);
5946 : 82 : gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5947 : : }
5948 : : }
5949 : 1805 : else if (!slp_reduc
5950 : 1805 : && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5951 : 0 : nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5952 : :
5953 : 1846 : tree vectype1 = vectype;
5954 : 1846 : if (mode1 != mode)
5955 : : {
5956 : 47 : vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5957 : 47 : stype, nunits1);
5958 : : /* First reduce the vector to the desired vector size we should
5959 : : do shift reduction on by combining upper and lower halves. */
5960 : 47 : gimple_seq stmts = NULL;
5961 : 47 : new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
5962 : : code, &stmts);
5963 : 47 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5964 : 47 : reduc_inputs[0] = new_temp;
5965 : : }
5966 : :
5967 : 1846 : reduce_with_shift = have_whole_vector_shift (mode1);
5968 : 733 : if (!VECTOR_MODE_P (mode1)
5969 : 2577 : || !directly_supported_p (code, vectype1))
5970 : : reduce_with_shift = false;
5971 : :
5972 : 1829 : if (reduce_with_shift && (!slp_reduc || group_size == 1))
5973 : : {
5974 : 1601 : tree bitsize = TYPE_SIZE (TREE_TYPE (vectype1));
5975 : 1601 : int element_bitsize = tree_to_uhwi (bitsize);
5976 : : /* Enforced by vectorizable_reduction, which disallows SLP reductions
5977 : : for variable-length vectors and also requires direct target support
5978 : : for loop reductions. */
5979 : 1601 : int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5980 : 1601 : int nelements = vec_size_in_bits / element_bitsize;
5981 : 1601 : vec_perm_builder sel;
5982 : 1601 : vec_perm_indices indices;
5983 : :
5984 : 1601 : int elt_offset;
5985 : :
5986 : 1601 : tree zero_vec = build_zero_cst (vectype1);
5987 : : /* Case 2: Create:
5988 : : for (offset = nelements/2; offset >= 1; offset/=2)
5989 : : {
5990 : : Create: va' = vec_shift <va, offset>
5991 : : Create: va = vop <va, va'>
5992 : : } */
5993 : :
5994 : 1601 : if (dump_enabled_p ())
5995 : 349 : dump_printf_loc (MSG_NOTE, vect_location,
5996 : : "Reduce using vector shifts\n");
5997 : :
5998 : 1601 : gimple_seq stmts = NULL;
5999 : 1601 : new_temp = gimple_convert (&stmts, vectype1, reduc_inputs[0]);
6000 : 1601 : for (elt_offset = nelements / 2;
6001 : 3499 : elt_offset >= 1;
6002 : 1898 : elt_offset /= 2)
6003 : : {
6004 : 1898 : calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6005 : 1898 : indices.new_vector (sel, 2, nelements);
6006 : 1898 : tree mask = vect_gen_perm_mask_any (vectype1, indices);
6007 : 1898 : new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6008 : : new_temp, zero_vec, mask);
6009 : 1898 : new_temp = gimple_build (&stmts, code,
6010 : : vectype1, new_name, new_temp);
6011 : : }
6012 : :
6013 : : /* 2.4 Extract the final scalar result. Create:
6014 : : s_out3 = extract_field <v_out2, bitpos> */
6015 : :
6016 : 1601 : if (dump_enabled_p ())
6017 : 349 : dump_printf_loc (MSG_NOTE, vect_location,
6018 : : "extract scalar result\n");
6019 : :
6020 : 1601 : new_temp = gimple_build (&stmts, BIT_FIELD_REF, TREE_TYPE (vectype1),
6021 : 1601 : new_temp, bitsize, bitsize_zero_node);
6022 : 1601 : new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6023 : 1601 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6024 : 1601 : scalar_results.safe_push (new_temp);
6025 : 1601 : }
6026 : : else
6027 : : {
6028 : : /* Case 3: Create:
6029 : : s = extract_field <v_out2, 0>
6030 : : for (offset = element_size;
6031 : : offset < vector_size;
6032 : : offset += element_size;)
6033 : : {
6034 : : Create: s' = extract_field <v_out2, offset>
6035 : : Create: s = op <s, s'> // For non SLP cases
6036 : : } */
6037 : :
6038 : 245 : if (dump_enabled_p ())
6039 : 150 : dump_printf_loc (MSG_NOTE, vect_location,
6040 : : "Reduce using scalar code.\n");
6041 : :
6042 : 245 : tree compute_type = TREE_TYPE (vectype1);
6043 : 245 : unsigned vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6044 : 245 : unsigned element_bitsize = vector_element_bits (vectype1);
6045 : 245 : tree bitsize = bitsize_int (element_bitsize);
6046 : 245 : gimple_seq stmts = NULL;
6047 : 655 : FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6048 : : {
6049 : 410 : unsigned bit_offset;
6050 : 820 : new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6051 : 410 : vec_temp, bitsize, bitsize_zero_node);
6052 : :
6053 : : /* In SLP we don't need to apply reduction operation, so we just
6054 : : collect s' values in SCALAR_RESULTS. */
6055 : 410 : if (slp_reduc)
6056 : 400 : scalar_results.safe_push (new_temp);
6057 : :
6058 : 1004 : for (bit_offset = element_bitsize;
6059 : 1414 : bit_offset < vec_size_in_bits;
6060 : 1004 : bit_offset += element_bitsize)
6061 : : {
6062 : 1004 : tree bitpos = bitsize_int (bit_offset);
6063 : 1004 : new_name = gimple_build (&stmts, BIT_FIELD_REF,
6064 : : compute_type, vec_temp,
6065 : : bitsize, bitpos);
6066 : 1004 : if (slp_reduc)
6067 : : {
6068 : : /* In SLP we don't need to apply reduction operation, so
6069 : : we just collect s' values in SCALAR_RESULTS. */
6070 : 994 : new_temp = new_name;
6071 : 994 : scalar_results.safe_push (new_name);
6072 : : }
6073 : : else
6074 : 10 : new_temp = gimple_build (&stmts, code, compute_type,
6075 : : new_name, new_temp);
6076 : : }
6077 : : }
6078 : :
6079 : : /* The only case where we need to reduce scalar results in a SLP
6080 : : reduction, is unrolling. If the size of SCALAR_RESULTS is
6081 : : greater than GROUP_SIZE, we reduce them combining elements modulo
6082 : : GROUP_SIZE. */
6083 : 245 : if (slp_reduc)
6084 : : {
6085 : 235 : tree res, first_res, new_res;
6086 : :
6087 : : /* Reduce multiple scalar results in case of SLP unrolling. */
6088 : 929 : for (j = group_size; scalar_results.iterate (j, &res);
6089 : : j++)
6090 : : {
6091 : 694 : first_res = scalar_results[j % group_size];
6092 : 694 : new_res = gimple_build (&stmts, code, compute_type,
6093 : : first_res, res);
6094 : 694 : scalar_results[j % group_size] = new_res;
6095 : : }
6096 : 235 : scalar_results.truncate (group_size);
6097 : 1170 : for (k = 0; k < group_size; k++)
6098 : 1400 : scalar_results[k] = gimple_convert (&stmts, scalar_type,
6099 : 700 : scalar_results[k]);
6100 : : }
6101 : : else
6102 : : {
6103 : : /* Reduction chain - we have one scalar to keep in
6104 : : SCALAR_RESULTS. */
6105 : 10 : new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6106 : 10 : scalar_results.safe_push (new_temp);
6107 : : }
6108 : :
6109 : 245 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6110 : : }
6111 : :
6112 : 1846 : if ((VECT_REDUC_INFO_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6113 : 0 : && induc_val)
6114 : : {
6115 : : /* Earlier we set the initial value to be a vector if induc_val
6116 : : values. Check the result and if it is induc_val then replace
6117 : : with the original initial value, unless induc_val is
6118 : : the same as initial_def already. */
6119 : 0 : tree zcompare = make_ssa_name (boolean_type_node);
6120 : 0 : epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6121 : 0 : scalar_results[0], induc_val);
6122 : 0 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6123 : 0 : tree initial_def = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info)[0];
6124 : 0 : tree tmp = make_ssa_name (new_scalar_dest);
6125 : 0 : epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6126 : 0 : initial_def, scalar_results[0]);
6127 : 0 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6128 : 0 : scalar_results[0] = tmp;
6129 : : }
6130 : : }
6131 : :
6132 : : /* 2.5 Adjust the final result by the initial value of the reduction
6133 : : variable. (When such adjustment is not needed, then
6134 : : 'adjustment_def' is zero). For example, if code is PLUS we create:
6135 : : new_temp = loop_exit_def + adjustment_def */
6136 : :
6137 : 22000 : if (adjustment_def)
6138 : : {
6139 : 15917 : gcc_assert (!slp_reduc || group_size == 1);
6140 : 15917 : gimple_seq stmts = NULL;
6141 : 15917 : if (double_reduc)
6142 : : {
6143 : 0 : gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6144 : 0 : adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6145 : 0 : new_temp = gimple_build (&stmts, code, vectype,
6146 : 0 : reduc_inputs[0], adjustment_def);
6147 : : }
6148 : : else
6149 : : {
6150 : 15917 : new_temp = scalar_results[0];
6151 : 15917 : gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6152 : 15917 : adjustment_def = gimple_convert (&stmts, TREE_TYPE (compute_vectype),
6153 : : adjustment_def);
6154 : 15917 : new_temp = gimple_convert (&stmts, TREE_TYPE (compute_vectype),
6155 : : new_temp);
6156 : 15917 : new_temp = gimple_build (&stmts, code, TREE_TYPE (compute_vectype),
6157 : : new_temp, adjustment_def);
6158 : 15917 : new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6159 : : }
6160 : :
6161 : 15917 : epilog_stmt = gimple_seq_last_stmt (stmts);
6162 : 15917 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6163 : 15917 : scalar_results[0] = new_temp;
6164 : : }
6165 : :
6166 : : /* Record this operation if it could be reused by the epilogue loop. */
6167 : 22000 : if (VECT_REDUC_INFO_TYPE (reduc_info) == TREE_CODE_REDUCTION
6168 : 22000 : && reduc_inputs.length () == 1)
6169 : 21814 : loop_vinfo->reusable_accumulators.put (scalar_results[0],
6170 : : { orig_reduc_input, reduc_info });
6171 : :
6172 : : /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6173 : : phis with new adjusted scalar results, i.e., replace use <s_out0>
6174 : : with use <s_out4>.
6175 : :
6176 : : Transform:
6177 : : loop_exit:
6178 : : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6179 : : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6180 : : v_out2 = reduce <v_out1>
6181 : : s_out3 = extract_field <v_out2, 0>
6182 : : s_out4 = adjust_result <s_out3>
6183 : : use <s_out0>
6184 : : use <s_out0>
6185 : :
6186 : : into:
6187 : :
6188 : : loop_exit:
6189 : : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6190 : : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6191 : : v_out2 = reduce <v_out1>
6192 : : s_out3 = extract_field <v_out2, 0>
6193 : : s_out4 = adjust_result <s_out3>
6194 : : use <s_out4>
6195 : : use <s_out4> */
6196 : :
6197 : 44000 : gcc_assert (live_out_stmts.size () == scalar_results.length ());
6198 : 22000 : auto_vec<gimple *> phis;
6199 : 44465 : for (k = 0; k < live_out_stmts.size (); k++)
6200 : : {
6201 : 22465 : stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6202 : 22465 : tree scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6203 : :
6204 : : /* Find the loop-closed-use at the loop exit of the original scalar
6205 : : result. (The reduction result is expected to have two immediate uses,
6206 : : one at the latch block, and one at the loop exit). Note with
6207 : : early break we can have two exit blocks, so pick the correct PHI. */
6208 : 114280 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6209 : 69350 : if (!is_gimple_debug (USE_STMT (use_p))
6210 : 69350 : && !flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6211 : : {
6212 : 22460 : gcc_assert (is_a <gphi *> (USE_STMT (use_p)));
6213 : 22460 : if (gimple_bb (USE_STMT (use_p)) == loop_exit->dest)
6214 : 22452 : phis.safe_push (USE_STMT (use_p));
6215 : 22465 : }
6216 : :
6217 : 44917 : FOR_EACH_VEC_ELT (phis, i, exit_phi)
6218 : : {
6219 : : /* Replace the uses: */
6220 : 22452 : orig_name = PHI_RESULT (exit_phi);
6221 : :
6222 : : /* Look for a single use at the target of the skip edge. */
6223 : 22452 : if (unify_with_main_loop_p)
6224 : : {
6225 : 38 : use_operand_p use_p;
6226 : 38 : gimple *user;
6227 : 38 : if (!single_imm_use (orig_name, &use_p, &user))
6228 : 0 : gcc_unreachable ();
6229 : 38 : orig_name = gimple_get_lhs (user);
6230 : : }
6231 : :
6232 : 22452 : scalar_result = scalar_results[k];
6233 : 83357 : FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6234 : : {
6235 : 115403 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6236 : 38475 : SET_USE (use_p, scalar_result);
6237 : 38453 : update_stmt (use_stmt);
6238 : 22452 : }
6239 : : }
6240 : :
6241 : 22465 : phis.truncate (0);
6242 : : }
6243 : 22000 : }
6244 : :
6245 : : /* Return a vector of type VECTYPE that is equal to the vector select
6246 : : operation "MASK ? VEC : IDENTITY". Insert the select statements
6247 : : before GSI. */
6248 : :
6249 : : static tree
6250 : 9 : merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6251 : : tree vec, tree identity)
6252 : : {
6253 : 9 : tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6254 : 9 : gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6255 : : mask, vec, identity);
6256 : 9 : gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6257 : 9 : return cond;
6258 : : }
6259 : :
6260 : : /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6261 : : order, starting with LHS. Insert the extraction statements before GSI and
6262 : : associate the new scalar SSA names with variable SCALAR_DEST.
6263 : : If MASK is nonzero mask the input and then operate on it unconditionally.
6264 : : Return the SSA name for the result. */
6265 : :
6266 : : static tree
6267 : 1043 : vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6268 : : tree_code code, tree lhs, tree vector_rhs,
6269 : : tree mask)
6270 : : {
6271 : 1043 : tree vectype = TREE_TYPE (vector_rhs);
6272 : 1043 : tree scalar_type = TREE_TYPE (vectype);
6273 : 1043 : tree bitsize = TYPE_SIZE (scalar_type);
6274 : 1043 : unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6275 : 1043 : unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6276 : :
6277 : : /* Re-create a VEC_COND_EXPR to mask the input here in order to be able
6278 : : to perform an unconditional element-wise reduction of it. */
6279 : 1043 : if (mask)
6280 : : {
6281 : 45 : tree masked_vector_rhs = make_temp_ssa_name (vectype, NULL,
6282 : : "masked_vector_rhs");
6283 : 45 : tree neutral_op = neutral_op_for_reduction (scalar_type, code, NULL_TREE,
6284 : : false);
6285 : 45 : tree vector_identity = build_vector_from_val (vectype, neutral_op);
6286 : 45 : gassign *select = gimple_build_assign (masked_vector_rhs, VEC_COND_EXPR,
6287 : : mask, vector_rhs, vector_identity);
6288 : 45 : gsi_insert_before (gsi, select, GSI_SAME_STMT);
6289 : 45 : vector_rhs = masked_vector_rhs;
6290 : : }
6291 : :
6292 : 1043 : for (unsigned HOST_WIDE_INT bit_offset = 0;
6293 : 4647 : bit_offset < vec_size_in_bits;
6294 : 3604 : bit_offset += element_bitsize)
6295 : : {
6296 : 3604 : tree bitpos = bitsize_int (bit_offset);
6297 : 3604 : tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6298 : : bitsize, bitpos);
6299 : :
6300 : 3604 : gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6301 : 3604 : rhs = make_ssa_name (scalar_dest, stmt);
6302 : 3604 : gimple_assign_set_lhs (stmt, rhs);
6303 : 3604 : gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6304 : : /* Fold the vector extract, combining it with a previous reversal
6305 : : like seen in PR90579. */
6306 : 3604 : auto gsi2 = gsi_for_stmt (stmt);
6307 : 3604 : if (fold_stmt (&gsi2, follow_all_ssa_edges))
6308 : 356 : update_stmt (gsi_stmt (gsi2));
6309 : :
6310 : 3604 : stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6311 : 3604 : tree new_name = make_ssa_name (scalar_dest, stmt);
6312 : 3604 : gimple_assign_set_lhs (stmt, new_name);
6313 : 3604 : gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6314 : 3604 : lhs = new_name;
6315 : : }
6316 : 1043 : return lhs;
6317 : : }
6318 : :
6319 : : /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6320 : : type of the vector input. */
6321 : :
6322 : : static internal_fn
6323 : 2520 : get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6324 : : {
6325 : 2520 : internal_fn mask_reduc_fn;
6326 : 2520 : internal_fn mask_len_reduc_fn;
6327 : :
6328 : 2520 : switch (reduc_fn)
6329 : : {
6330 : 0 : case IFN_FOLD_LEFT_PLUS:
6331 : 0 : mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6332 : 0 : mask_len_reduc_fn = IFN_MASK_LEN_FOLD_LEFT_PLUS;
6333 : 0 : break;
6334 : :
6335 : : default:
6336 : : return IFN_LAST;
6337 : : }
6338 : :
6339 : 0 : if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6340 : : OPTIMIZE_FOR_SPEED))
6341 : : return mask_reduc_fn;
6342 : 0 : if (direct_internal_fn_supported_p (mask_len_reduc_fn, vectype_in,
6343 : : OPTIMIZE_FOR_SPEED))
6344 : : return mask_len_reduc_fn;
6345 : : return IFN_LAST;
6346 : : }
6347 : :
6348 : : /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
6349 : : statement that sets the live-out value. REDUC_DEF_STMT is the phi
6350 : : statement. CODE is the operation performed by STMT_INFO and OPS are
6351 : : its scalar operands. REDUC_INDEX is the index of the operand in
6352 : : OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6353 : : implements in-order reduction, or IFN_LAST if we should open-code it.
6354 : : VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6355 : : that should be used to control the operation in a fully-masked loop. */
6356 : :
6357 : : static bool
6358 : 830 : vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6359 : : stmt_vec_info stmt_info,
6360 : : gimple_stmt_iterator *gsi,
6361 : : slp_tree slp_node,
6362 : : code_helper code, internal_fn reduc_fn,
6363 : : int num_ops, tree vectype_in,
6364 : : int reduc_index, vec_loop_masks *masks,
6365 : : vec_loop_lens *lens)
6366 : : {
6367 : 830 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6368 : 830 : tree vectype_out = SLP_TREE_VECTYPE (slp_node);
6369 : 830 : internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6370 : :
6371 : 830 : gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6372 : :
6373 : 830 : bool is_cond_op = false;
6374 : 830 : if (!code.is_tree_code ())
6375 : : {
6376 : 15 : code = conditional_internal_fn_code (internal_fn (code));
6377 : 15 : gcc_assert (code != ERROR_MARK);
6378 : : is_cond_op = true;
6379 : : }
6380 : :
6381 : 830 : gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
6382 : :
6383 : 830 : gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6384 : : TYPE_VECTOR_SUBPARTS (vectype_in)));
6385 : :
6386 : : /* ??? We should, when transforming the cycle PHI, record the existing
6387 : : scalar def as vector def so looking up the vector def works. This
6388 : : would also allow generalizing this for reduction paths of length > 1
6389 : : and/or SLP reductions. */
6390 : 830 : slp_tree reduc_node = SLP_TREE_CHILDREN (slp_node)[reduc_index];
6391 : 830 : stmt_vec_info reduc_var_def = SLP_TREE_SCALAR_STMTS (reduc_node)[0];
6392 : 830 : tree reduc_var = gimple_get_lhs (STMT_VINFO_STMT (reduc_var_def));
6393 : :
6394 : : /* The operands either come from a binary operation or an IFN_COND operation.
6395 : : The former is a gimple assign with binary rhs and the latter is a
6396 : : gimple call with four arguments. */
6397 : 830 : gcc_assert (num_ops == 2 || num_ops == 4);
6398 : :
6399 : 830 : auto_vec<tree> vec_oprnds0, vec_opmask;
6400 : 830 : vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[(is_cond_op ? 2 : 0)
6401 : 830 : + (1 - reduc_index)],
6402 : : &vec_oprnds0);
6403 : : /* For an IFN_COND_OP we also need the vector mask operand. */
6404 : 830 : if (is_cond_op)
6405 : 15 : vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[0], &vec_opmask);
6406 : :
6407 : : /* The transform below relies on preserving the original scalar PHI
6408 : : and its latch def which we replace. So work backwards from there. */
6409 : 830 : tree scalar_dest
6410 : 830 : = gimple_phi_arg_def_from_edge (as_a <gphi *> (STMT_VINFO_STMT
6411 : : (reduc_var_def)),
6412 : 830 : loop_latch_edge (loop));
6413 : 830 : stmt_vec_info scalar_dest_def_info
6414 : 830 : = vect_stmt_to_vectorize (loop_vinfo->lookup_def (scalar_dest));
6415 : 830 : tree scalar_type = TREE_TYPE (scalar_dest);
6416 : :
6417 : 830 : int vec_num = vec_oprnds0.length ();
6418 : 830 : tree vec_elem_type = TREE_TYPE (vectype_out);
6419 : 830 : gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6420 : :
6421 : 830 : tree vector_identity = NULL_TREE;
6422 : 830 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6423 : : {
6424 : 2 : vector_identity = build_zero_cst (vectype_out);
6425 : 2 : if (!HONOR_SIGNED_ZEROS (vectype_out))
6426 : : ;
6427 : : else
6428 : : {
6429 : 2 : gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));
6430 : 2 : vector_identity = const_unop (NEGATE_EXPR, vectype_out,
6431 : : vector_identity);
6432 : : }
6433 : : }
6434 : :
6435 : 830 : tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6436 : 830 : int i;
6437 : 830 : tree def0;
6438 : 1873 : FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6439 : : {
6440 : 1043 : gimple *new_stmt;
6441 : 1043 : tree mask = NULL_TREE;
6442 : 1043 : tree len = NULL_TREE;
6443 : 1043 : tree bias = NULL_TREE;
6444 : 1043 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6445 : : {
6446 : 9 : tree loop_mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
6447 : : vec_num, vectype_in, i);
6448 : 9 : if (is_cond_op)
6449 : 9 : mask = prepare_vec_mask (loop_vinfo, TREE_TYPE (loop_mask),
6450 : 9 : loop_mask, vec_opmask[i], gsi);
6451 : : else
6452 : : mask = loop_mask;
6453 : : }
6454 : 1034 : else if (is_cond_op)
6455 : 36 : mask = vec_opmask[i];
6456 : 1043 : if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
6457 : : {
6458 : 0 : len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
6459 : : i, 1);
6460 : 0 : signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
6461 : 0 : bias = build_int_cst (intQI_type_node, biasval);
6462 : 0 : if (!is_cond_op)
6463 : 0 : mask = build_minus_one_cst (truth_type_for (vectype_in));
6464 : : }
6465 : :
6466 : : /* Handle MINUS by adding the negative. */
6467 : 1043 : if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6468 : : {
6469 : 0 : tree negated = make_ssa_name (vectype_out);
6470 : 0 : new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6471 : 0 : gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6472 : 0 : def0 = negated;
6473 : : }
6474 : :
6475 : 9 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
6476 : 1052 : && mask && mask_reduc_fn == IFN_LAST)
6477 : 9 : def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6478 : : vector_identity);
6479 : :
6480 : : /* On the first iteration the input is simply the scalar phi
6481 : : result, and for subsequent iterations it is the output of
6482 : : the preceding operation. */
6483 : 1043 : if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6484 : : {
6485 : 0 : if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
6486 : 0 : new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
6487 : : def0, mask, len, bias);
6488 : 0 : else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
6489 : 0 : new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6490 : : def0, mask);
6491 : : else
6492 : 0 : new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6493 : : def0);
6494 : : /* For chained SLP reductions the output of the previous reduction
6495 : : operation serves as the input of the next. For the final statement
6496 : : the output cannot be a temporary - we reuse the original
6497 : : scalar destination of the last statement. */
6498 : 0 : if (i != vec_num - 1)
6499 : : {
6500 : 0 : gimple_set_lhs (new_stmt, scalar_dest_var);
6501 : 0 : reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6502 : 0 : gimple_set_lhs (new_stmt, reduc_var);
6503 : : }
6504 : : }
6505 : : else
6506 : : {
6507 : 1043 : reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
6508 : : tree_code (code), reduc_var, def0,
6509 : : mask);
6510 : 1043 : new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6511 : : /* Remove the statement, so that we can use the same code paths
6512 : : as for statements that we've just created. */
6513 : 1043 : gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6514 : 1043 : gsi_remove (&tmp_gsi, true);
6515 : : }
6516 : :
6517 : 1043 : if (i == vec_num - 1)
6518 : : {
6519 : 830 : gimple_set_lhs (new_stmt, scalar_dest);
6520 : 830 : vect_finish_replace_stmt (loop_vinfo,
6521 : : scalar_dest_def_info,
6522 : : new_stmt);
6523 : : }
6524 : : else
6525 : 213 : vect_finish_stmt_generation (loop_vinfo,
6526 : : scalar_dest_def_info,
6527 : : new_stmt, gsi);
6528 : :
6529 : 1043 : slp_node->push_vec_def (new_stmt);
6530 : : }
6531 : :
6532 : 830 : return true;
6533 : 830 : }
6534 : :
6535 : : /* Function is_nonwrapping_integer_induction.
6536 : :
6537 : : Check if STMT_VINO (which is part of loop LOOP) both increments and
6538 : : does not cause overflow. */
6539 : :
6540 : : static bool
6541 : 407 : is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6542 : : {
6543 : 407 : gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6544 : 407 : tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6545 : 407 : tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6546 : 407 : tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6547 : 407 : widest_int ni, max_loop_value, lhs_max;
6548 : 407 : wi::overflow_type overflow = wi::OVF_NONE;
6549 : :
6550 : : /* Make sure the loop is integer based. */
6551 : 407 : if (TREE_CODE (base) != INTEGER_CST
6552 : 108 : || TREE_CODE (step) != INTEGER_CST)
6553 : : return false;
6554 : :
6555 : : /* Check that the max size of the loop will not wrap. */
6556 : :
6557 : 108 : if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6558 : : return true;
6559 : :
6560 : 8 : if (! max_stmt_executions (loop, &ni))
6561 : : return false;
6562 : :
6563 : 8 : max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6564 : 8 : &overflow);
6565 : 8 : if (overflow)
6566 : : return false;
6567 : :
6568 : 8 : max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6569 : 16 : TYPE_SIGN (lhs_type), &overflow);
6570 : 8 : if (overflow)
6571 : : return false;
6572 : :
6573 : 8 : return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6574 : 8 : <= TYPE_PRECISION (lhs_type));
6575 : 407 : }
6576 : :
6577 : : /* Check if masking can be supported by inserting a conditional expression.
6578 : : CODE is the code for the operation. COND_FN is the conditional internal
6579 : : function, if it exists. VECTYPE_IN is the type of the vector input. */
6580 : : static bool
6581 : 5269 : use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
6582 : : tree vectype_in)
6583 : : {
6584 : 5269 : if (cond_fn != IFN_LAST
6585 : 5269 : && direct_internal_fn_supported_p (cond_fn, vectype_in,
6586 : : OPTIMIZE_FOR_SPEED))
6587 : : return false;
6588 : :
6589 : 3767 : if (code.is_tree_code ())
6590 : 3765 : switch (tree_code (code))
6591 : : {
6592 : : case DOT_PROD_EXPR:
6593 : : case SAD_EXPR:
6594 : : return true;
6595 : :
6596 : : default:
6597 : : break;
6598 : : }
6599 : : return false;
6600 : : }
6601 : :
6602 : : /* Insert a conditional expression to enable masked vectorization. CODE is the
6603 : : code for the operation. VOP is the array of operands. MASK is the loop
6604 : : mask. GSI is a statement iterator used to place the new conditional
6605 : : expression. */
6606 : : static void
6607 : 4 : build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
6608 : : gimple_stmt_iterator *gsi)
6609 : : {
6610 : 4 : switch (tree_code (code))
6611 : : {
6612 : 4 : case DOT_PROD_EXPR:
6613 : 4 : {
6614 : 4 : tree vectype = TREE_TYPE (vop[1]);
6615 : 4 : tree zero = build_zero_cst (vectype);
6616 : 4 : tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6617 : 4 : gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6618 : : mask, vop[1], zero);
6619 : 4 : gsi_insert_before (gsi, select, GSI_SAME_STMT);
6620 : 4 : vop[1] = masked_op1;
6621 : 4 : break;
6622 : : }
6623 : :
6624 : 0 : case SAD_EXPR:
6625 : 0 : {
6626 : 0 : tree vectype = TREE_TYPE (vop[1]);
6627 : 0 : tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6628 : 0 : gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6629 : : mask, vop[1], vop[0]);
6630 : 0 : gsi_insert_before (gsi, select, GSI_SAME_STMT);
6631 : 0 : vop[1] = masked_op1;
6632 : 0 : break;
6633 : : }
6634 : :
6635 : 0 : default:
6636 : 0 : gcc_unreachable ();
6637 : : }
6638 : 4 : }
6639 : :
6640 : : /* Given an operation with CODE in loop reduction path whose reduction PHI is
6641 : : specified by REDUC_INFO, the operation has TYPE of scalar result, and its
6642 : : input vectype is represented by VECTYPE_IN. The vectype of vectorized result
6643 : : may be different from VECTYPE_IN, either in base type or vectype lanes,
6644 : : lane-reducing operation is the case. This function check if it is possible,
6645 : : and how to perform partial vectorization on the operation in the context
6646 : : of LOOP_VINFO. */
6647 : :
6648 : : static void
6649 : 3392 : vect_reduction_update_partial_vector_usage (loop_vec_info loop_vinfo,
6650 : : vect_reduc_info reduc_info,
6651 : : slp_tree slp_node,
6652 : : code_helper code, tree type,
6653 : : tree vectype_in)
6654 : : {
6655 : 3392 : enum vect_reduction_type reduc_type = VECT_REDUC_INFO_TYPE (reduc_info);
6656 : 3392 : internal_fn reduc_fn = VECT_REDUC_INFO_FN (reduc_info);
6657 : 3392 : internal_fn cond_fn
6658 : 920 : = ((code.is_internal_fn ()
6659 : 920 : && internal_fn_mask_index ((internal_fn)code) != -1)
6660 : 3392 : ? (internal_fn)code : get_conditional_internal_fn (code, type));
6661 : :
6662 : 3392 : if (reduc_type != FOLD_LEFT_REDUCTION
6663 : 2717 : && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
6664 : 6066 : && (cond_fn == IFN_LAST
6665 : 2674 : || !direct_internal_fn_supported_p (cond_fn, vectype_in,
6666 : : OPTIMIZE_FOR_SPEED)))
6667 : : {
6668 : 1702 : if (dump_enabled_p ())
6669 : 97 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6670 : : "can't operate on partial vectors because"
6671 : : " no conditional operation is available.\n");
6672 : 1702 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
6673 : : }
6674 : 1690 : else if (reduc_type == FOLD_LEFT_REDUCTION
6675 : 1690 : && reduc_fn == IFN_LAST
6676 : 1690 : && !expand_vec_cond_expr_p (vectype_in, truth_type_for (vectype_in)))
6677 : : {
6678 : 0 : if (dump_enabled_p ())
6679 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6680 : : "can't operate on partial vectors because"
6681 : : " no conditional operation is available.\n");
6682 : 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
6683 : : }
6684 : 1690 : else if (reduc_type == FOLD_LEFT_REDUCTION
6685 : 675 : && internal_fn_mask_index (reduc_fn) == -1
6686 : 675 : && FLOAT_TYPE_P (vectype_in)
6687 : 2360 : && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
6688 : : {
6689 : 0 : if (dump_enabled_p ())
6690 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6691 : : "can't operate on partial vectors because"
6692 : : " signed zeros cannot be preserved.\n");
6693 : 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
6694 : : }
6695 : : else
6696 : : {
6697 : 1690 : internal_fn mask_reduc_fn
6698 : 1690 : = get_masked_reduction_fn (reduc_fn, vectype_in);
6699 : 1690 : vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6700 : 1690 : vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
6701 : 1690 : unsigned nvectors = vect_get_num_copies (loop_vinfo, slp_node);
6702 : :
6703 : 1690 : if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
6704 : 0 : vect_record_loop_len (loop_vinfo, lens, nvectors, vectype_in, 1);
6705 : : else
6706 : 1690 : vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype_in, NULL);
6707 : : }
6708 : 3392 : }
6709 : :
6710 : : /* Check if STMT_INFO is a lane-reducing operation that can be vectorized in
6711 : : the context of LOOP_VINFO, and vector cost will be recorded in COST_VEC,
6712 : : and the analysis is for slp if SLP_NODE is not NULL.
6713 : :
6714 : : For a lane-reducing operation, the loop reduction path that it lies in,
6715 : : may contain normal operation, or other lane-reducing operation of different
6716 : : input type size, an example as:
6717 : :
6718 : : int sum = 0;
6719 : : for (i)
6720 : : {
6721 : : ...
6722 : : sum += d0[i] * d1[i]; // dot-prod <vector(16) char>
6723 : : sum += w[i]; // widen-sum <vector(16) char>
6724 : : sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
6725 : : sum += n[i]; // normal <vector(4) int>
6726 : : ...
6727 : : }
6728 : :
6729 : : Vectorization factor is essentially determined by operation whose input
6730 : : vectype has the most lanes ("vector(16) char" in the example), while we
6731 : : need to choose input vectype with the least lanes ("vector(4) int" in the
6732 : : example) to determine effective number of vector reduction PHIs. */
6733 : :
6734 : : bool
6735 : 308283 : vectorizable_lane_reducing (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
6736 : : slp_tree slp_node, stmt_vector_for_cost *cost_vec)
6737 : : {
6738 : 308283 : gimple *stmt = stmt_info->stmt;
6739 : :
6740 : 308283 : if (!lane_reducing_stmt_p (stmt))
6741 : : return false;
6742 : :
6743 : 456 : tree type = TREE_TYPE (gimple_assign_lhs (stmt));
6744 : :
6745 : 456 : if (!INTEGRAL_TYPE_P (type))
6746 : : return false;
6747 : :
6748 : : /* Do not try to vectorize bit-precision reductions. */
6749 : 456 : if (!type_has_mode_precision_p (type))
6750 : : return false;
6751 : :
6752 : 456 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
6753 : :
6754 : : /* TODO: Support lane-reducing operation that does not directly participate
6755 : : in loop reduction. */
6756 : 456 : if (!reduc_info)
6757 : : return false;
6758 : :
6759 : : /* Lane-reducing pattern inside any inner loop of LOOP_VINFO is not
6760 : : recoginized. */
6761 : 456 : gcc_assert (!nested_in_vect_loop_p (LOOP_VINFO_LOOP (loop_vinfo), stmt_info));
6762 : 456 : gcc_assert (VECT_REDUC_INFO_TYPE (reduc_info) == TREE_CODE_REDUCTION);
6763 : :
6764 : 1824 : for (int i = 0; i < (int) gimple_num_ops (stmt) - 1; i++)
6765 : : {
6766 : 1368 : slp_tree slp_op;
6767 : 1368 : tree op;
6768 : 1368 : tree vectype;
6769 : 1368 : enum vect_def_type dt;
6770 : :
6771 : 1368 : if (!vect_is_simple_use (loop_vinfo, slp_node, i, &op,
6772 : : &slp_op, &dt, &vectype))
6773 : : {
6774 : 0 : if (dump_enabled_p ())
6775 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6776 : : "use not simple.\n");
6777 : 0 : return false;
6778 : : }
6779 : :
6780 : 1368 : if (!vectype)
6781 : : {
6782 : 6 : vectype = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op),
6783 : : slp_op);
6784 : 6 : if (!vectype)
6785 : : return false;
6786 : : }
6787 : :
6788 : 1368 : if (!vect_maybe_update_slp_op_vectype (slp_op, vectype))
6789 : : {
6790 : 0 : if (dump_enabled_p ())
6791 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6792 : : "incompatible vector types for invariants\n");
6793 : 0 : return false;
6794 : : }
6795 : :
6796 : 1368 : if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6797 : 456 : continue;
6798 : :
6799 : : /* There should be at most one cycle def in the stmt. */
6800 : 912 : if (VECTORIZABLE_CYCLE_DEF (dt))
6801 : : return false;
6802 : : }
6803 : :
6804 : 456 : slp_tree node_in = SLP_TREE_CHILDREN (slp_node)[0];
6805 : 456 : tree vectype_in = SLP_TREE_VECTYPE (node_in);
6806 : 456 : gcc_assert (vectype_in);
6807 : :
6808 : : /* Compute number of effective vector statements for costing. */
6809 : 456 : unsigned int ncopies_for_cost = vect_get_num_copies (loop_vinfo, node_in);
6810 : 456 : gcc_assert (ncopies_for_cost >= 1);
6811 : :
6812 : 456 : if (vect_is_emulated_mixed_dot_prod (slp_node))
6813 : : {
6814 : : /* We need extra two invariants: one that contains the minimum signed
6815 : : value and one that contains half of its negative. */
6816 : 11 : int prologue_stmts = 2;
6817 : 11 : unsigned cost = record_stmt_cost (cost_vec, prologue_stmts,
6818 : : scalar_to_vec, slp_node, 0,
6819 : : vect_prologue);
6820 : 11 : if (dump_enabled_p ())
6821 : 0 : dump_printf (MSG_NOTE, "vectorizable_lane_reducing: "
6822 : : "extra prologue_cost = %d .\n", cost);
6823 : :
6824 : : /* Three dot-products and a subtraction. */
6825 : 11 : ncopies_for_cost *= 4;
6826 : : }
6827 : :
6828 : 456 : record_stmt_cost (cost_vec, (int) ncopies_for_cost, vector_stmt, slp_node,
6829 : : 0, vect_body);
6830 : :
6831 : 456 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
6832 : : {
6833 : 43 : enum tree_code code = gimple_assign_rhs_code (stmt);
6834 : 43 : vect_reduction_update_partial_vector_usage (loop_vinfo, reduc_info,
6835 : 43 : node_in, code, type,
6836 : : vectype_in);
6837 : : }
6838 : :
6839 : : /* Transform via vect_transform_reduction. */
6840 : 456 : SLP_TREE_TYPE (slp_node) = reduc_vec_info_type;
6841 : 456 : return true;
6842 : : }
6843 : :
6844 : : /* Function vectorizable_reduction.
6845 : :
6846 : : Check if STMT_INFO performs a reduction operation that can be vectorized.
6847 : : If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6848 : : stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6849 : : Return true if STMT_INFO is vectorizable in this way.
6850 : :
6851 : : This function also handles reduction idioms (patterns) that have been
6852 : : recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6853 : : may be of this form:
6854 : : X = pattern_expr (arg0, arg1, ..., X)
6855 : : and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6856 : : sequence that had been detected and replaced by the pattern-stmt
6857 : : (STMT_INFO).
6858 : :
6859 : : This function also handles reduction of condition expressions, for example:
6860 : : for (int i = 0; i < N; i++)
6861 : : if (a[i] < value)
6862 : : last = a[i];
6863 : : This is handled by vectorising the loop and creating an additional vector
6864 : : containing the loop indexes for which "a[i] < value" was true. In the
6865 : : function epilogue this is reduced to a single max value and then used to
6866 : : index into the vector of results.
6867 : :
6868 : : In some cases of reduction patterns, the type of the reduction variable X is
6869 : : different than the type of the other arguments of STMT_INFO.
6870 : : In such cases, the vectype that is used when transforming STMT_INFO into
6871 : : a vector stmt is different than the vectype that is used to determine the
6872 : : vectorization factor, because it consists of a different number of elements
6873 : : than the actual number of elements that are being operated upon in parallel.
6874 : :
6875 : : For example, consider an accumulation of shorts into an int accumulator.
6876 : : On some targets it's possible to vectorize this pattern operating on 8
6877 : : shorts at a time (hence, the vectype for purposes of determining the
6878 : : vectorization factor should be V8HI); on the other hand, the vectype that
6879 : : is used to create the vector form is actually V4SI (the type of the result).
6880 : :
6881 : : Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6882 : : indicates what is the actual level of parallelism (V8HI in the example), so
6883 : : that the right vectorization factor would be derived. This vectype
6884 : : corresponds to the type of arguments to the reduction stmt, and should *NOT*
6885 : : be used to create the vectorized stmt. The right vectype for the vectorized
6886 : : stmt is obtained from the type of the result X:
6887 : : get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6888 : :
6889 : : This means that, contrary to "regular" reductions (or "regular" stmts in
6890 : : general), the following equation:
6891 : : STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6892 : : does *NOT* necessarily hold for reduction patterns. */
6893 : :
6894 : : bool
6895 : 307827 : vectorizable_reduction (loop_vec_info loop_vinfo,
6896 : : stmt_vec_info stmt_info, slp_tree slp_node,
6897 : : slp_instance slp_node_instance,
6898 : : stmt_vector_for_cost *cost_vec)
6899 : : {
6900 : 307827 : tree vectype_in = NULL_TREE;
6901 : 307827 : enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6902 : 307827 : stmt_vec_info cond_stmt_vinfo = NULL;
6903 : 307827 : int i;
6904 : 307827 : int ncopies;
6905 : 307827 : bool single_defuse_cycle = false;
6906 : 307827 : tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6907 : 307827 : tree cond_reduc_val = NULL_TREE;
6908 : :
6909 : : /* Make sure it was already recognized as a reduction computation. */
6910 : 307827 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6911 : : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6912 : 307827 : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6913 : : return false;
6914 : :
6915 : : /* The reduction meta. */
6916 : 57079 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
6917 : :
6918 : 57079 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6919 : : {
6920 : 1339 : gcc_assert (is_a <gphi *> (stmt_info->stmt));
6921 : : /* We eventually need to set a vector type on invariant arguments. */
6922 : : unsigned j;
6923 : : slp_tree child;
6924 : 4017 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
6925 : 2678 : if (!vect_maybe_update_slp_op_vectype (child,
6926 : : SLP_TREE_VECTYPE (slp_node)))
6927 : : {
6928 : 0 : if (dump_enabled_p ())
6929 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6930 : : "incompatible vector types for "
6931 : : "invariants\n");
6932 : 0 : return false;
6933 : : }
6934 : : /* Analysis for double-reduction is done on the outer
6935 : : loop PHI, nested cycles have no further restrictions. */
6936 : 1339 : SLP_TREE_TYPE (slp_node) = cycle_phi_info_type;
6937 : 1339 : return true;
6938 : : }
6939 : :
6940 : 55740 : if (!is_a <gphi *> (stmt_info->stmt))
6941 : : {
6942 : 7018 : gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def);
6943 : 7018 : SLP_TREE_TYPE (slp_node) = reduc_vec_info_type;
6944 : 7018 : return true;
6945 : : }
6946 : :
6947 : 48722 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6948 : 48722 : stmt_vec_info phi_info = stmt_info;
6949 : 48722 : bool double_reduc = false;
6950 : 48722 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
6951 : : {
6952 : : /* We arrive here for both the inner loop LC PHI and the
6953 : : outer loop PHI. The latter is what we want to analyze the
6954 : : reduction with. The LC PHI is handled by vectorizable_lc_phi. */
6955 : 266 : if (gimple_bb (stmt_info->stmt) != loop->header)
6956 : 0 : return false;
6957 : :
6958 : : /* Set loop and phi_info to the inner loop. */
6959 : 266 : use_operand_p use_p;
6960 : 266 : gimple *use_stmt;
6961 : 266 : bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6962 : : &use_p, &use_stmt);
6963 : 266 : gcc_assert (res);
6964 : 266 : phi_info = loop_vinfo->lookup_stmt (use_stmt);
6965 : 266 : loop = loop->inner;
6966 : 266 : double_reduc = true;
6967 : : }
6968 : :
6969 : 48722 : const bool reduc_chain = reduc_info->is_reduc_chain;
6970 : 48722 : slp_node_instance->reduc_phis = slp_node;
6971 : : /* ??? We're leaving slp_node to point to the PHIs, we only
6972 : : need it to get at the number of vector stmts which wasn't
6973 : : yet initialized for the instance root. */
6974 : :
6975 : : /* PHIs should not participate in patterns. */
6976 : 48722 : gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6977 : 48722 : gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6978 : :
6979 : : /* Verify following REDUC_IDX from the latch def leads us back to the PHI
6980 : : and compute the reduction chain length. Discover the real
6981 : : reduction operation stmt on the way (slp_for_stmt_info). */
6982 : 48722 : unsigned reduc_chain_length = 0;
6983 : 48722 : stmt_info = NULL;
6984 : 48722 : slp_tree slp_for_stmt_info = NULL;
6985 : 48722 : slp_tree vdef_slp = slp_node_instance->root;
6986 : 107236 : while (vdef_slp != slp_node)
6987 : : {
6988 : 59266 : int reduc_idx = SLP_TREE_REDUC_IDX (vdef_slp);
6989 : 59266 : if (reduc_idx == -1)
6990 : : {
6991 : 744 : if (dump_enabled_p ())
6992 : 7 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6993 : : "reduction chain broken by patterns.\n");
6994 : 752 : return false;
6995 : : }
6996 : 58522 : stmt_vec_info vdef = SLP_TREE_REPRESENTATIVE (vdef_slp);
6997 : 58522 : if (is_a <gphi *> (vdef->stmt))
6998 : : {
6999 : 532 : vdef_slp = SLP_TREE_CHILDREN (vdef_slp)[reduc_idx];
7000 : : /* Do not count PHIs towards the chain length. */
7001 : 532 : continue;
7002 : : }
7003 : 57990 : gimple_match_op op;
7004 : 57990 : if (!gimple_extract_op (vdef->stmt, &op))
7005 : : {
7006 : 0 : if (dump_enabled_p ())
7007 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7008 : : "reduction chain includes unsupported"
7009 : : " statement type.\n");
7010 : 0 : return false;
7011 : : }
7012 : 57990 : if (CONVERT_EXPR_CODE_P (op.code))
7013 : : {
7014 : 3308 : if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
7015 : : {
7016 : 8 : if (dump_enabled_p ())
7017 : 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7018 : : "conversion in the reduction chain.\n");
7019 : 8 : return false;
7020 : : }
7021 : 3300 : vdef_slp = SLP_TREE_CHILDREN (vdef_slp)[0];
7022 : : }
7023 : : else
7024 : : {
7025 : : /* First non-conversion stmt. */
7026 : 54682 : if (!slp_for_stmt_info)
7027 : 47970 : slp_for_stmt_info = vdef_slp;
7028 : :
7029 : 54682 : if (lane_reducing_op_p (op.code))
7030 : : {
7031 : : /* The last operand of lane-reducing operation is for
7032 : : reduction. */
7033 : 456 : gcc_assert (reduc_idx > 0 && reduc_idx == (int) op.num_ops - 1);
7034 : :
7035 : 456 : slp_tree op_node = SLP_TREE_CHILDREN (vdef_slp)[0];
7036 : 456 : tree vectype_op = SLP_TREE_VECTYPE (op_node);
7037 : 456 : tree type_op = TREE_TYPE (op.ops[0]);
7038 : 456 : if (!vectype_op)
7039 : : {
7040 : 9 : vectype_op = get_vectype_for_scalar_type (loop_vinfo,
7041 : : type_op);
7042 : 9 : if (!vectype_op
7043 : 9 : || !vect_maybe_update_slp_op_vectype (op_node,
7044 : : vectype_op))
7045 : 0 : return false;
7046 : : }
7047 : :
7048 : : /* To accommodate lane-reducing operations of mixed input
7049 : : vectypes, choose input vectype with the least lanes for the
7050 : : reduction PHI statement, which would result in the most
7051 : : ncopies for vectorized reduction results. */
7052 : 456 : if (!vectype_in
7053 : 456 : || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7054 : 46 : < GET_MODE_SIZE (SCALAR_TYPE_MODE (type_op))))
7055 : 433 : vectype_in = vectype_op;
7056 : : }
7057 : 54226 : else if (!vectype_in)
7058 : 47537 : vectype_in = SLP_TREE_VECTYPE (slp_node);
7059 : 54682 : vdef_slp = SLP_TREE_CHILDREN (vdef_slp)[reduc_idx];
7060 : : }
7061 : 57982 : reduc_chain_length++;
7062 : : }
7063 : 47970 : stmt_info = SLP_TREE_REPRESENTATIVE (slp_for_stmt_info);
7064 : :
7065 : : /* PHIs should not participate in patterns. */
7066 : 47970 : gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7067 : :
7068 : : /* 1. Is vectorizable reduction? */
7069 : : /* Not supportable if the reduction variable is used in the loop, unless
7070 : : it's a reduction chain. */
7071 : 47970 : if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
7072 : 0 : && !reduc_chain)
7073 : : return false;
7074 : :
7075 : : /* Reductions that are not used even in an enclosing outer-loop,
7076 : : are expected to be "live" (used out of the loop). */
7077 : 47970 : if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
7078 : 0 : && !STMT_VINFO_LIVE_P (stmt_info))
7079 : : return false;
7080 : :
7081 : : /* 2. Has this been recognized as a reduction pattern?
7082 : :
7083 : : Check if STMT represents a pattern that has been recognized
7084 : : in earlier analysis stages. For stmts that represent a pattern,
7085 : : the STMT_VINFO_RELATED_STMT field records the last stmt in
7086 : : the original sequence that constitutes the pattern. */
7087 : :
7088 : 47970 : stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7089 : 47970 : if (orig_stmt_info)
7090 : : {
7091 : 3247 : gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
7092 : 3247 : gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
7093 : : }
7094 : :
7095 : : /* 3. Check the operands of the operation. The first operands are defined
7096 : : inside the loop body. The last operand is the reduction variable,
7097 : : which is defined by the loop-header-phi. */
7098 : :
7099 : 47970 : tree vectype_out = SLP_TREE_VECTYPE (slp_for_stmt_info);
7100 : 47970 : VECT_REDUC_INFO_VECTYPE (reduc_info) = vectype_out;
7101 : :
7102 : 47970 : gimple_match_op op;
7103 : 47970 : if (!gimple_extract_op (stmt_info->stmt, &op))
7104 : 0 : gcc_unreachable ();
7105 : 47970 : bool lane_reducing = lane_reducing_op_p (op.code);
7106 : :
7107 : 47970 : if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7108 : 15140 : && !SCALAR_FLOAT_TYPE_P (op.type))
7109 : : return false;
7110 : :
7111 : : /* Do not try to vectorize bit-precision reductions. */
7112 : 47970 : if (!type_has_mode_precision_p (op.type)
7113 : 1477 : && op.code != BIT_AND_EXPR
7114 : 1411 : && op.code != BIT_IOR_EXPR
7115 : 48405 : && op.code != BIT_XOR_EXPR)
7116 : : return false;
7117 : :
7118 : : /* Lane-reducing ops also never can be used in a SLP reduction group
7119 : : since we'll mix lanes belonging to different reductions. But it's
7120 : : OK to use them in a reduction chain or when the reduction group
7121 : : has just one element. */
7122 : 47660 : if (lane_reducing
7123 : 47660 : && !reduc_chain
7124 : 406 : && SLP_TREE_LANES (slp_node) > 1)
7125 : : {
7126 : 0 : if (dump_enabled_p ())
7127 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7128 : : "lane-reducing reduction in reduction group.\n");
7129 : 0 : return false;
7130 : : }
7131 : :
7132 : : /* All uses but the last are expected to be defined in the loop.
7133 : : The last use is the reduction variable. In case of nested cycle this
7134 : : assumption is not true: we use reduc_index to record the index of the
7135 : : reduction variable. */
7136 : 47660 : slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7137 : 47660 : tree *vectype_op = XALLOCAVEC (tree, op.num_ops);
7138 : 47660 : gcc_assert (op.code != COND_EXPR || !COMPARISON_CLASS_P (op.ops[0]));
7139 : 151420 : for (i = 0; i < (int) op.num_ops; i++)
7140 : : {
7141 : : /* The condition of COND_EXPR is checked in vectorizable_condition(). */
7142 : 103760 : if (i == 0 && op.code == COND_EXPR)
7143 : 52055 : continue;
7144 : :
7145 : 102969 : stmt_vec_info def_stmt_info;
7146 : 102969 : enum vect_def_type dt;
7147 : 102969 : if (!vect_is_simple_use (loop_vinfo, slp_for_stmt_info,
7148 : : i, &op.ops[i], &slp_op[i], &dt,
7149 : 102969 : &vectype_op[i], &def_stmt_info))
7150 : : {
7151 : 0 : if (dump_enabled_p ())
7152 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7153 : : "use not simple.\n");
7154 : 0 : return false;
7155 : : }
7156 : :
7157 : : /* Skip reduction operands, and for an IFN_COND_OP we might hit the
7158 : : reduction operand twice (once as definition, once as else). */
7159 : 102969 : if (SLP_TREE_CHILDREN (slp_for_stmt_info)[i]
7160 : 205938 : == SLP_TREE_CHILDREN
7161 : 102969 : (slp_for_stmt_info)[SLP_TREE_REDUC_IDX (slp_for_stmt_info)])
7162 : 51264 : continue;
7163 : :
7164 : : /* There should be only one cycle def in the stmt, the one
7165 : : leading to reduc_def. */
7166 : 51705 : if (SLP_TREE_CHILDREN (slp_for_stmt_info)[i]->cycle_info.id != -1)
7167 : : return false;
7168 : :
7169 : 51705 : if (!vectype_op[i])
7170 : 4427 : vectype_op[i]
7171 : 4427 : = get_vectype_for_scalar_type (loop_vinfo,
7172 : 4427 : TREE_TYPE (op.ops[i]), slp_op[i]);
7173 : :
7174 : : /* Record how the non-reduction-def value of COND_EXPR is defined.
7175 : : ??? For a chain of multiple CONDs we'd have to match them up all. */
7176 : 51705 : if (op.code == COND_EXPR && reduc_chain_length == 1)
7177 : : {
7178 : 768 : if (dt == vect_constant_def)
7179 : : {
7180 : 95 : cond_reduc_dt = dt;
7181 : 95 : cond_reduc_val = op.ops[i];
7182 : : }
7183 : 673 : else if (dt == vect_induction_def
7184 : 407 : && def_stmt_info
7185 : 1080 : && is_nonwrapping_integer_induction (def_stmt_info, loop))
7186 : : {
7187 : 108 : cond_reduc_dt = dt;
7188 : 108 : cond_stmt_vinfo = def_stmt_info;
7189 : : }
7190 : : }
7191 : : }
7192 : :
7193 : 47660 : enum vect_reduction_type reduction_type = VECT_REDUC_INFO_TYPE (reduc_info);
7194 : : /* If we have a condition reduction, see if we can simplify it further. */
7195 : 47660 : if (reduction_type == COND_REDUCTION)
7196 : : {
7197 : 779 : if (SLP_TREE_LANES (slp_node) != 1)
7198 : : return false;
7199 : :
7200 : : /* When the condition uses the reduction value in the condition, fail. */
7201 : 755 : if (SLP_TREE_REDUC_IDX (slp_node) == 0)
7202 : : {
7203 : 0 : if (dump_enabled_p ())
7204 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7205 : : "condition depends on previous iteration\n");
7206 : 0 : return false;
7207 : : }
7208 : :
7209 : 755 : if (reduc_chain_length == 1
7210 : 755 : && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in,
7211 : : OPTIMIZE_FOR_SPEED)
7212 : 732 : || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
7213 : : vectype_in,
7214 : : OPTIMIZE_FOR_SPEED)))
7215 : : {
7216 : 0 : if (dump_enabled_p ())
7217 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7218 : : "optimizing condition reduction with"
7219 : : " FOLD_EXTRACT_LAST.\n");
7220 : 0 : VECT_REDUC_INFO_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7221 : : }
7222 : 755 : else if (cond_reduc_dt == vect_induction_def)
7223 : : {
7224 : 108 : tree base
7225 : : = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7226 : 108 : tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7227 : :
7228 : 108 : gcc_assert (TREE_CODE (base) == INTEGER_CST
7229 : : && TREE_CODE (step) == INTEGER_CST);
7230 : 108 : cond_reduc_val = NULL_TREE;
7231 : 108 : enum tree_code cond_reduc_op_code = ERROR_MARK;
7232 : 108 : tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7233 : 108 : if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7234 : : ;
7235 : : /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
7236 : : above base; punt if base is the minimum value of the type for
7237 : : MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
7238 : 96 : else if (tree_int_cst_sgn (step) == -1)
7239 : : {
7240 : 18 : cond_reduc_op_code = MIN_EXPR;
7241 : 18 : if (tree_int_cst_sgn (base) == -1)
7242 : 0 : cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7243 : 18 : else if (tree_int_cst_lt (base,
7244 : 18 : TYPE_MAX_VALUE (TREE_TYPE (base))))
7245 : 18 : cond_reduc_val
7246 : 18 : = int_const_binop (PLUS_EXPR, base, integer_one_node);
7247 : : }
7248 : : else
7249 : : {
7250 : 78 : cond_reduc_op_code = MAX_EXPR;
7251 : 78 : if (tree_int_cst_sgn (base) == 1)
7252 : 0 : cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7253 : 78 : else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7254 : : base))
7255 : 78 : cond_reduc_val
7256 : 78 : = int_const_binop (MINUS_EXPR, base, integer_one_node);
7257 : : }
7258 : 96 : if (cond_reduc_val)
7259 : : {
7260 : 96 : if (dump_enabled_p ())
7261 : 60 : dump_printf_loc (MSG_NOTE, vect_location,
7262 : : "condition expression based on "
7263 : : "integer induction.\n");
7264 : 96 : VECT_REDUC_INFO_CODE (reduc_info) = cond_reduc_op_code;
7265 : 96 : VECT_REDUC_INFO_INDUC_COND_INITIAL_VAL (reduc_info)
7266 : 96 : = cond_reduc_val;
7267 : 96 : VECT_REDUC_INFO_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7268 : : }
7269 : : }
7270 : 647 : else if (cond_reduc_dt == vect_constant_def)
7271 : : {
7272 : 85 : enum vect_def_type cond_initial_dt;
7273 : 85 : tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7274 : 85 : vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7275 : 85 : if (cond_initial_dt == vect_constant_def
7276 : 107 : && types_compatible_p (TREE_TYPE (cond_initial_val),
7277 : 22 : TREE_TYPE (cond_reduc_val)))
7278 : : {
7279 : 22 : tree e = fold_binary (LE_EXPR, boolean_type_node,
7280 : : cond_initial_val, cond_reduc_val);
7281 : 22 : if (e && (integer_onep (e) || integer_zerop (e)))
7282 : : {
7283 : 22 : if (dump_enabled_p ())
7284 : 16 : dump_printf_loc (MSG_NOTE, vect_location,
7285 : : "condition expression based on "
7286 : : "compile time constant.\n");
7287 : : /* Record reduction code at analysis stage. */
7288 : 22 : VECT_REDUC_INFO_CODE (reduc_info)
7289 : 22 : = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7290 : 22 : VECT_REDUC_INFO_TYPE (reduc_info) = CONST_COND_REDUCTION;
7291 : : }
7292 : : }
7293 : : }
7294 : : }
7295 : :
7296 : 47636 : if (STMT_VINFO_LIVE_P (phi_info))
7297 : : return false;
7298 : :
7299 : 47636 : ncopies = vect_get_num_copies (loop_vinfo, slp_node);
7300 : :
7301 : 47636 : gcc_assert (ncopies >= 1);
7302 : :
7303 : 47636 : poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7304 : :
7305 : : /* 4.2. Check support for the epilog operation.
7306 : :
7307 : : If STMT represents a reduction pattern, then the type of the
7308 : : reduction variable may be different than the type of the rest
7309 : : of the arguments. For example, consider the case of accumulation
7310 : : of shorts into an int accumulator; The original code:
7311 : : S1: int_a = (int) short_a;
7312 : : orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
7313 : :
7314 : : was replaced with:
7315 : : STMT: int_acc = widen_sum <short_a, int_acc>
7316 : :
7317 : : This means that:
7318 : : 1. The tree-code that is used to create the vector operation in the
7319 : : epilog code (that reduces the partial results) is not the
7320 : : tree-code of STMT, but is rather the tree-code of the original
7321 : : stmt from the pattern that STMT is replacing. I.e, in the example
7322 : : above we want to use 'widen_sum' in the loop, but 'plus' in the
7323 : : epilog.
7324 : : 2. The type (mode) we use to check available target support
7325 : : for the vector operation to be created in the *epilog*, is
7326 : : determined by the type of the reduction variable (in the example
7327 : : above we'd check this: optab_handler (plus_optab, vect_int_mode])).
7328 : : However the type (mode) we use to check available target support
7329 : : for the vector operation to be created *inside the loop*, is
7330 : : determined by the type of the other arguments to STMT (in the
7331 : : example we'd check this: optab_handler (widen_sum_optab,
7332 : : vect_short_mode)).
7333 : :
7334 : : This is contrary to "regular" reductions, in which the types of all
7335 : : the arguments are the same as the type of the reduction variable.
7336 : : For "regular" reductions we can therefore use the same vector type
7337 : : (and also the same tree-code) when generating the epilog code and
7338 : : when generating the code inside the loop. */
7339 : :
7340 : 47636 : code_helper orig_code = VECT_REDUC_INFO_CODE (reduc_info);
7341 : :
7342 : : /* If conversion might have created a conditional operation like
7343 : : IFN_COND_ADD already. Use the internal code for the following checks. */
7344 : 47636 : if (orig_code.is_internal_fn ())
7345 : : {
7346 : 3660 : tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
7347 : 3660 : orig_code = new_code != ERROR_MARK ? new_code : orig_code;
7348 : : }
7349 : :
7350 : 47636 : VECT_REDUC_INFO_CODE (reduc_info) = orig_code;
7351 : :
7352 : 47636 : reduction_type = VECT_REDUC_INFO_TYPE (reduc_info);
7353 : 47636 : if (reduction_type == TREE_CODE_REDUCTION)
7354 : : {
7355 : : /* Check whether it's ok to change the order of the computation.
7356 : : Generally, when vectorizing a reduction we change the order of the
7357 : : computation. This may change the behavior of the program in some
7358 : : cases, so we need to check that this is ok. One exception is when
7359 : : vectorizing an outer-loop: the inner-loop is executed sequentially,
7360 : : and therefore vectorizing reductions in the inner-loop during
7361 : : outer-loop vectorization is safe. Likewise when we are vectorizing
7362 : : a series of reductions using SLP and the VF is one the reductions
7363 : : are performed in scalar order. */
7364 : 46881 : if (!reduc_chain
7365 : 46881 : && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7366 : : ;
7367 : 46739 : else if (needs_fold_left_reduction_p (op.type, orig_code))
7368 : : {
7369 : : /* When vectorizing a reduction chain w/o SLP the reduction PHI
7370 : : is not directy used in stmt. */
7371 : 4793 : if (reduc_chain_length != 1)
7372 : : {
7373 : 67 : if (dump_enabled_p ())
7374 : 12 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7375 : : "in-order reduction chain without SLP.\n");
7376 : 67 : return false;
7377 : : }
7378 : : /* Code generation doesn't support function calls other
7379 : : than .COND_*. */
7380 : 4726 : if (!op.code.is_tree_code ()
7381 : 4840 : && !(op.code.is_internal_fn ()
7382 : 57 : && conditional_internal_fn_code (internal_fn (op.code))
7383 : : != ERROR_MARK))
7384 : : {
7385 : 10 : if (dump_enabled_p ())
7386 : 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7387 : : "in-order reduction chain operation not "
7388 : : "supported.\n");
7389 : 10 : return false;
7390 : : }
7391 : 4716 : VECT_REDUC_INFO_TYPE (reduc_info)
7392 : 4716 : = reduction_type = FOLD_LEFT_REDUCTION;
7393 : : }
7394 : 41946 : else if (!commutative_binary_op_p (orig_code, op.type)
7395 : 41946 : || !associative_binary_op_p (orig_code, op.type))
7396 : : {
7397 : 152 : if (dump_enabled_p ())
7398 : 20 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7399 : : "reduction: not commutative/associative\n");
7400 : 152 : return false;
7401 : : }
7402 : : }
7403 : :
7404 : 4716 : if ((reduction_type == COND_REDUCTION
7405 : : || reduction_type == INTEGER_INDUC_COND_REDUCTION
7406 : : || reduction_type == CONST_COND_REDUCTION
7407 : 42691 : || reduction_type == EXTRACT_LAST_REDUCTION)
7408 : : && 1
7409 : 755 : && ncopies > 1)
7410 : : {
7411 : 276 : if (dump_enabled_p ())
7412 : 60 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7413 : : "multiple types in condition reduction.\n");
7414 : 276 : return false;
7415 : : }
7416 : :
7417 : : /* See if we can convert a mask vector to a corresponding bool data vector
7418 : : to perform the epilogue reduction. */
7419 : 47131 : tree alt_vectype_out = NULL_TREE;
7420 : 47131 : if (VECTOR_BOOLEAN_TYPE_P (vectype_out))
7421 : : {
7422 : 944 : alt_vectype_out
7423 : 1888 : = get_related_vectype_for_scalar_type (loop_vinfo->vector_mode,
7424 : 944 : TREE_TYPE (vectype_out),
7425 : : TYPE_VECTOR_SUBPARTS
7426 : : (vectype_out));
7427 : 944 : if (!alt_vectype_out
7428 : 944 : || maybe_ne (TYPE_VECTOR_SUBPARTS (alt_vectype_out),
7429 : 1873 : TYPE_VECTOR_SUBPARTS (vectype_out))
7430 : 1888 : || !expand_vec_cond_expr_p (alt_vectype_out, vectype_out))
7431 : 15 : alt_vectype_out = NULL_TREE;
7432 : : }
7433 : :
7434 : 47131 : internal_fn reduc_fn = IFN_LAST;
7435 : 47131 : if (reduction_type == TREE_CODE_REDUCTION
7436 : 47131 : || reduction_type == FOLD_LEFT_REDUCTION
7437 : : || reduction_type == INTEGER_INDUC_COND_REDUCTION
7438 : 479 : || reduction_type == CONST_COND_REDUCTION)
7439 : : {
7440 : 42046 : if (reduction_type == FOLD_LEFT_REDUCTION
7441 : 50692 : ? fold_left_reduction_fn (orig_code, &reduc_fn)
7442 : 42046 : : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7443 : : {
7444 : 46086 : internal_fn sbool_fn = IFN_LAST;
7445 : 46086 : if (reduc_fn == IFN_LAST)
7446 : : ;
7447 : 44268 : else if ((!VECTOR_BOOLEAN_TYPE_P (vectype_out)
7448 : 944 : || (GET_MODE_CLASS (TYPE_MODE (vectype_out))
7449 : : == MODE_VECTOR_BOOL))
7450 : 87592 : && direct_internal_fn_supported_p (reduc_fn, vectype_out,
7451 : : OPTIMIZE_FOR_SPEED))
7452 : : ;
7453 : 10105 : else if (VECTOR_BOOLEAN_TYPE_P (vectype_out)
7454 : 944 : && sbool_reduction_fn_for_fn (reduc_fn, &sbool_fn)
7455 : 11049 : && direct_internal_fn_supported_p (sbool_fn, vectype_out,
7456 : : OPTIMIZE_FOR_SPEED))
7457 : 65 : reduc_fn = sbool_fn;
7458 : 10040 : else if (reduction_type != FOLD_LEFT_REDUCTION
7459 : 10040 : && alt_vectype_out
7460 : 10040 : && direct_internal_fn_supported_p (reduc_fn, alt_vectype_out,
7461 : : OPTIMIZE_FOR_SPEED))
7462 : 714 : VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info) = alt_vectype_out;
7463 : : else
7464 : : {
7465 : 9326 : if (dump_enabled_p ())
7466 : 797 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7467 : : "reduc op not supported by target.\n");
7468 : :
7469 : 9326 : reduc_fn = IFN_LAST;
7470 : : }
7471 : : }
7472 : : else
7473 : : {
7474 : 676 : if (dump_enabled_p ())
7475 : 48 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7476 : : "no reduc code for scalar code.\n");
7477 : :
7478 : 676 : return false;
7479 : : }
7480 : 46086 : if (reduc_fn == IFN_LAST
7481 : 46086 : && VECTOR_BOOLEAN_TYPE_P (vectype_out))
7482 : : {
7483 : 165 : if (!alt_vectype_out)
7484 : : {
7485 : 8 : if (dump_enabled_p ())
7486 : 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7487 : : "cannot turn mask into bool data vector for "
7488 : : "reduction epilogue.\n");
7489 : 8 : return false;
7490 : : }
7491 : 157 : VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info) = alt_vectype_out;
7492 : : }
7493 : : }
7494 : 369 : else if (reduction_type == COND_REDUCTION)
7495 : : {
7496 : 369 : int scalar_precision
7497 : 369 : = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
7498 : 369 : cr_index_scalar_type = make_unsigned_type (scalar_precision);
7499 : 369 : cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7500 : : vectype_out);
7501 : :
7502 : 369 : if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7503 : : OPTIMIZE_FOR_SPEED))
7504 : 8 : reduc_fn = IFN_REDUC_MAX;
7505 : : }
7506 : 46447 : VECT_REDUC_INFO_FN (reduc_info) = reduc_fn;
7507 : :
7508 : 46447 : if (reduction_type != EXTRACT_LAST_REDUCTION
7509 : : && reduc_fn == IFN_LAST
7510 : : && !nunits_out.is_constant ())
7511 : : {
7512 : : if (dump_enabled_p ())
7513 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7514 : : "missing target support for reduction on"
7515 : : " variable-length vectors.\n");
7516 : : return false;
7517 : : }
7518 : :
7519 : : /* For SLP reductions, see if there is a neutral value we can use. */
7520 : 46447 : tree neutral_op = NULL_TREE;
7521 : 46447 : tree initial_value = NULL_TREE;
7522 : 46447 : if (reduc_chain)
7523 : 1364 : initial_value = vect_phi_initial_value (reduc_def_phi);
7524 : 46447 : neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7525 : : orig_code, initial_value);
7526 : :
7527 : 46447 : if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7528 : : {
7529 : : /* We can't support in-order reductions of code such as this:
7530 : :
7531 : : for (int i = 0; i < n1; ++i)
7532 : : for (int j = 0; j < n2; ++j)
7533 : : l += a[j];
7534 : :
7535 : : since GCC effectively transforms the loop when vectorizing:
7536 : :
7537 : : for (int i = 0; i < n1 / VF; ++i)
7538 : : for (int j = 0; j < n2; ++j)
7539 : : for (int k = 0; k < VF; ++k)
7540 : : l += a[j];
7541 : :
7542 : : which is a reassociation of the original operation. */
7543 : 56 : if (dump_enabled_p ())
7544 : 20 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7545 : : "in-order double reduction not supported.\n");
7546 : :
7547 : 56 : return false;
7548 : : }
7549 : :
7550 : 46391 : if (reduction_type == FOLD_LEFT_REDUCTION
7551 : 3984 : && SLP_TREE_LANES (slp_node) > 1
7552 : 119 : && !reduc_chain)
7553 : : {
7554 : : /* We cannot use in-order reductions in this case because there is
7555 : : an implicit reassociation of the operations involved. */
7556 : 57 : if (dump_enabled_p ())
7557 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7558 : : "in-order unchained SLP reductions not supported.\n");
7559 : 57 : return false;
7560 : : }
7561 : :
7562 : : /* For double reductions, and for SLP reductions with a neutral value,
7563 : : we construct a variable-length initial vector by loading a vector
7564 : : full of the neutral value and then shift-and-inserting the start
7565 : : values into the low-numbered elements. */
7566 : 46334 : if ((double_reduc || neutral_op)
7567 : : && !nunits_out.is_constant ()
7568 : : && (SLP_TREE_LANES (slp_node) != 1 && !reduc_chain)
7569 : : && (!neutral_op
7570 : : || !operand_equal_p (neutral_op,
7571 : : vect_phi_initial_value (reduc_def_phi)))
7572 : : && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7573 : : vectype_out, OPTIMIZE_FOR_SPEED))
7574 : : {
7575 : : if (dump_enabled_p ())
7576 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7577 : : "reduction on variable-length vectors requires"
7578 : : " target support for a vector-shift-and-insert"
7579 : : " operation.\n");
7580 : : return false;
7581 : : }
7582 : :
7583 : : /* Check extra constraints for variable-length unchained SLP reductions. */
7584 : 46334 : if (!reduc_chain
7585 : : && !nunits_out.is_constant ())
7586 : : {
7587 : : /* We checked above that we could build the initial vector when
7588 : : there's a neutral element value. Check here for the case in
7589 : : which each SLP statement has its own initial value and in which
7590 : : that value needs to be repeated for every instance of the
7591 : : statement within the initial vector. */
7592 : : unsigned int group_size = SLP_TREE_LANES (slp_node);
7593 : : if (!neutral_op
7594 : : && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
7595 : : TREE_TYPE (vectype_out)))
7596 : : {
7597 : : if (dump_enabled_p ())
7598 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7599 : : "unsupported form of SLP reduction for"
7600 : : " variable-length vectors: cannot build"
7601 : : " initial vector.\n");
7602 : : return false;
7603 : : }
7604 : : /* The epilogue code relies on the number of elements being a multiple
7605 : : of the group size. The duplicate-and-interleave approach to setting
7606 : : up the initial vector does too. */
7607 : : if (!multiple_p (nunits_out, group_size))
7608 : : {
7609 : : if (dump_enabled_p ())
7610 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7611 : : "unsupported form of SLP reduction for"
7612 : : " variable-length vectors: the vector size"
7613 : : " is not a multiple of the number of results.\n");
7614 : : return false;
7615 : : }
7616 : : }
7617 : :
7618 : 46334 : if (reduction_type == COND_REDUCTION)
7619 : : {
7620 : 369 : widest_int ni;
7621 : :
7622 : 369 : if (! max_loop_iterations (loop, &ni))
7623 : : {
7624 : 0 : if (dump_enabled_p ())
7625 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
7626 : : "loop count not known, cannot create cond "
7627 : : "reduction.\n");
7628 : 0 : return false;
7629 : : }
7630 : : /* Convert backedges to iterations. */
7631 : 369 : ni += 1;
7632 : :
7633 : : /* The additional index will be the same type as the condition. Check
7634 : : that the loop can fit into this less one (because we'll use up the
7635 : : zero slot for when there are no matches). */
7636 : 369 : tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7637 : 369 : if (wi::geu_p (ni, wi::to_widest (max_index)))
7638 : : {
7639 : 90 : if (dump_enabled_p ())
7640 : 54 : dump_printf_loc (MSG_NOTE, vect_location,
7641 : : "loop size is greater than data size.\n");
7642 : 90 : return false;
7643 : : }
7644 : 369 : }
7645 : :
7646 : : /* In case the vectorization factor (VF) is bigger than the number
7647 : : of elements that we can fit in a vectype (nunits), we have to generate
7648 : : more than one vector stmt - i.e - we need to "unroll" the
7649 : : vector stmt by a factor VF/nunits. For more details see documentation
7650 : : in vectorizable_operation. */
7651 : :
7652 : : /* If the reduction is used in an outer loop we need to generate
7653 : : VF intermediate results, like so (e.g. for ncopies=2):
7654 : : r0 = phi (init, r0)
7655 : : r1 = phi (init, r1)
7656 : : r0 = x0 + r0;
7657 : : r1 = x1 + r1;
7658 : : (i.e. we generate VF results in 2 registers).
7659 : : In this case we have a separate def-use cycle for each copy, and therefore
7660 : : for each copy we get the vector def for the reduction variable from the
7661 : : respective phi node created for this copy.
7662 : :
7663 : : Otherwise (the reduction is unused in the loop nest), we can combine
7664 : : together intermediate results, like so (e.g. for ncopies=2):
7665 : : r = phi (init, r)
7666 : : r = x0 + r;
7667 : : r = x1 + r;
7668 : : (i.e. we generate VF/2 results in a single register).
7669 : : In this case for each copy we get the vector def for the reduction variable
7670 : : from the vectorized reduction operation generated in the previous iteration.
7671 : :
7672 : : This only works when we see both the reduction PHI and its only consumer
7673 : : in vectorizable_reduction and there are no intermediate stmts
7674 : : participating. When unrolling we want each unrolled iteration to have its
7675 : : own reduction accumulator since one of the main goals of unrolling a
7676 : : reduction is to reduce the aggregate loop-carried latency. */
7677 : 46244 : if (ncopies > 1
7678 : 46244 : && !reduc_chain
7679 : 5374 : && SLP_TREE_LANES (slp_node) == 1
7680 : 5231 : && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7681 : 5212 : && reduc_chain_length == 1
7682 : 4911 : && loop_vinfo->suggested_unroll_factor == 1)
7683 : 46244 : single_defuse_cycle = true;
7684 : :
7685 : 46244 : if (single_defuse_cycle && !lane_reducing)
7686 : : {
7687 : 4342 : gcc_assert (op.code != COND_EXPR);
7688 : :
7689 : : /* 4. check support for the operation in the loop
7690 : :
7691 : : This isn't necessary for the lane reduction codes, since they
7692 : : can only be produced by pattern matching, and it's up to the
7693 : : pattern matcher to test for support. The main reason for
7694 : : specifically skipping this step is to avoid rechecking whether
7695 : : mixed-sign dot-products can be implemented using signed
7696 : : dot-products. */
7697 : 4342 : machine_mode vec_mode = TYPE_MODE (vectype_in);
7698 : 4342 : if (!directly_supported_p (op.code, vectype_in, optab_vector))
7699 : : {
7700 : 697 : if (dump_enabled_p ())
7701 : 10 : dump_printf (MSG_NOTE, "op not supported by target.\n");
7702 : 1394 : if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7703 : 697 : || !vect_can_vectorize_without_simd_p (op.code))
7704 : : single_defuse_cycle = false;
7705 : : else
7706 : 5 : if (dump_enabled_p ())
7707 : 0 : dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7708 : : }
7709 : :
7710 : 4342 : if (vect_emulated_vector_p (vectype_in)
7711 : 4342 : && !vect_can_vectorize_without_simd_p (op.code))
7712 : : {
7713 : 0 : if (dump_enabled_p ())
7714 : 0 : dump_printf (MSG_NOTE, "using word mode not possible.\n");
7715 : 0 : return false;
7716 : : }
7717 : : }
7718 : 46244 : if (dump_enabled_p () && single_defuse_cycle)
7719 : 630 : dump_printf_loc (MSG_NOTE, vect_location,
7720 : : "using single def-use cycle for reduction by reducing "
7721 : : "multiple vectors to one in the loop body\n");
7722 : 46244 : VECT_REDUC_INFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7723 : :
7724 : : /* For lane-reducing operation, the below processing related to single
7725 : : defuse-cycle will be done in its own vectorizable function. One more
7726 : : thing to note is that the operation must not be involved in fold-left
7727 : : reduction. */
7728 : 46244 : single_defuse_cycle &= !lane_reducing;
7729 : :
7730 : 46244 : if (single_defuse_cycle || reduction_type == FOLD_LEFT_REDUCTION)
7731 : 24428 : for (i = 0; i < (int) op.num_ops; i++)
7732 : 16908 : if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
7733 : : {
7734 : 0 : if (dump_enabled_p ())
7735 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7736 : : "incompatible vector types for invariants\n");
7737 : 0 : return false;
7738 : : }
7739 : :
7740 : 46244 : vect_model_reduction_cost (loop_vinfo, slp_for_stmt_info, reduc_fn,
7741 : : reduction_type, ncopies, cost_vec);
7742 : : /* Cost the reduction op inside the loop if transformed via
7743 : : vect_transform_reduction for non-lane-reducing operation. Otherwise
7744 : : this is costed by the separate vectorizable_* routines. */
7745 : 46244 : if (single_defuse_cycle)
7746 : 3650 : record_stmt_cost (cost_vec, ncopies, vector_stmt,
7747 : : slp_for_stmt_info, 0, vect_body);
7748 : :
7749 : 46244 : if (dump_enabled_p ()
7750 : 46244 : && reduction_type == FOLD_LEFT_REDUCTION)
7751 : 212 : dump_printf_loc (MSG_NOTE, vect_location,
7752 : : "using an in-order (fold-left) reduction.\n");
7753 : 46244 : SLP_TREE_TYPE (slp_node) = cycle_phi_info_type;
7754 : :
7755 : : /* All but single defuse-cycle optimized and fold-left reductions go
7756 : : through their own vectorizable_* routines. */
7757 : 46244 : stmt_vec_info tem
7758 : 46244 : = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (slp_node_instance));
7759 : 46244 : if (!single_defuse_cycle && reduction_type != FOLD_LEFT_REDUCTION)
7760 : 38724 : STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7761 : : else
7762 : : {
7763 : 7520 : STMT_VINFO_DEF_TYPE (tem) = vect_reduction_def;
7764 : 7520 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7765 : 3349 : vect_reduction_update_partial_vector_usage (loop_vinfo, reduc_info,
7766 : : slp_node, op.code, op.type,
7767 : : vectype_in);
7768 : : }
7769 : : return true;
7770 : : }
7771 : :
7772 : : /* STMT_INFO is a dot-product reduction whose multiplication operands
7773 : : have different signs. Emit a sequence to emulate the operation
7774 : : using a series of signed DOT_PROD_EXPRs and return the last
7775 : : statement generated. VEC_DEST is the result of the vector operation
7776 : : and VOP lists its inputs. */
7777 : :
7778 : : static gassign *
7779 : 4 : vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
7780 : : gimple_stmt_iterator *gsi, tree vec_dest,
7781 : : tree vop[3])
7782 : : {
7783 : 4 : tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
7784 : 4 : tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
7785 : 4 : tree narrow_elttype = TREE_TYPE (narrow_vectype);
7786 : 4 : gimple *new_stmt;
7787 : :
7788 : : /* Make VOP[0] the unsigned operand VOP[1] the signed operand. */
7789 : 4 : if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
7790 : 0 : std::swap (vop[0], vop[1]);
7791 : :
7792 : : /* Convert all inputs to signed types. */
7793 : 16 : for (int i = 0; i < 3; ++i)
7794 : 12 : if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
7795 : : {
7796 : 4 : tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
7797 : 4 : new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
7798 : 4 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7799 : 4 : vop[i] = tmp;
7800 : : }
7801 : :
7802 : : /* In the comments below we assume 8-bit inputs for simplicity,
7803 : : but the approach works for any full integer type. */
7804 : :
7805 : : /* Create a vector of -128. */
7806 : 4 : tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
7807 : 4 : tree min_narrow = build_vector_from_val (narrow_vectype,
7808 : : min_narrow_elttype);
7809 : :
7810 : : /* Create a vector of 64. */
7811 : 4 : auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
7812 : 4 : tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
7813 : 4 : half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
7814 : :
7815 : : /* Emit: SUB_RES = VOP[0] - 128. */
7816 : 4 : tree sub_res = make_ssa_name (narrow_vectype);
7817 : 4 : new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
7818 : 4 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7819 : :
7820 : : /* Emit:
7821 : :
7822 : : STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
7823 : : STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
7824 : : STAGE3 = DOT_PROD_EXPR <SUB_RES, -128, STAGE2>;
7825 : :
7826 : : on the basis that x * y == (x - 128) * y + 64 * y + 64 * y
7827 : : Doing the two 64 * y steps first allows more time to compute x. */
7828 : 4 : tree stage1 = make_ssa_name (wide_vectype);
7829 : 4 : new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
7830 : : vop[1], half_narrow, vop[2]);
7831 : 4 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7832 : :
7833 : 4 : tree stage2 = make_ssa_name (wide_vectype);
7834 : 4 : new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
7835 : : vop[1], half_narrow, stage1);
7836 : 4 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7837 : :
7838 : 4 : tree stage3 = make_ssa_name (wide_vectype);
7839 : 4 : new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
7840 : : sub_res, vop[1], stage2);
7841 : 4 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7842 : :
7843 : : /* Convert STAGE3 to the reduction type. */
7844 : 4 : return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
7845 : 4 : }
7846 : :
7847 : : /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7848 : : value. */
7849 : :
7850 : : bool
7851 : 2552 : vect_transform_reduction (loop_vec_info loop_vinfo,
7852 : : stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
7853 : : slp_tree slp_node)
7854 : : {
7855 : 2552 : tree vectype_out = SLP_TREE_VECTYPE (slp_node);
7856 : 2552 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7857 : 2552 : unsigned vec_num;
7858 : :
7859 : 2552 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
7860 : :
7861 : 2552 : if (nested_in_vect_loop_p (loop, stmt_info))
7862 : : {
7863 : 0 : loop = loop->inner;
7864 : 0 : gcc_assert (VECT_REDUC_INFO_DEF_TYPE (reduc_info)
7865 : : == vect_double_reduction_def);
7866 : : }
7867 : :
7868 : 2552 : gimple_match_op op;
7869 : 2552 : if (!gimple_extract_op (stmt_info->stmt, &op))
7870 : 0 : gcc_unreachable ();
7871 : :
7872 : : /* All uses but the last are expected to be defined in the loop.
7873 : : The last use is the reduction variable. In case of nested cycle this
7874 : : assumption is not true: we use reduc_index to record the index of the
7875 : : reduction variable. */
7876 : 2552 : int reduc_index = SLP_TREE_REDUC_IDX (slp_node);
7877 : 2552 : tree vectype_in = SLP_TREE_VECTYPE (slp_node);
7878 : 2552 : if (lane_reducing_op_p (op.code))
7879 : 252 : vectype_in = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (slp_node)[0]);
7880 : :
7881 : 2552 : vec_num = vect_get_num_copies (loop_vinfo, SLP_TREE_CHILDREN (slp_node)[0]);
7882 : :
7883 : 2552 : code_helper code = canonicalize_code (op.code, op.type);
7884 : 2552 : internal_fn cond_fn
7885 : 468 : = ((code.is_internal_fn ()
7886 : 468 : && internal_fn_mask_index ((internal_fn)code) != -1)
7887 : 2552 : ? (internal_fn)code : get_conditional_internal_fn (code, op.type));
7888 : :
7889 : 2552 : vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7890 : 2552 : vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
7891 : 2552 : bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7892 : :
7893 : : /* Transform. */
7894 : 2552 : tree new_temp = NULL_TREE;
7895 : 17864 : auto_vec<tree> vec_oprnds[3];
7896 : :
7897 : 2552 : if (dump_enabled_p ())
7898 : 723 : dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7899 : :
7900 : : /* A binary COND_OP reduction must have the same definition and else
7901 : : value. */
7902 : 3020 : bool cond_fn_p = code.is_internal_fn ()
7903 : 468 : && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
7904 : 468 : if (cond_fn_p)
7905 : : {
7906 : 468 : gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
7907 : : || code == IFN_COND_MUL || code == IFN_COND_AND
7908 : : || code == IFN_COND_IOR || code == IFN_COND_XOR
7909 : : || code == IFN_COND_MIN || code == IFN_COND_MAX);
7910 : 468 : gcc_assert (op.num_ops == 4
7911 : : && (op.ops[reduc_index]
7912 : : == op.ops[internal_fn_else_index ((internal_fn) code)]));
7913 : : }
7914 : :
7915 : 2552 : bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7916 : :
7917 : 2552 : vect_reduction_type reduction_type = VECT_REDUC_INFO_TYPE (reduc_info);
7918 : 2552 : if (reduction_type == FOLD_LEFT_REDUCTION)
7919 : : {
7920 : 830 : internal_fn reduc_fn = VECT_REDUC_INFO_FN (reduc_info);
7921 : 830 : gcc_assert (code.is_tree_code () || cond_fn_p);
7922 : 830 : return vectorize_fold_left_reduction
7923 : 830 : (loop_vinfo, stmt_info, gsi, slp_node,
7924 : 830 : code, reduc_fn, op.num_ops, vectype_in,
7925 : 830 : reduc_index, masks, lens);
7926 : : }
7927 : :
7928 : 1722 : bool single_defuse_cycle = VECT_REDUC_INFO_FORCE_SINGLE_CYCLE (reduc_info);
7929 : 1722 : bool lane_reducing = lane_reducing_op_p (code);
7930 : 1470 : gcc_assert (single_defuse_cycle || lane_reducing);
7931 : :
7932 : 1722 : if (lane_reducing)
7933 : : {
7934 : : /* The last operand of lane-reducing op is for reduction. */
7935 : 252 : gcc_assert (reduc_index == (int) op.num_ops - 1);
7936 : : }
7937 : :
7938 : : /* Create the destination vector */
7939 : 1722 : tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
7940 : 1722 : tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7941 : :
7942 : : /* Get NCOPIES vector definitions for all operands except the reduction
7943 : : definition. */
7944 : 1722 : if (!cond_fn_p)
7945 : : {
7946 : 1269 : gcc_assert (reduc_index >= 0 && reduc_index <= 2);
7947 : 2110 : vect_get_vec_defs (loop_vinfo, slp_node,
7948 : 1269 : single_defuse_cycle && reduc_index == 0
7949 : : ? NULL_TREE : op.ops[0], &vec_oprnds[0],
7950 : 1269 : single_defuse_cycle && reduc_index == 1
7951 : : ? NULL_TREE : op.ops[1], &vec_oprnds[1],
7952 : 1269 : op.num_ops == 3
7953 : 252 : && !(single_defuse_cycle && reduc_index == 2)
7954 : : ? op.ops[2] : NULL_TREE, &vec_oprnds[2]);
7955 : : }
7956 : : else
7957 : : {
7958 : : /* For a conditional operation pass the truth type as mask
7959 : : vectype. */
7960 : 453 : gcc_assert (single_defuse_cycle
7961 : : && (reduc_index == 1 || reduc_index == 2));
7962 : 453 : vect_get_vec_defs (loop_vinfo, slp_node, op.ops[0],
7963 : : &vec_oprnds[0],
7964 : : reduc_index == 1 ? NULL_TREE : op.ops[1],
7965 : : &vec_oprnds[1],
7966 : : reduc_index == 2 ? NULL_TREE : op.ops[2],
7967 : : &vec_oprnds[2]);
7968 : : }
7969 : :
7970 : : /* For single def-use cycles get one copy of the vectorized reduction
7971 : : definition. */
7972 : 1722 : if (single_defuse_cycle)
7973 : : {
7974 : 1637 : vect_get_vec_defs (loop_vinfo, slp_node,
7975 : : reduc_index == 0 ? op.ops[0] : NULL_TREE,
7976 : : &vec_oprnds[0],
7977 : : reduc_index == 1 ? op.ops[1] : NULL_TREE,
7978 : : &vec_oprnds[1],
7979 : : reduc_index == 2 ? op.ops[2] : NULL_TREE,
7980 : : &vec_oprnds[2]);
7981 : : }
7982 : 85 : else if (lane_reducing)
7983 : : {
7984 : : /* For normal reduction, consistency between vectorized def/use is
7985 : : naturally ensured when mapping from scalar statement. But if lane-
7986 : : reducing op is involved in reduction, thing would become somewhat
7987 : : complicated in that the op's result and operand for accumulation are
7988 : : limited to less lanes than other operands, which certainly causes
7989 : : def/use mismatch on adjacent statements around the op if do not have
7990 : : any kind of specific adjustment. One approach is to refit lane-
7991 : : reducing op in the way of introducing new trivial pass-through copies
7992 : : to fix possible def/use gap, so as to make it behave like a normal op.
7993 : : And vector reduction PHIs are always generated to the full extent, no
7994 : : matter lane-reducing op exists or not. If some copies or PHIs are
7995 : : actually superfluous, they would be cleaned up by passes after
7996 : : vectorization. An example for single-lane slp, lane-reducing ops
7997 : : with mixed input vectypes in a reduction chain, is given as below.
7998 : : Similarly, this handling is applicable for multiple-lane slp as well.
7999 : :
8000 : : int sum = 1;
8001 : : for (i)
8002 : : {
8003 : : sum += d0[i] * d1[i]; // dot-prod <vector(16) char>
8004 : : sum += w[i]; // widen-sum <vector(16) char>
8005 : : sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
8006 : : sum += n[i]; // normal <vector(4) int>
8007 : : }
8008 : :
8009 : : The vector size is 128-bit,vectorization factor is 16. Reduction
8010 : : statements would be transformed as:
8011 : :
8012 : : vector<4> int sum_v0 = { 0, 0, 0, 1 };
8013 : : vector<4> int sum_v1 = { 0, 0, 0, 0 };
8014 : : vector<4> int sum_v2 = { 0, 0, 0, 0 };
8015 : : vector<4> int sum_v3 = { 0, 0, 0, 0 };
8016 : :
8017 : : for (i / 16)
8018 : : {
8019 : : sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
8020 : : sum_v1 = sum_v1; // copy
8021 : : sum_v2 = sum_v2; // copy
8022 : : sum_v3 = sum_v3; // copy
8023 : :
8024 : : sum_v0 = sum_v0; // copy
8025 : : sum_v1 = WIDEN_SUM (w_v1[i: 0 ~ 15], sum_v1);
8026 : : sum_v2 = sum_v2; // copy
8027 : : sum_v3 = sum_v3; // copy
8028 : :
8029 : : sum_v0 = sum_v0; // copy
8030 : : sum_v1 = SAD (s0_v1[i: 0 ~ 7 ], s1_v1[i: 0 ~ 7 ], sum_v1);
8031 : : sum_v2 = SAD (s0_v2[i: 8 ~ 15], s1_v2[i: 8 ~ 15], sum_v2);
8032 : : sum_v3 = sum_v3; // copy
8033 : :
8034 : : sum_v0 += n_v0[i: 0 ~ 3 ];
8035 : : sum_v1 += n_v1[i: 4 ~ 7 ];
8036 : : sum_v2 += n_v2[i: 8 ~ 11];
8037 : : sum_v3 += n_v3[i: 12 ~ 15];
8038 : : }
8039 : :
8040 : : Moreover, for a higher instruction parallelism in final vectorized
8041 : : loop, it is considered to make those effective vector lane-reducing
8042 : : ops be distributed evenly among all def-use cycles. In the above
8043 : : example, DOT_PROD, WIDEN_SUM and SADs are generated into disparate
8044 : : cycles, instruction dependency among them could be eliminated. */
8045 : 85 : unsigned effec_ncopies = vec_oprnds[0].length ();
8046 : 85 : unsigned total_ncopies = vec_oprnds[reduc_index].length ();
8047 : :
8048 : 85 : gcc_assert (effec_ncopies <= total_ncopies);
8049 : :
8050 : 85 : if (effec_ncopies < total_ncopies)
8051 : : {
8052 : 255 : for (unsigned i = 0; i < op.num_ops - 1; i++)
8053 : : {
8054 : 340 : gcc_assert (vec_oprnds[i].length () == effec_ncopies);
8055 : 170 : vec_oprnds[i].safe_grow_cleared (total_ncopies);
8056 : : }
8057 : : }
8058 : :
8059 : 85 : tree reduc_vectype_in = vectype_in;
8060 : 85 : gcc_assert (reduc_vectype_in);
8061 : :
8062 : 85 : unsigned effec_reduc_ncopies
8063 : 85 : = vect_get_num_copies (loop_vinfo, SLP_TREE_CHILDREN (slp_node)[0]);
8064 : :
8065 : 85 : gcc_assert (effec_ncopies <= effec_reduc_ncopies);
8066 : :
8067 : 85 : if (effec_ncopies < effec_reduc_ncopies)
8068 : : {
8069 : : /* Find suitable def-use cycles to generate vectorized statements
8070 : : into, and reorder operands based on the selection. */
8071 : 0 : unsigned curr_pos = VECT_REDUC_INFO_RESULT_POS (reduc_info);
8072 : 0 : unsigned next_pos = (curr_pos + effec_ncopies) % effec_reduc_ncopies;
8073 : :
8074 : 0 : gcc_assert (curr_pos < effec_reduc_ncopies);
8075 : 0 : VECT_REDUC_INFO_RESULT_POS (reduc_info) = next_pos;
8076 : :
8077 : 0 : if (curr_pos)
8078 : : {
8079 : 0 : unsigned count = effec_reduc_ncopies - effec_ncopies;
8080 : 0 : unsigned start = curr_pos - count;
8081 : :
8082 : 0 : if ((int) start < 0)
8083 : : {
8084 : 0 : count = curr_pos;
8085 : 0 : start = 0;
8086 : : }
8087 : :
8088 : 0 : for (unsigned i = 0; i < op.num_ops - 1; i++)
8089 : : {
8090 : 0 : for (unsigned j = effec_ncopies; j > start; j--)
8091 : : {
8092 : 0 : unsigned k = j - 1;
8093 : 0 : std::swap (vec_oprnds[i][k], vec_oprnds[i][k + count]);
8094 : 0 : gcc_assert (!vec_oprnds[i][k]);
8095 : : }
8096 : : }
8097 : : }
8098 : : }
8099 : : }
8100 : :
8101 : 1722 : bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (slp_node);
8102 : 2940 : unsigned num = vec_oprnds[reduc_index == 0 ? 1 : 0].length ();
8103 : 1722 : unsigned mask_index = 0;
8104 : :
8105 : 7555 : for (unsigned i = 0; i < num; ++i)
8106 : : {
8107 : 5833 : gimple *new_stmt;
8108 : 5833 : tree vop[3] = { vec_oprnds[0][i], vec_oprnds[1][i], NULL_TREE };
8109 : 5833 : if (!vop[0] || !vop[1])
8110 : : {
8111 : 456 : tree reduc_vop = vec_oprnds[reduc_index][i];
8112 : :
8113 : : /* If could not generate an effective vector statement for current
8114 : : portion of reduction operand, insert a trivial copy to simply
8115 : : handle over the operand to other dependent statements. */
8116 : 456 : gcc_assert (reduc_vop);
8117 : :
8118 : 456 : if (TREE_CODE (reduc_vop) == SSA_NAME
8119 : 456 : && !SSA_NAME_IS_DEFAULT_DEF (reduc_vop))
8120 : 456 : new_stmt = SSA_NAME_DEF_STMT (reduc_vop);
8121 : : else
8122 : : {
8123 : 0 : new_temp = make_ssa_name (vec_dest);
8124 : 0 : new_stmt = gimple_build_assign (new_temp, reduc_vop);
8125 : 0 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt,
8126 : : gsi);
8127 : : }
8128 : : }
8129 : 5377 : else if (masked_loop_p && !mask_by_cond_expr)
8130 : : {
8131 : : /* No conditional ifns have been defined for lane-reducing op
8132 : : yet. */
8133 : 16 : gcc_assert (!lane_reducing);
8134 : :
8135 : 16 : tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8136 : : vec_num, vectype_in,
8137 : : mask_index++);
8138 : 16 : gcall *call;
8139 : 24 : if (code.is_internal_fn () && cond_fn_p)
8140 : : {
8141 : 16 : gcc_assert (op.num_ops >= 3
8142 : : && internal_fn_mask_index (internal_fn (code)) == 0);
8143 : 8 : vop[2] = vec_oprnds[2][i];
8144 : 8 : mask = prepare_vec_mask (loop_vinfo, TREE_TYPE (mask),
8145 : : mask, vop[0], gsi);
8146 : 8 : call = gimple_build_call_internal (cond_fn, 4, mask, vop[1],
8147 : : vop[2], vop[reduc_index]);
8148 : : }
8149 : : else
8150 : : {
8151 : 8 : gcc_assert (code.is_tree_code ());
8152 : 8 : call = gimple_build_call_internal (cond_fn, 4, mask, vop[0],
8153 : : vop[1], vop[reduc_index]);
8154 : : }
8155 : 16 : new_temp = make_ssa_name (vec_dest, call);
8156 : 16 : gimple_call_set_lhs (call, new_temp);
8157 : 16 : gimple_call_set_nothrow (call, true);
8158 : 16 : vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
8159 : 16 : new_stmt = call;
8160 : : }
8161 : : else
8162 : : {
8163 : 5361 : if (op.num_ops >= 3)
8164 : 1747 : vop[2] = vec_oprnds[2][i];
8165 : :
8166 : 5361 : if (masked_loop_p && mask_by_cond_expr)
8167 : : {
8168 : 4 : tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8169 : : vec_num, vectype_in,
8170 : : mask_index++);
8171 : 4 : build_vect_cond_expr (code, vop, mask, gsi);
8172 : : }
8173 : :
8174 : 5361 : if (emulated_mixed_dot_prod)
8175 : 4 : new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
8176 : : vec_dest, vop);
8177 : :
8178 : 6699 : else if (code.is_internal_fn () && !cond_fn_p)
8179 : 0 : new_stmt = gimple_build_call_internal (internal_fn (code),
8180 : : op.num_ops,
8181 : : vop[0], vop[1], vop[2]);
8182 : 6699 : else if (code.is_internal_fn () && cond_fn_p)
8183 : 1342 : new_stmt = gimple_build_call_internal (internal_fn (code),
8184 : : op.num_ops,
8185 : : vop[0], vop[1], vop[2],
8186 : : vop[reduc_index]);
8187 : : else
8188 : 4015 : new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
8189 : : vop[0], vop[1], vop[2]);
8190 : 5361 : new_temp = make_ssa_name (vec_dest, new_stmt);
8191 : 5361 : gimple_set_lhs (new_stmt, new_temp);
8192 : 5361 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8193 : : }
8194 : :
8195 : 5833 : if (single_defuse_cycle && i < num - 1)
8196 : 3512 : vec_oprnds[reduc_index].safe_push (gimple_get_lhs (new_stmt));
8197 : : else
8198 : 2321 : slp_node->push_vec_def (new_stmt);
8199 : : }
8200 : :
8201 : : return true;
8202 : 10208 : }
8203 : :
8204 : : /* Transform phase of a cycle PHI. */
8205 : :
8206 : : bool
8207 : 23438 : vect_transform_cycle_phi (loop_vec_info loop_vinfo,
8208 : : stmt_vec_info stmt_info,
8209 : : slp_tree slp_node, slp_instance slp_node_instance)
8210 : : {
8211 : 23438 : tree vectype_out = SLP_TREE_VECTYPE (slp_node);
8212 : 23438 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8213 : 23438 : int i;
8214 : 23438 : bool nested_cycle = false;
8215 : 23438 : int vec_num;
8216 : :
8217 : 23554 : if (nested_in_vect_loop_p (loop, stmt_info))
8218 : : {
8219 : : loop = loop->inner;
8220 : : nested_cycle = true;
8221 : : }
8222 : :
8223 : 23438 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
8224 : 23438 : if (reduc_info
8225 : 22849 : && (VECT_REDUC_INFO_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
8226 : 22849 : || VECT_REDUC_INFO_TYPE (reduc_info) == FOLD_LEFT_REDUCTION))
8227 : : /* Leave the scalar phi in place. */
8228 : : return true;
8229 : :
8230 : 22019 : if (reduc_info && reduc_info->is_reduc_chain && dump_enabled_p ())
8231 : 109 : dump_printf_loc (MSG_NOTE, vect_location,
8232 : : "vectorizing a reduction chain\n");
8233 : :
8234 : 22608 : vec_num = vect_get_num_copies (loop_vinfo, slp_node);
8235 : :
8236 : : /* Check whether we should use a single PHI node and accumulate
8237 : : vectors to one before the backedge. */
8238 : 22608 : if (reduc_info && VECT_REDUC_INFO_FORCE_SINGLE_CYCLE (reduc_info))
8239 : 22608 : vec_num = 1;
8240 : :
8241 : : /* Create the destination vector */
8242 : 22608 : gphi *phi = as_a <gphi *> (stmt_info->stmt);
8243 : 22608 : tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
8244 : : vectype_out);
8245 : :
8246 : : /* Get the loop-entry arguments. */
8247 : 22608 : tree vec_initial_def = NULL_TREE;
8248 : 22608 : auto_vec<tree> vec_initial_defs;
8249 : 22608 : vec_initial_defs.reserve (vec_num);
8250 : : /* Optimize: if initial_def is for REDUC_MAX smaller than the base
8251 : : and we can't use zero for induc_val, use initial_def. Similarly
8252 : : for REDUC_MIN and initial_def larger than the base. */
8253 : 22608 : if (reduc_info
8254 : 22019 : && VECT_REDUC_INFO_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
8255 : : {
8256 : 62 : gcc_assert (SLP_TREE_LANES (slp_node) == 1);
8257 : 62 : tree initial_def = vect_phi_initial_value (phi);
8258 : 62 : VECT_REDUC_INFO_INITIAL_VALUES (reduc_info).safe_push (initial_def);
8259 : 62 : tree induc_val = VECT_REDUC_INFO_INDUC_COND_INITIAL_VAL (reduc_info);
8260 : 62 : if (TREE_CODE (initial_def) == INTEGER_CST
8261 : 60 : && !integer_zerop (induc_val)
8262 : 122 : && ((VECT_REDUC_INFO_CODE (reduc_info) == MAX_EXPR
8263 : 42 : && tree_int_cst_lt (initial_def, induc_val))
8264 : 58 : || (VECT_REDUC_INFO_CODE (reduc_info) == MIN_EXPR
8265 : 18 : && tree_int_cst_lt (induc_val, initial_def))))
8266 : : {
8267 : 2 : induc_val = initial_def;
8268 : : /* Communicate we used the initial_def to epilouge
8269 : : generation. */
8270 : 2 : VECT_REDUC_INFO_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
8271 : : }
8272 : 62 : vec_initial_defs.quick_push
8273 : 62 : (build_vector_from_val (vectype_out, induc_val));
8274 : 62 : }
8275 : 22546 : else if (nested_cycle)
8276 : : {
8277 : 670 : unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
8278 : 670 : vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
8279 : : &vec_initial_defs);
8280 : : }
8281 : : else
8282 : : {
8283 : 21876 : gcc_assert (slp_node == slp_node_instance->reduc_phis);
8284 : 21876 : vec<tree> &initial_values = VECT_REDUC_INFO_INITIAL_VALUES (reduc_info);
8285 : 21876 : vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
8286 : :
8287 : 21876 : unsigned int num_phis = stmts.length ();
8288 : 21876 : if (reduc_info->is_reduc_chain)
8289 : 179 : num_phis = 1;
8290 : 21876 : initial_values.reserve (num_phis);
8291 : 44212 : for (unsigned int i = 0; i < num_phis; ++i)
8292 : : {
8293 : 22336 : gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
8294 : 22336 : initial_values.quick_push (vect_phi_initial_value (this_phi));
8295 : : }
8296 : 21876 : if (vec_num == 1)
8297 : 21297 : vect_find_reusable_accumulator (loop_vinfo, reduc_info, vectype_out);
8298 : 21876 : if (!initial_values.is_empty ())
8299 : : {
8300 : 21661 : tree initial_value
8301 : 43097 : = (num_phis == 1 ? initial_values[0] : NULL_TREE);
8302 : 21661 : code_helper code = VECT_REDUC_INFO_CODE (reduc_info);
8303 : 21661 : tree neutral_op
8304 : 21661 : = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8305 : : code, initial_value);
8306 : : /* Try to simplify the vector initialization by applying an
8307 : : adjustment after the reduction has been performed. This
8308 : : can also break a critical path but on the other hand
8309 : : requires to keep the initial value live across the loop. */
8310 : 21661 : if (neutral_op
8311 : 21574 : && initial_values.length () == 1
8312 : 21369 : && !VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info)
8313 : 17430 : && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8314 : 39013 : && !operand_equal_p (neutral_op, initial_values[0]))
8315 : : {
8316 : 12227 : VECT_REDUC_INFO_EPILOGUE_ADJUSTMENT (reduc_info)
8317 : 12227 : = initial_values[0];
8318 : 12227 : initial_values[0] = neutral_op;
8319 : : }
8320 : 43322 : get_initial_defs_for_reduction (loop_vinfo, reduc_info, vectype_out,
8321 : : &vec_initial_defs, vec_num,
8322 : : stmts.length (), neutral_op);
8323 : : }
8324 : : }
8325 : :
8326 : 22608 : if (vec_initial_def)
8327 : : {
8328 : 0 : vec_initial_defs.create (1);
8329 : 0 : vec_initial_defs.quick_push (vec_initial_def);
8330 : : }
8331 : :
8332 : 22608 : if (reduc_info)
8333 : 22019 : if (auto *accumulator = VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info))
8334 : : {
8335 : 4171 : tree def = accumulator->reduc_input;
8336 : 4171 : if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8337 : : {
8338 : 4168 : unsigned int nreduc;
8339 : 8336 : bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
8340 : 4168 : (TREE_TYPE (def)),
8341 : 4168 : TYPE_VECTOR_SUBPARTS (vectype_out),
8342 : : &nreduc);
8343 : 0 : gcc_assert (res);
8344 : 4168 : gimple_seq stmts = NULL;
8345 : : /* Reduce the single vector to a smaller one. */
8346 : 4168 : if (nreduc != 1)
8347 : : {
8348 : : /* Perform the reduction in the appropriate type. */
8349 : 4168 : tree rvectype = vectype_out;
8350 : 4168 : if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
8351 : 4168 : TREE_TYPE (TREE_TYPE (def))))
8352 : 235 : rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
8353 : : TYPE_VECTOR_SUBPARTS
8354 : 470 : (vectype_out));
8355 : 4168 : def = vect_create_partial_epilog (def, rvectype,
8356 : : VECT_REDUC_INFO_CODE
8357 : : (reduc_info),
8358 : : &stmts);
8359 : : }
8360 : : /* The epilogue loop might use a different vector mode, like
8361 : : VNx2DI vs. V2DI. */
8362 : 4168 : if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
8363 : : {
8364 : 0 : tree reduc_type = build_vector_type_for_mode
8365 : 0 : (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
8366 : 0 : def = gimple_convert (&stmts, reduc_type, def);
8367 : : }
8368 : : /* Adjust the input so we pick up the partially reduced value
8369 : : for the skip edge in vect_create_epilog_for_reduction. */
8370 : 4168 : accumulator->reduc_input = def;
8371 : : /* And the reduction could be carried out using a different sign. */
8372 : 4168 : if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8373 : 235 : def = gimple_convert (&stmts, vectype_out, def);
8374 : 4168 : edge e;
8375 : 4168 : if ((e = loop_vinfo->main_loop_edge)
8376 : 4168 : || (e = loop_vinfo->skip_this_loop_edge))
8377 : : {
8378 : : /* While we'd like to insert on the edge this will split
8379 : : blocks and disturb bookkeeping, we also will eventually
8380 : : need this on the skip edge. Rely on sinking to
8381 : : fixup optimal placement and insert in the pred. */
8382 : 3953 : gimple_stmt_iterator gsi = gsi_last_bb (e->src);
8383 : : /* Insert before a cond that eventually skips the
8384 : : epilogue. */
8385 : 3953 : if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
8386 : 3936 : gsi_prev (&gsi);
8387 : 3953 : gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
8388 : : }
8389 : : else
8390 : 215 : gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
8391 : : stmts);
8392 : : }
8393 : 4171 : if (loop_vinfo->main_loop_edge)
8394 : 3956 : vec_initial_defs[0]
8395 : 3956 : = vect_get_main_loop_result (loop_vinfo, def,
8396 : 3956 : vec_initial_defs[0]);
8397 : : else
8398 : 215 : vec_initial_defs.safe_push (def);
8399 : : }
8400 : :
8401 : : /* Generate the reduction PHIs upfront. */
8402 : 46988 : for (i = 0; i < vec_num; i++)
8403 : : {
8404 : 24380 : tree vec_init_def = vec_initial_defs[i];
8405 : : /* Create the reduction-phi that defines the reduction
8406 : : operand. */
8407 : 24380 : gphi *new_phi = create_phi_node (vec_dest, loop->header);
8408 : 24380 : add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
8409 : : UNKNOWN_LOCATION);
8410 : :
8411 : : /* The loop-latch arg is set in epilogue processing. */
8412 : :
8413 : 24380 : slp_node->push_vec_def (new_phi);
8414 : : }
8415 : :
8416 : 22608 : return true;
8417 : 22608 : }
8418 : :
8419 : : /* Vectorizes LC PHIs. */
8420 : :
8421 : : bool
8422 : 161325 : vectorizable_lc_phi (loop_vec_info loop_vinfo,
8423 : : stmt_vec_info stmt_info,
8424 : : slp_tree slp_node)
8425 : : {
8426 : 161325 : if (!loop_vinfo
8427 : 161325 : || !is_a <gphi *> (stmt_info->stmt)
8428 : 192105 : || gimple_phi_num_args (stmt_info->stmt) != 1)
8429 : : return false;
8430 : :
8431 : 702 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8432 : 0 : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8433 : : return false;
8434 : :
8435 : : /* Deal with copies from externs or constants that disguise as
8436 : : loop-closed PHI nodes (PR97886). */
8437 : 702 : if (!vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
8438 : : SLP_TREE_VECTYPE (slp_node)))
8439 : : {
8440 : 0 : if (dump_enabled_p ())
8441 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8442 : : "incompatible vector types for invariants\n");
8443 : 0 : return false;
8444 : : }
8445 : :
8446 : : /* ??? This can happen with data vs. mask uses of boolean. */
8447 : 702 : if (!useless_type_conversion_p (SLP_TREE_VECTYPE (slp_node),
8448 : 702 : SLP_TREE_VECTYPE
8449 : : (SLP_TREE_CHILDREN (slp_node)[0])))
8450 : : {
8451 : 0 : if (dump_enabled_p ())
8452 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8453 : : "missed mask promotion\n");
8454 : 0 : return false;
8455 : : }
8456 : :
8457 : 702 : SLP_TREE_TYPE (slp_node) = lc_phi_info_type;
8458 : 702 : return true;
8459 : : }
8460 : :
8461 : : bool
8462 : 447 : vect_transform_lc_phi (loop_vec_info loop_vinfo,
8463 : : stmt_vec_info stmt_info,
8464 : : slp_tree slp_node)
8465 : : {
8466 : :
8467 : 447 : tree vectype = SLP_TREE_VECTYPE (slp_node);
8468 : 447 : tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8469 : 447 : basic_block bb = gimple_bb (stmt_info->stmt);
8470 : 447 : edge e = single_pred_edge (bb);
8471 : 447 : tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8472 : 447 : auto_vec<tree> vec_oprnds;
8473 : 894 : vect_get_vec_defs (loop_vinfo, slp_node,
8474 : 447 : gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
8475 : 1001 : for (unsigned i = 0; i < vec_oprnds.length (); i++)
8476 : : {
8477 : : /* Create the vectorized LC PHI node. */
8478 : 554 : gphi *new_phi = create_phi_node (vec_dest, bb);
8479 : 554 : add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
8480 : 554 : slp_node->push_vec_def (new_phi);
8481 : : }
8482 : :
8483 : 447 : return true;
8484 : 447 : }
8485 : :
8486 : : /* Vectorizes PHIs. */
8487 : :
8488 : : bool
8489 : 139397 : vectorizable_phi (bb_vec_info vinfo,
8490 : : stmt_vec_info stmt_info,
8491 : : slp_tree slp_node, stmt_vector_for_cost *cost_vec)
8492 : : {
8493 : 139397 : if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
8494 : : return false;
8495 : :
8496 : 68055 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
8497 : : return false;
8498 : :
8499 : 68055 : tree vectype = SLP_TREE_VECTYPE (slp_node);
8500 : :
8501 : 68055 : if (cost_vec) /* transformation not required. */
8502 : : {
8503 : : slp_tree child;
8504 : : unsigned i;
8505 : 184914 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
8506 : 130364 : if (!child)
8507 : : {
8508 : 0 : if (dump_enabled_p ())
8509 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8510 : : "PHI node with unvectorized backedge def\n");
8511 : 0 : return false;
8512 : : }
8513 : 130364 : else if (!vect_maybe_update_slp_op_vectype (child, vectype))
8514 : : {
8515 : 18 : if (dump_enabled_p ())
8516 : 2 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8517 : : "incompatible vector types for invariants\n");
8518 : 18 : return false;
8519 : : }
8520 : 130346 : else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8521 : 130346 : && !useless_type_conversion_p (vectype,
8522 : : SLP_TREE_VECTYPE (child)))
8523 : : {
8524 : : /* With bools we can have mask and non-mask precision vectors
8525 : : or different non-mask precisions. while pattern recog is
8526 : : supposed to guarantee consistency here bugs in it can cause
8527 : : mismatches (PR103489 and PR103800 for example).
8528 : : Deal with them here instead of ICEing later. */
8529 : 18 : if (dump_enabled_p ())
8530 : 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8531 : : "incompatible vector type setup from "
8532 : : "bool pattern detection\n");
8533 : 18 : return false;
8534 : : }
8535 : :
8536 : : /* For single-argument PHIs assume coalescing which means zero cost
8537 : : for the scalar and the vector PHIs. This avoids artificially
8538 : : favoring the vector path (but may pessimize it in some cases). */
8539 : 54550 : if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
8540 : 49607 : record_stmt_cost (cost_vec, vect_get_num_copies (vinfo, slp_node),
8541 : : vector_stmt, slp_node, vectype, 0, vect_body);
8542 : 54550 : SLP_TREE_TYPE (slp_node) = phi_info_type;
8543 : 54550 : return true;
8544 : : }
8545 : :
8546 : 13469 : tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8547 : 13469 : basic_block bb = gimple_bb (stmt_info->stmt);
8548 : 13469 : tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8549 : 13469 : auto_vec<gphi *> new_phis;
8550 : 48570 : for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
8551 : : {
8552 : 35101 : slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8553 : :
8554 : : /* Skip not yet vectorized defs. */
8555 : 35471 : if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8556 : 35101 : && SLP_TREE_VEC_DEFS (child).is_empty ())
8557 : 370 : continue;
8558 : :
8559 : 34731 : auto_vec<tree> vec_oprnds;
8560 : 34731 : vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
8561 : 34731 : if (!new_phis.exists ())
8562 : : {
8563 : 13469 : new_phis.create (vec_oprnds.length ());
8564 : 28483 : for (unsigned j = 0; j < vec_oprnds.length (); j++)
8565 : : {
8566 : : /* Create the vectorized LC PHI node. */
8567 : 15014 : new_phis.quick_push (create_phi_node (vec_dest, bb));
8568 : 15014 : slp_node->push_vec_def (new_phis[j]);
8569 : : }
8570 : : }
8571 : 34731 : edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
8572 : 75134 : for (unsigned j = 0; j < vec_oprnds.length (); j++)
8573 : 40403 : add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
8574 : 34731 : }
8575 : : /* We should have at least one already vectorized child. */
8576 : 13469 : gcc_assert (new_phis.exists ());
8577 : :
8578 : 13469 : return true;
8579 : 13469 : }
8580 : :
8581 : : /* Vectorizes first order recurrences. An overview of the transformation
8582 : : is described below. Suppose we have the following loop.
8583 : :
8584 : : int t = 0;
8585 : : for (int i = 0; i < n; ++i)
8586 : : {
8587 : : b[i] = a[i] - t;
8588 : : t = a[i];
8589 : : }
8590 : :
8591 : : There is a first-order recurrence on 'a'. For this loop, the scalar IR
8592 : : looks (simplified) like:
8593 : :
8594 : : scalar.preheader:
8595 : : init = 0;
8596 : :
8597 : : scalar.body:
8598 : : i = PHI <0(scalar.preheader), i+1(scalar.body)>
8599 : : _2 = PHI <(init(scalar.preheader), <_1(scalar.body)>
8600 : : _1 = a[i]
8601 : : b[i] = _1 - _2
8602 : : if (i < n) goto scalar.body
8603 : :
8604 : : In this example, _2 is a recurrence because it's value depends on the
8605 : : previous iteration. We vectorize this as (VF = 4)
8606 : :
8607 : : vector.preheader:
8608 : : vect_init = vect_cst(..., ..., ..., 0)
8609 : :
8610 : : vector.body
8611 : : i = PHI <0(vector.preheader), i+4(vector.body)>
8612 : : vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
8613 : : vect_2 = a[i, i+1, i+2, i+3];
8614 : : vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
8615 : : b[i, i+1, i+2, i+3] = vect_2 - vect_3
8616 : : if (..) goto vector.body
8617 : :
8618 : : In this function, vectorizable_recurr, we code generate both the
8619 : : vector PHI node and the permute since those together compute the
8620 : : vectorized value of the scalar PHI. We do not yet have the
8621 : : backedge value to fill in there nor into the vec_perm. Those
8622 : : are filled in vect_schedule_scc.
8623 : :
8624 : : TODO: Since the scalar loop does not have a use of the recurrence
8625 : : outside of the loop the natural way to implement peeling via
8626 : : vectorizing the live value doesn't work. For now peeling of loops
8627 : : with a recurrence is not implemented. For SLP the supported cases
8628 : : are restricted to those requiring a single vector recurrence PHI. */
8629 : :
8630 : : bool
8631 : 160663 : vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8632 : : slp_tree slp_node, stmt_vector_for_cost *cost_vec)
8633 : : {
8634 : 160663 : if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
8635 : : return false;
8636 : :
8637 : 30118 : gphi *phi = as_a<gphi *> (stmt_info->stmt);
8638 : :
8639 : : /* So far we only support first-order recurrence auto-vectorization. */
8640 : 30118 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
8641 : : return false;
8642 : :
8643 : 408 : tree vectype = SLP_TREE_VECTYPE (slp_node);
8644 : 408 : unsigned ncopies = vect_get_num_copies (loop_vinfo, slp_node);
8645 : 408 : poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8646 : 408 : unsigned dist = SLP_TREE_LANES (slp_node);
8647 : : /* We need to be able to make progress with a single vector. */
8648 : 408 : if (maybe_gt (dist * 2, nunits))
8649 : : {
8650 : 0 : if (dump_enabled_p ())
8651 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8652 : : "first order recurrence exceeds half of "
8653 : : "a vector\n");
8654 : 0 : return false;
8655 : : }
8656 : :
8657 : : /* We need to be able to build a { ..., a, b } init vector with
8658 : : dist number of distinct trailing values. Always possible
8659 : : when dist == 1 or when nunits is constant or when the initializations
8660 : : are uniform. */
8661 : 408 : tree uniform_initval = NULL_TREE;
8662 : 408 : edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
8663 : 1656 : for (stmt_vec_info s : SLP_TREE_SCALAR_STMTS (slp_node))
8664 : : {
8665 : 444 : gphi *phi = as_a <gphi *> (s->stmt);
8666 : 444 : if (! uniform_initval)
8667 : 408 : uniform_initval = PHI_ARG_DEF_FROM_EDGE (phi, pe);
8668 : 36 : else if (! operand_equal_p (uniform_initval,
8669 : 36 : PHI_ARG_DEF_FROM_EDGE (phi, pe)))
8670 : : {
8671 : : uniform_initval = NULL_TREE;
8672 : : break;
8673 : : }
8674 : : }
8675 : 408 : if (!uniform_initval && !nunits.is_constant ())
8676 : : {
8677 : : if (dump_enabled_p ())
8678 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8679 : : "cannot build initialization vector for "
8680 : : "first order recurrence\n");
8681 : : return false;
8682 : : }
8683 : :
8684 : : /* First-order recurrence autovectorization needs to handle permutation
8685 : : with indices = [nunits-1, nunits, nunits+1, ...]. */
8686 : 408 : vec_perm_builder sel (nunits, 1, 3);
8687 : 1632 : for (int i = 0; i < 3; ++i)
8688 : 1224 : sel.quick_push (nunits - dist + i);
8689 : 408 : vec_perm_indices indices (sel, 2, nunits);
8690 : :
8691 : 408 : if (cost_vec) /* transformation not required. */
8692 : : {
8693 : 368 : if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
8694 : : indices))
8695 : : return false;
8696 : :
8697 : : /* We eventually need to set a vector type on invariant
8698 : : arguments. */
8699 : : unsigned j;
8700 : : slp_tree child;
8701 : 768 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
8702 : 512 : if (!vect_maybe_update_slp_op_vectype (child, vectype))
8703 : : {
8704 : 0 : if (dump_enabled_p ())
8705 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8706 : : "incompatible vector types for "
8707 : : "invariants\n");
8708 : 0 : return false;
8709 : : }
8710 : :
8711 : : /* Verify we have set up compatible types. */
8712 : 256 : edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
8713 : 256 : slp_tree latch_def = SLP_TREE_CHILDREN (slp_node)[le->dest_idx];
8714 : 256 : tree latch_vectype = SLP_TREE_VECTYPE (latch_def);
8715 : 256 : if (!types_compatible_p (latch_vectype, vectype))
8716 : : return false;
8717 : :
8718 : : /* The recurrence costs the initialization vector and one permute
8719 : : for each copy. With SLP the prologue value is explicitly
8720 : : represented and costed separately. */
8721 : 256 : unsigned prologue_cost = 0;
8722 : 256 : unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
8723 : : slp_node, 0, vect_body);
8724 : 256 : if (dump_enabled_p ())
8725 : 48 : dump_printf_loc (MSG_NOTE, vect_location,
8726 : : "vectorizable_recurr: inside_cost = %d, "
8727 : : "prologue_cost = %d .\n", inside_cost,
8728 : : prologue_cost);
8729 : :
8730 : 256 : SLP_TREE_TYPE (slp_node) = recurr_info_type;
8731 : 256 : return true;
8732 : : }
8733 : :
8734 : 40 : tree vec_init;
8735 : 40 : if (! uniform_initval)
8736 : : {
8737 : 6 : vec<constructor_elt, va_gc> *v = NULL;
8738 : 6 : vec_alloc (v, nunits.to_constant ());
8739 : 33 : for (unsigned i = 0; i < nunits.to_constant () - dist; ++i)
8740 : 27 : CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
8741 : : build_zero_cst (TREE_TYPE (vectype)));
8742 : 39 : for (stmt_vec_info s : SLP_TREE_SCALAR_STMTS (slp_node))
8743 : : {
8744 : 21 : gphi *phi = as_a <gphi *> (s->stmt);
8745 : 21 : tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
8746 : 21 : if (!useless_type_conversion_p (TREE_TYPE (vectype),
8747 : 21 : TREE_TYPE (preheader)))
8748 : : {
8749 : 0 : gimple_seq stmts = NULL;
8750 : 0 : preheader = gimple_convert (&stmts,
8751 : 0 : TREE_TYPE (vectype), preheader);
8752 : 0 : gsi_insert_seq_on_edge_immediate (pe, stmts);
8753 : : }
8754 : 21 : CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, preheader);
8755 : : }
8756 : 6 : vec_init = build_constructor (vectype, v);
8757 : : }
8758 : : else
8759 : : vec_init = uniform_initval;
8760 : 40 : vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
8761 : :
8762 : : /* Create the vectorized first-order PHI node. */
8763 : 40 : tree vec_dest = vect_get_new_vect_var (vectype,
8764 : : vect_simple_var, "vec_recur_");
8765 : 40 : basic_block bb = gimple_bb (phi);
8766 : 40 : gphi *new_phi = create_phi_node (vec_dest, bb);
8767 : 40 : add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
8768 : :
8769 : : /* Insert shuffles the first-order recurrence autovectorization.
8770 : : result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>. */
8771 : 40 : tree perm = vect_gen_perm_mask_checked (vectype, indices);
8772 : :
8773 : : /* Insert the required permute after the latch definition. The
8774 : : second and later operands are tentative and will be updated when we have
8775 : : vectorized the latch definition. */
8776 : 40 : edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
8777 : 40 : gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
8778 : 40 : gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
8779 : 40 : gsi_next (&gsi2);
8780 : :
8781 : 117 : for (unsigned i = 0; i < ncopies; ++i)
8782 : : {
8783 : 77 : vec_dest = make_ssa_name (vectype);
8784 : 77 : gassign *vperm
8785 : 117 : = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
8786 : 40 : i == 0 ? gimple_phi_result (new_phi) : NULL,
8787 : : NULL, perm);
8788 : 77 : vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
8789 : :
8790 : 77 : slp_node->push_vec_def (vperm);
8791 : : }
8792 : :
8793 : : return true;
8794 : 408 : }
8795 : :
8796 : : /* Return true if VECTYPE represents a vector that requires lowering
8797 : : by the vector lowering pass. */
8798 : :
8799 : : bool
8800 : 630595 : vect_emulated_vector_p (tree vectype)
8801 : : {
8802 : 1261190 : return (!VECTOR_MODE_P (TYPE_MODE (vectype))
8803 : 633294 : && (!VECTOR_BOOLEAN_TYPE_P (vectype)
8804 : 2681 : || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
8805 : : }
8806 : :
8807 : : /* Return true if we can emulate CODE on an integer mode representation
8808 : : of a vector. */
8809 : :
8810 : : bool
8811 : 10778 : vect_can_vectorize_without_simd_p (tree_code code)
8812 : : {
8813 : 10778 : switch (code)
8814 : : {
8815 : : case PLUS_EXPR:
8816 : : case MINUS_EXPR:
8817 : : case NEGATE_EXPR:
8818 : : case BIT_AND_EXPR:
8819 : : case BIT_IOR_EXPR:
8820 : : case BIT_XOR_EXPR:
8821 : : case BIT_NOT_EXPR:
8822 : : return true;
8823 : :
8824 : 9971 : default:
8825 : 9971 : return false;
8826 : : }
8827 : : }
8828 : :
8829 : : /* Likewise, but taking a code_helper. */
8830 : :
8831 : : bool
8832 : 151 : vect_can_vectorize_without_simd_p (code_helper code)
8833 : : {
8834 : 151 : return (code.is_tree_code ()
8835 : 151 : && vect_can_vectorize_without_simd_p (tree_code (code)));
8836 : : }
8837 : :
8838 : : /* Create vector init for vectorized iv. */
8839 : : static tree
8840 : 916 : vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
8841 : : tree step_expr, poly_uint64 nunits,
8842 : : tree vectype,
8843 : : enum vect_induction_op_type induction_type)
8844 : : {
8845 : 916 : unsigned HOST_WIDE_INT const_nunits;
8846 : 916 : tree vec_shift, vec_init, new_name;
8847 : 916 : unsigned i;
8848 : 916 : tree itype = TREE_TYPE (vectype);
8849 : :
8850 : : /* iv_loop is the loop to be vectorized. Create:
8851 : : vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr). */
8852 : 916 : new_name = gimple_convert (stmts, itype, init_expr);
8853 : 916 : switch (induction_type)
8854 : : {
8855 : 18 : case vect_step_op_shr:
8856 : 18 : case vect_step_op_shl:
8857 : : /* Build the Initial value from shift_expr. */
8858 : 18 : vec_init = gimple_build_vector_from_val (stmts,
8859 : : vectype,
8860 : : new_name);
8861 : 18 : vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
8862 : : build_zero_cst (itype), step_expr);
8863 : 18 : vec_init = gimple_build (stmts,
8864 : : (induction_type == vect_step_op_shr
8865 : : ? RSHIFT_EXPR : LSHIFT_EXPR),
8866 : : vectype, vec_init, vec_shift);
8867 : 18 : break;
8868 : :
8869 : 822 : case vect_step_op_neg:
8870 : 822 : {
8871 : 822 : vec_init = gimple_build_vector_from_val (stmts,
8872 : : vectype,
8873 : : new_name);
8874 : 822 : tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
8875 : : vectype, vec_init);
8876 : : /* The encoding has 2 interleaved stepped patterns. */
8877 : 822 : vec_perm_builder sel (nunits, 2, 3);
8878 : 822 : sel.quick_grow (6);
8879 : 4110 : for (i = 0; i < 3; i++)
8880 : : {
8881 : 2466 : sel[2 * i] = i;
8882 : 2466 : sel[2 * i + 1] = i + nunits;
8883 : : }
8884 : 822 : vec_perm_indices indices (sel, 2, nunits);
8885 : : /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
8886 : : fail when vec_init is const vector. In that situation vec_perm is not
8887 : : really needed. */
8888 : 822 : tree perm_mask_even
8889 : 822 : = vect_gen_perm_mask_any (vectype, indices);
8890 : 822 : vec_init = gimple_build (stmts, VEC_PERM_EXPR,
8891 : : vectype,
8892 : : vec_init, vec_neg,
8893 : : perm_mask_even);
8894 : 822 : }
8895 : 822 : break;
8896 : :
8897 : 76 : case vect_step_op_mul:
8898 : 76 : {
8899 : : /* Use unsigned mult to avoid UD integer overflow. */
8900 : 76 : gcc_assert (nunits.is_constant (&const_nunits));
8901 : 76 : tree utype = unsigned_type_for (itype);
8902 : 76 : tree uvectype = build_vector_type (utype,
8903 : 76 : TYPE_VECTOR_SUBPARTS (vectype));
8904 : 76 : new_name = gimple_convert (stmts, utype, new_name);
8905 : 76 : vec_init = gimple_build_vector_from_val (stmts,
8906 : : uvectype,
8907 : : new_name);
8908 : 76 : tree_vector_builder elts (uvectype, const_nunits, 1);
8909 : 76 : tree elt_step = build_one_cst (utype);
8910 : :
8911 : 76 : elts.quick_push (elt_step);
8912 : 660 : for (i = 1; i < const_nunits; i++)
8913 : : {
8914 : : /* Create: new_name_i = new_name + step_expr. */
8915 : 508 : elt_step = gimple_build (stmts, MULT_EXPR,
8916 : : utype, elt_step, step_expr);
8917 : 508 : elts.quick_push (elt_step);
8918 : : }
8919 : : /* Create a vector from [new_name_0, new_name_1, ...,
8920 : : new_name_nunits-1]. */
8921 : 76 : tree vec_mul = gimple_build_vector (stmts, &elts);
8922 : 76 : vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
8923 : : vec_init, vec_mul);
8924 : 76 : vec_init = gimple_convert (stmts, vectype, vec_init);
8925 : 76 : }
8926 : 76 : break;
8927 : :
8928 : 0 : default:
8929 : 0 : gcc_unreachable ();
8930 : : }
8931 : :
8932 : 916 : return vec_init;
8933 : : }
8934 : :
8935 : : /* Peel init_expr by skip_niter for induction_type. */
8936 : : tree
8937 : 84 : vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
8938 : : tree skip_niters, tree step_expr,
8939 : : enum vect_induction_op_type induction_type,
8940 : : bool early_exit_p)
8941 : : {
8942 : 84 : gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST || early_exit_p);
8943 : 84 : tree type = TREE_TYPE (init_expr);
8944 : 84 : unsigned prec = TYPE_PRECISION (type);
8945 : 84 : switch (induction_type)
8946 : : {
8947 : : /* neg inductions are typically not used for loop termination conditions but
8948 : : are typically implemented as b = -b. That is every scalar iteration b is
8949 : : negated. That means that for the initial value of b we will have to
8950 : : determine whether the number of skipped iteration is a multiple of 2
8951 : : because every 2 scalar iterations we are back at "b". */
8952 : 0 : case vect_step_op_neg:
8953 : : /* For early exits the neg induction will always be the same value at the
8954 : : start of the iteration. */
8955 : 0 : if (early_exit_p)
8956 : : break;
8957 : :
8958 : 0 : if (TREE_INT_CST_LOW (skip_niters) % 2)
8959 : 0 : init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
8960 : : /* else no change. */
8961 : : break;
8962 : :
8963 : 12 : case vect_step_op_shr:
8964 : 12 : case vect_step_op_shl:
8965 : 12 : skip_niters = fold_build1 (NOP_EXPR, type, skip_niters);
8966 : 12 : step_expr = fold_build1 (NOP_EXPR, type, step_expr);
8967 : 12 : step_expr = fold_build2 (MULT_EXPR, type, step_expr, skip_niters);
8968 : : /* When shift mount >= precision, need to avoid UD.
8969 : : In the original loop, there's no UD, and according to semantic,
8970 : : init_expr should be 0 for lshr, ashl, and >>= (prec - 1) for ashr. */
8971 : 12 : if ((!tree_fits_uhwi_p (step_expr)
8972 : 12 : || tree_to_uhwi (step_expr) >= prec)
8973 : 6 : && !early_exit_p)
8974 : : {
8975 : 6 : if (induction_type == vect_step_op_shl
8976 : 6 : || TYPE_UNSIGNED (type))
8977 : 4 : init_expr = build_zero_cst (type);
8978 : : else
8979 : 2 : init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
8980 : : init_expr,
8981 : 4 : wide_int_to_tree (type, prec - 1));
8982 : : }
8983 : : else
8984 : : {
8985 : 8 : init_expr = fold_build2 ((induction_type == vect_step_op_shr
8986 : : ? RSHIFT_EXPR : LSHIFT_EXPR),
8987 : : type, init_expr, step_expr);
8988 : 6 : init_expr = force_gimple_operand (init_expr, stmts, false, NULL);
8989 : : }
8990 : : break;
8991 : :
8992 : 72 : case vect_step_op_mul:
8993 : 72 : {
8994 : : /* Due to UB we can't support vect_step_op_mul with early break for now.
8995 : : so assert and block. */
8996 : 72 : gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
8997 : 72 : tree utype = unsigned_type_for (type);
8998 : 72 : init_expr = gimple_convert (stmts, utype, init_expr);
8999 : 72 : wide_int skipn = wi::to_wide (skip_niters);
9000 : 72 : wide_int begin = wi::to_wide (step_expr);
9001 : 72 : auto_mpz base, exp, mod, res;
9002 : 72 : wi::to_mpz (begin, base, TYPE_SIGN (type));
9003 : 72 : wi::to_mpz (skipn, exp, UNSIGNED);
9004 : 72 : mpz_ui_pow_ui (mod, 2, TYPE_PRECISION (type));
9005 : 72 : mpz_powm (res, base, exp, mod);
9006 : 72 : begin = wi::from_mpz (utype, res, true);
9007 : 72 : tree mult_expr = wide_int_to_tree (utype, begin);
9008 : 72 : init_expr = gimple_build (stmts, MULT_EXPR, utype,
9009 : : init_expr, mult_expr);
9010 : 72 : init_expr = gimple_convert (stmts, type, init_expr);
9011 : 72 : }
9012 : 72 : break;
9013 : :
9014 : 0 : default:
9015 : 0 : gcc_unreachable ();
9016 : : }
9017 : :
9018 : 84 : return init_expr;
9019 : : }
9020 : :
9021 : : /* Create vector step for vectorized iv. */
9022 : : static tree
9023 : 1202 : vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
9024 : : poly_uint64 vf,
9025 : : enum vect_induction_op_type induction_type)
9026 : : {
9027 : 1202 : tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
9028 : 1202 : tree new_name = NULL;
9029 : : /* Step should be pow (step, vf) for mult induction. */
9030 : 1202 : if (induction_type == vect_step_op_mul)
9031 : : {
9032 : 76 : gcc_assert (vf.is_constant ());
9033 : 76 : wide_int begin = wi::to_wide (step_expr);
9034 : :
9035 : 584 : for (unsigned i = 0; i != vf.to_constant () - 1; i++)
9036 : 508 : begin = wi::mul (begin, wi::to_wide (step_expr));
9037 : :
9038 : 76 : new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
9039 : 76 : }
9040 : 1126 : else if (induction_type == vect_step_op_neg)
9041 : : /* Do nothing. */
9042 : : ;
9043 : : else
9044 : 18 : new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
9045 : : expr, step_expr);
9046 : 1202 : return new_name;
9047 : : }
9048 : :
9049 : : static tree
9050 : 1202 : vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
9051 : : stmt_vec_info stmt_info,
9052 : : tree new_name, tree vectype,
9053 : : enum vect_induction_op_type induction_type)
9054 : : {
9055 : : /* No step is needed for neg induction. */
9056 : 1202 : if (induction_type == vect_step_op_neg)
9057 : : return NULL;
9058 : :
9059 : 94 : tree t = unshare_expr (new_name);
9060 : 94 : gcc_assert (CONSTANT_CLASS_P (new_name)
9061 : : || TREE_CODE (new_name) == SSA_NAME);
9062 : 94 : tree new_vec = build_vector_from_val (vectype, t);
9063 : 94 : tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
9064 : : new_vec, vectype, NULL);
9065 : 94 : return vec_step;
9066 : : }
9067 : :
9068 : : /* Update vectorized iv with vect_step, induc_def is init. */
9069 : : static tree
9070 : 1390 : vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
9071 : : tree induc_def, tree vec_step,
9072 : : enum vect_induction_op_type induction_type)
9073 : : {
9074 : 1390 : tree vec_def = induc_def;
9075 : 1390 : switch (induction_type)
9076 : : {
9077 : 76 : case vect_step_op_mul:
9078 : 76 : {
9079 : : /* Use unsigned mult to avoid UD integer overflow. */
9080 : 76 : tree uvectype = unsigned_type_for (vectype);
9081 : 76 : vec_def = gimple_convert (stmts, uvectype, vec_def);
9082 : 76 : vec_step = gimple_convert (stmts, uvectype, vec_step);
9083 : 76 : vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
9084 : : vec_def, vec_step);
9085 : 76 : vec_def = gimple_convert (stmts, vectype, vec_def);
9086 : : }
9087 : 76 : break;
9088 : :
9089 : 12 : case vect_step_op_shr:
9090 : 12 : vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
9091 : : vec_def, vec_step);
9092 : 12 : break;
9093 : :
9094 : 6 : case vect_step_op_shl:
9095 : 6 : vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
9096 : : vec_def, vec_step);
9097 : 6 : break;
9098 : : case vect_step_op_neg:
9099 : : vec_def = induc_def;
9100 : : /* Do nothing. */
9101 : : break;
9102 : 0 : default:
9103 : 0 : gcc_unreachable ();
9104 : : }
9105 : :
9106 : 1390 : return vec_def;
9107 : :
9108 : : }
9109 : :
9110 : : /* Function vectorizable_nonlinear_induction
9111 : :
9112 : : Check if STMT_INFO performs an nonlinear induction computation that can be
9113 : : vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
9114 : : a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
9115 : : basic block.
9116 : : Return true if STMT_INFO is vectorizable in this way. */
9117 : :
9118 : : static bool
9119 : 10488 : vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
9120 : : stmt_vec_info stmt_info,
9121 : : slp_tree slp_node,
9122 : : stmt_vector_for_cost *cost_vec)
9123 : : {
9124 : 10488 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9125 : 10488 : unsigned ncopies;
9126 : 10488 : bool nested_in_vect_loop = false;
9127 : 10488 : class loop *iv_loop;
9128 : 10488 : tree vec_def;
9129 : 10488 : edge pe = loop_preheader_edge (loop);
9130 : 10488 : basic_block new_bb;
9131 : 10488 : tree vec_init, vec_step;
9132 : 10488 : tree new_name;
9133 : 10488 : gimple *new_stmt;
9134 : 10488 : gphi *induction_phi;
9135 : 10488 : tree induc_def, vec_dest;
9136 : 10488 : tree init_expr, step_expr;
9137 : 10488 : tree niters_skip;
9138 : 10488 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9139 : 10488 : unsigned i;
9140 : 10488 : gimple_stmt_iterator si;
9141 : :
9142 : 10488 : gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9143 : :
9144 : 10488 : tree vectype = SLP_TREE_VECTYPE (slp_node);
9145 : 10488 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9146 : 10488 : enum vect_induction_op_type induction_type
9147 : : = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9148 : :
9149 : 10488 : gcc_assert (induction_type > vect_step_op_add);
9150 : :
9151 : 10488 : ncopies = vect_get_num_copies (loop_vinfo, slp_node);
9152 : 10488 : gcc_assert (ncopies >= 1);
9153 : :
9154 : : /* FORNOW. Only handle nonlinear induction in the same loop. */
9155 : 10488 : if (nested_in_vect_loop_p (loop, stmt_info))
9156 : : {
9157 : 0 : if (dump_enabled_p ())
9158 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9159 : : "nonlinear induction in nested loop.\n");
9160 : 0 : return false;
9161 : : }
9162 : :
9163 : 10488 : iv_loop = loop;
9164 : 10488 : gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9165 : :
9166 : : /* TODO: Support multi-lane SLP for nonlinear iv. There should be separate
9167 : : vector iv update for each iv and a permutation to generate wanted
9168 : : vector iv. */
9169 : 10488 : if (SLP_TREE_LANES (slp_node) > 1)
9170 : : {
9171 : 0 : if (dump_enabled_p ())
9172 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9173 : : "SLP induction not supported for nonlinear"
9174 : : " induction.\n");
9175 : 0 : return false;
9176 : : }
9177 : :
9178 : 10488 : if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
9179 : : {
9180 : 0 : if (dump_enabled_p ())
9181 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9182 : : "floating point nonlinear induction vectorization"
9183 : : " not supported.\n");
9184 : 0 : return false;
9185 : : }
9186 : :
9187 : 10488 : step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9188 : 10488 : init_expr = vect_phi_initial_value (phi);
9189 : 10488 : gcc_assert (step_expr != NULL_TREE && init_expr != NULL
9190 : : && TREE_CODE (step_expr) == INTEGER_CST);
9191 : : /* step_expr should be aligned with init_expr,
9192 : : .i.e. uint64 a >> 1, step is int, but vector<uint64> shift is used. */
9193 : 10488 : step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
9194 : :
9195 : 10488 : if (TREE_CODE (init_expr) == INTEGER_CST)
9196 : 2895 : init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
9197 : 7593 : else if (!tree_nop_conversion_p (TREE_TYPE (vectype), TREE_TYPE (init_expr)))
9198 : : {
9199 : : /* INIT_EXPR could be a bit_field, bail out for such case. */
9200 : 4 : if (dump_enabled_p ())
9201 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9202 : : "nonlinear induction vectorization failed:"
9203 : : " component type of vectype is not a nop conversion"
9204 : : " from type of init_expr.\n");
9205 : 4 : return false;
9206 : : }
9207 : :
9208 : 10484 : switch (induction_type)
9209 : : {
9210 : 2538 : case vect_step_op_neg:
9211 : 2538 : if (maybe_eq (TYPE_VECTOR_SUBPARTS (vectype), 1u))
9212 : : return false;
9213 : 2534 : if (TREE_CODE (init_expr) != INTEGER_CST
9214 : 190 : && TREE_CODE (init_expr) != REAL_CST)
9215 : : {
9216 : : /* Check for backend support of NEGATE_EXPR and vec_perm. */
9217 : 190 : if (!directly_supported_p (NEGATE_EXPR, vectype))
9218 : 0 : return false;
9219 : :
9220 : : /* The encoding has 2 interleaved stepped patterns. */
9221 : 190 : vec_perm_builder sel (nunits, 2, 3);
9222 : 190 : machine_mode mode = TYPE_MODE (vectype);
9223 : 190 : sel.quick_grow (6);
9224 : 950 : for (i = 0; i < 3; i++)
9225 : : {
9226 : 570 : sel[i * 2] = i;
9227 : 570 : sel[i * 2 + 1] = i + nunits;
9228 : : }
9229 : 190 : vec_perm_indices indices (sel, 2, nunits);
9230 : 190 : if (!can_vec_perm_const_p (mode, mode, indices))
9231 : 0 : return false;
9232 : 190 : }
9233 : : break;
9234 : :
9235 : 734 : case vect_step_op_mul:
9236 : 734 : {
9237 : : /* Check for backend support of MULT_EXPR. */
9238 : 734 : if (!directly_supported_p (MULT_EXPR, vectype))
9239 : : return false;
9240 : :
9241 : : /* ?? How to construct vector step for variable number vector.
9242 : : [ 1, step, pow (step, 2), pow (step, 4), .. ]. */
9243 : : if (!vf.is_constant ())
9244 : : return false;
9245 : : }
9246 : : break;
9247 : :
9248 : 7108 : case vect_step_op_shr:
9249 : : /* Check for backend support of RSHIFT_EXPR. */
9250 : 7108 : if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
9251 : : return false;
9252 : :
9253 : : /* Don't shift more than type precision to avoid UD. */
9254 : 26 : if (!tree_fits_uhwi_p (step_expr)
9255 : 26 : || maybe_ge (nunits * tree_to_uhwi (step_expr),
9256 : : TYPE_PRECISION (TREE_TYPE (init_expr))))
9257 : : return false;
9258 : : break;
9259 : :
9260 : 104 : case vect_step_op_shl:
9261 : : /* Check for backend support of RSHIFT_EXPR. */
9262 : 104 : if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
9263 : : return false;
9264 : :
9265 : : /* Don't shift more than type precision to avoid UD. */
9266 : 12 : if (!tree_fits_uhwi_p (step_expr)
9267 : 12 : || maybe_ge (nunits * tree_to_uhwi (step_expr),
9268 : : TYPE_PRECISION (TREE_TYPE (init_expr))))
9269 : : return false;
9270 : :
9271 : : break;
9272 : :
9273 : 0 : default:
9274 : 0 : gcc_unreachable ();
9275 : : }
9276 : :
9277 : 3152 : if (cost_vec) /* transformation not required. */
9278 : : {
9279 : 2236 : unsigned inside_cost = 0, prologue_cost = 0;
9280 : : /* loop cost for vec_loop. Neg induction doesn't have any
9281 : : inside_cost. */
9282 : 2236 : inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9283 : : slp_node, 0, vect_body);
9284 : :
9285 : : /* loop cost for vec_loop. Neg induction doesn't have any
9286 : : inside_cost. */
9287 : 2236 : if (induction_type == vect_step_op_neg)
9288 : 1712 : inside_cost = 0;
9289 : :
9290 : : /* prologue cost for vec_init and vec_step. */
9291 : 2236 : prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9292 : : slp_node, 0, vect_prologue);
9293 : :
9294 : 2236 : if (dump_enabled_p ())
9295 : 60 : dump_printf_loc (MSG_NOTE, vect_location,
9296 : : "vect_model_induction_cost: inside_cost = %d, "
9297 : : "prologue_cost = %d. \n", inside_cost,
9298 : : prologue_cost);
9299 : :
9300 : 2236 : SLP_TREE_TYPE (slp_node) = induc_vec_info_type;
9301 : 2236 : DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9302 : 2236 : return true;
9303 : : }
9304 : :
9305 : : /* Transform. */
9306 : :
9307 : : /* Compute a vector variable, initialized with the first VF values of
9308 : : the induction variable. E.g., for an iv with IV_PHI='X' and
9309 : : evolution S, for a vector of 4 units, we want to compute:
9310 : : [X, X + S, X + 2*S, X + 3*S]. */
9311 : :
9312 : 916 : if (dump_enabled_p ())
9313 : 32 : dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9314 : :
9315 : 916 : pe = loop_preheader_edge (iv_loop);
9316 : : /* Find the first insertion point in the BB. */
9317 : 916 : basic_block bb = gimple_bb (phi);
9318 : 916 : si = gsi_after_labels (bb);
9319 : :
9320 : 916 : gimple_seq stmts = NULL;
9321 : :
9322 : 916 : niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9323 : : /* If we are using the loop mask to "peel" for alignment then we need
9324 : : to adjust the start value here. */
9325 : 916 : if (niters_skip != NULL_TREE)
9326 : 0 : init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
9327 : : step_expr, induction_type, false);
9328 : :
9329 : 916 : vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
9330 : : step_expr, nunits, vectype,
9331 : : induction_type);
9332 : 916 : if (stmts)
9333 : : {
9334 : 162 : new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9335 : 162 : gcc_assert (!new_bb);
9336 : : }
9337 : :
9338 : 916 : stmts = NULL;
9339 : 916 : new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9340 : : vf, induction_type);
9341 : 916 : if (stmts)
9342 : : {
9343 : 0 : new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9344 : 0 : gcc_assert (!new_bb);
9345 : : }
9346 : :
9347 : 916 : vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9348 : : new_name, vectype,
9349 : : induction_type);
9350 : : /* Create the following def-use cycle:
9351 : : loop prolog:
9352 : : vec_init = ...
9353 : : vec_step = ...
9354 : : loop:
9355 : : vec_iv = PHI <vec_init, vec_loop>
9356 : : ...
9357 : : STMT
9358 : : ...
9359 : : vec_loop = vec_iv + vec_step; */
9360 : :
9361 : : /* Create the induction-phi that defines the induction-operand. */
9362 : 916 : vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9363 : 916 : induction_phi = create_phi_node (vec_dest, iv_loop->header);
9364 : 916 : induc_def = PHI_RESULT (induction_phi);
9365 : :
9366 : : /* Create the iv update inside the loop. */
9367 : 916 : stmts = NULL;
9368 : 916 : vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9369 : : induc_def, vec_step,
9370 : : induction_type);
9371 : :
9372 : 916 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9373 : 916 : new_stmt = SSA_NAME_DEF_STMT (vec_def);
9374 : :
9375 : : /* Set the arguments of the phi node: */
9376 : 916 : add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9377 : 916 : add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9378 : : UNKNOWN_LOCATION);
9379 : :
9380 : 916 : slp_node->push_vec_def (induction_phi);
9381 : :
9382 : : /* In case that vectorization factor (VF) is bigger than the number
9383 : : of elements that we can fit in a vectype (nunits), we have to generate
9384 : : more than one vector stmt - i.e - we need to "unroll" the
9385 : : vector stmt by a factor VF/nunits. For more details see documentation
9386 : : in vectorizable_operation. */
9387 : :
9388 : 916 : if (ncopies > 1)
9389 : : {
9390 : 286 : stmts = NULL;
9391 : : /* FORNOW. This restriction should be relaxed. */
9392 : 286 : gcc_assert (!nested_in_vect_loop);
9393 : :
9394 : 286 : new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9395 : : nunits, induction_type);
9396 : :
9397 : 286 : vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9398 : : new_name, vectype,
9399 : : induction_type);
9400 : 286 : vec_def = induc_def;
9401 : 1046 : for (i = 1; i < ncopies; i++)
9402 : : {
9403 : : /* vec_i = vec_prev + vec_step. */
9404 : 474 : stmts = NULL;
9405 : 474 : vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9406 : : vec_def, vec_step,
9407 : : induction_type);
9408 : 474 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9409 : 474 : new_stmt = SSA_NAME_DEF_STMT (vec_def);
9410 : 474 : slp_node->push_vec_def (new_stmt);
9411 : : }
9412 : : }
9413 : :
9414 : 916 : if (dump_enabled_p ())
9415 : 64 : dump_printf_loc (MSG_NOTE, vect_location,
9416 : : "transform induction: created def-use cycle: %G%G",
9417 : 32 : (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9418 : :
9419 : : return true;
9420 : : }
9421 : :
9422 : : /* Function vectorizable_induction
9423 : :
9424 : : Check if STMT_INFO performs an induction computation that can be vectorized.
9425 : : If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9426 : : phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9427 : : Return true if STMT_INFO is vectorizable in this way. */
9428 : :
9429 : : bool
9430 : 269090 : vectorizable_induction (loop_vec_info loop_vinfo,
9431 : : stmt_vec_info stmt_info,
9432 : : slp_tree slp_node, stmt_vector_for_cost *cost_vec)
9433 : : {
9434 : 269090 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9435 : 269090 : bool nested_in_vect_loop = false;
9436 : 269090 : class loop *iv_loop;
9437 : 269090 : tree vec_def;
9438 : 269090 : edge pe = loop_preheader_edge (loop);
9439 : 269090 : basic_block new_bb;
9440 : 269090 : tree vec_init = NULL_TREE, vec_step, t;
9441 : 269090 : tree new_name;
9442 : 269090 : gphi *induction_phi;
9443 : 269090 : tree induc_def, vec_dest;
9444 : 269090 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9445 : 269090 : unsigned i;
9446 : 269090 : tree expr;
9447 : 269090 : tree index_vectype = NULL_TREE;
9448 : 269090 : gimple_stmt_iterator si;
9449 : 269090 : enum vect_induction_op_type induction_type
9450 : : = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9451 : :
9452 : 292534 : gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9453 : 138545 : if (!phi)
9454 : : return false;
9455 : :
9456 : 138545 : if (!STMT_VINFO_RELEVANT_P (stmt_info))
9457 : : return false;
9458 : :
9459 : : /* Make sure it was recognized as induction computation. */
9460 : 138545 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
9461 : : return false;
9462 : :
9463 : : /* Handle nonlinear induction in a separate place. */
9464 : 134997 : if (induction_type != vect_step_op_add)
9465 : 10488 : return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
9466 : 10488 : slp_node, cost_vec);
9467 : :
9468 : 124509 : tree vectype = SLP_TREE_VECTYPE (slp_node);
9469 : 124509 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9470 : :
9471 : : /* FORNOW. These restrictions should be relaxed. */
9472 : 124509 : if (nested_in_vect_loop_p (loop, stmt_info))
9473 : : {
9474 : 602 : imm_use_iterator imm_iter;
9475 : 602 : use_operand_p use_p;
9476 : 602 : gimple *exit_phi;
9477 : 602 : edge latch_e;
9478 : 602 : tree loop_arg;
9479 : :
9480 : 602 : exit_phi = NULL;
9481 : 602 : latch_e = loop_latch_edge (loop->inner);
9482 : 602 : loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
9483 : 1848 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
9484 : : {
9485 : 654 : gimple *use_stmt = USE_STMT (use_p);
9486 : 654 : if (is_gimple_debug (use_stmt))
9487 : 36 : continue;
9488 : :
9489 : 618 : if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
9490 : : {
9491 : : exit_phi = use_stmt;
9492 : : break;
9493 : : }
9494 : 602 : }
9495 : 602 : if (exit_phi)
9496 : : {
9497 : 10 : stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
9498 : 10 : if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
9499 : 6 : && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
9500 : : {
9501 : 4 : if (dump_enabled_p ())
9502 : 4 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9503 : : "inner-loop induction only used outside "
9504 : : "of the outer vectorized loop.\n");
9505 : 4 : return false;
9506 : : }
9507 : : }
9508 : :
9509 : 598 : nested_in_vect_loop = true;
9510 : 598 : iv_loop = loop->inner;
9511 : : }
9512 : : else
9513 : : iv_loop = loop;
9514 : 124505 : gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9515 : :
9516 : 124505 : if (!nunits.is_constant () && SLP_TREE_LANES (slp_node) != 1)
9517 : : {
9518 : : /* The current SLP code creates the step value element-by-element. */
9519 : : if (dump_enabled_p ())
9520 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9521 : : "SLP induction not supported for variable-length"
9522 : : " vectors.\n");
9523 : : return false;
9524 : : }
9525 : :
9526 : 124505 : if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
9527 : : {
9528 : 12 : if (dump_enabled_p ())
9529 : 12 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9530 : : "floating point induction vectorization disabled\n");
9531 : 12 : return false;
9532 : : }
9533 : :
9534 : 124493 : tree step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9535 : 124493 : gcc_assert (step_expr != NULL_TREE);
9536 : 248940 : if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
9537 : 248848 : && !type_has_mode_precision_p (TREE_TYPE (step_expr)))
9538 : : {
9539 : 12 : if (dump_enabled_p ())
9540 : 12 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9541 : : "bit-precision induction vectorization not "
9542 : : "supported.\n");
9543 : 12 : return false;
9544 : : }
9545 : 124481 : tree stept = TREE_TYPE (step_expr);
9546 : 124481 : tree step_vectype = get_same_sized_vectype (stept, vectype);
9547 : 124481 : stept = TREE_TYPE (step_vectype);
9548 : :
9549 : : /* Check for target support of the vectorized arithmetic used here. */
9550 : 124481 : if (!target_supports_op_p (step_vectype, PLUS_EXPR, optab_default)
9551 : 124481 : || !target_supports_op_p (step_vectype, MINUS_EXPR, optab_default))
9552 : 19868 : return false;
9553 : 104613 : if (!nunits.is_constant ())
9554 : : {
9555 : : if (!target_supports_op_p (step_vectype, MULT_EXPR, optab_default))
9556 : : return false;
9557 : : /* FLOAT_EXPR when computing VEC_INIT for float inductions. */
9558 : : if (SCALAR_FLOAT_TYPE_P (stept))
9559 : : {
9560 : : tree index_type = build_nonstandard_integer_type
9561 : : (GET_MODE_BITSIZE (SCALAR_TYPE_MODE (stept)), 1);
9562 : :
9563 : : index_vectype = build_vector_type (index_type, nunits);
9564 : : if (!can_float_p (TYPE_MODE (step_vectype),
9565 : : TYPE_MODE (index_vectype), 1))
9566 : : return false;
9567 : : }
9568 : : }
9569 : :
9570 : 104613 : unsigned nvects = vect_get_num_copies (loop_vinfo, slp_node);
9571 : 104613 : if (cost_vec) /* transformation not required. */
9572 : : {
9573 : 268995 : unsigned inside_cost = 0, prologue_cost = 0;
9574 : : /* We eventually need to set a vector type on invariant
9575 : : arguments. */
9576 : : unsigned j;
9577 : : slp_tree child;
9578 : 268995 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9579 : 179330 : if (!vect_maybe_update_slp_op_vectype
9580 : 179330 : (child, SLP_TREE_VECTYPE (slp_node)))
9581 : : {
9582 : 0 : if (dump_enabled_p ())
9583 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9584 : : "incompatible vector types for "
9585 : : "invariants\n");
9586 : 0 : return false;
9587 : : }
9588 : : /* loop cost for vec_loop. */
9589 : 89665 : inside_cost = record_stmt_cost (cost_vec, nvects,
9590 : : vector_stmt, slp_node, 0, vect_body);
9591 : : /* prologue cost for vec_init (if not nested) and step. */
9592 : 89665 : prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
9593 : : scalar_to_vec,
9594 : : slp_node, 0, vect_prologue);
9595 : 89665 : if (dump_enabled_p ())
9596 : 3618 : dump_printf_loc (MSG_NOTE, vect_location,
9597 : : "vect_model_induction_cost: inside_cost = %d, "
9598 : : "prologue_cost = %d .\n", inside_cost,
9599 : : prologue_cost);
9600 : :
9601 : 89665 : SLP_TREE_TYPE (slp_node) = induc_vec_info_type;
9602 : 89665 : DUMP_VECT_SCOPE ("vectorizable_induction");
9603 : 89665 : return true;
9604 : : }
9605 : :
9606 : : /* Transform. */
9607 : :
9608 : : /* Compute a vector variable, initialized with the first VF values of
9609 : : the induction variable. E.g., for an iv with IV_PHI='X' and
9610 : : evolution S, for a vector of 4 units, we want to compute:
9611 : : [X, X + S, X + 2*S, X + 3*S]. */
9612 : :
9613 : 14948 : if (dump_enabled_p ())
9614 : 2503 : dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9615 : :
9616 : 14948 : pe = loop_preheader_edge (iv_loop);
9617 : : /* Find the first insertion point in the BB. */
9618 : 14948 : basic_block bb = gimple_bb (phi);
9619 : 14948 : si = gsi_after_labels (bb);
9620 : :
9621 : : /* For SLP induction we have to generate several IVs as for example
9622 : : with group size 3 we need
9623 : : [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
9624 : : [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
9625 : 14948 : gimple_stmt_iterator incr_si;
9626 : 14948 : bool insert_after;
9627 : 14948 : standard_iv_increment_position (iv_loop, &incr_si, &insert_after);
9628 : :
9629 : : /* The initial values are vectorized, but any lanes > group_size
9630 : : need adjustment. */
9631 : 14948 : slp_tree init_node
9632 : 14948 : = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
9633 : :
9634 : : /* Gather steps. Since we do not vectorize inductions as
9635 : : cycles we have to reconstruct the step from SCEV data. */
9636 : 14948 : unsigned group_size = SLP_TREE_LANES (slp_node);
9637 : 14948 : tree *steps = XALLOCAVEC (tree, group_size);
9638 : 14948 : tree *inits = XALLOCAVEC (tree, group_size);
9639 : 14948 : stmt_vec_info phi_info;
9640 : 46049 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
9641 : : {
9642 : 16153 : steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
9643 : 16153 : if (!init_node)
9644 : 15977 : inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
9645 : : pe->dest_idx);
9646 : : }
9647 : :
9648 : : /* Now generate the IVs. */
9649 : 29896 : gcc_assert (multiple_p (nunits * nvects, group_size));
9650 : 14948 : unsigned nivs;
9651 : 14948 : unsigned HOST_WIDE_INT const_nunits;
9652 : 14948 : if (nested_in_vect_loop)
9653 : : nivs = nvects;
9654 : 14786 : else if (nunits.is_constant (&const_nunits))
9655 : : {
9656 : : /* Compute the number of distinct IVs we need. First reduce
9657 : : group_size if it is a multiple of const_nunits so we get
9658 : : one IV for a group_size of 4 but const_nunits 2. */
9659 : 14786 : unsigned group_sizep = group_size;
9660 : 14786 : if (group_sizep % const_nunits == 0)
9661 : 109 : group_sizep = group_sizep / const_nunits;
9662 : 14786 : nivs = least_common_multiple (group_sizep, const_nunits) / const_nunits;
9663 : : }
9664 : : else
9665 : : {
9666 : : gcc_assert (SLP_TREE_LANES (slp_node) == 1);
9667 : : nivs = 1;
9668 : : }
9669 : 14948 : gimple_seq init_stmts = NULL;
9670 : 14948 : tree lupdate_mul = NULL_TREE;
9671 : 162 : if (!nested_in_vect_loop)
9672 : : {
9673 : 14786 : if (nunits.is_constant (&const_nunits))
9674 : : {
9675 : : /* The number of iterations covered in one vector iteration. */
9676 : 14786 : unsigned lup_mul = (nvects * const_nunits) / group_size;
9677 : 14786 : lupdate_mul
9678 : 14786 : = build_vector_from_val (step_vectype,
9679 : 14786 : SCALAR_FLOAT_TYPE_P (stept)
9680 : 27 : ? build_real_from_wide (stept, lup_mul,
9681 : : UNSIGNED)
9682 : 29545 : : build_int_cstu (stept, lup_mul));
9683 : : }
9684 : : else
9685 : : {
9686 : : if (SCALAR_FLOAT_TYPE_P (stept))
9687 : : {
9688 : : tree tem = build_int_cst (integer_type_node, vf);
9689 : : lupdate_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept, tem);
9690 : : }
9691 : : else
9692 : : lupdate_mul = build_int_cst (stept, vf);
9693 : : lupdate_mul = gimple_build_vector_from_val (&init_stmts, step_vectype,
9694 : : lupdate_mul);
9695 : : }
9696 : : }
9697 : 14948 : tree peel_mul = NULL_TREE;
9698 : 14948 : if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
9699 : : {
9700 : 0 : if (SCALAR_FLOAT_TYPE_P (stept))
9701 : 0 : peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
9702 : : LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9703 : : else
9704 : 0 : peel_mul = gimple_convert (&init_stmts, stept,
9705 : : LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9706 : 0 : peel_mul = gimple_build_vector_from_val (&init_stmts,
9707 : : step_vectype, peel_mul);
9708 : : }
9709 : 14948 : tree step_mul = NULL_TREE;
9710 : 14948 : unsigned ivn;
9711 : 14948 : auto_vec<tree> vec_steps;
9712 : 30462 : for (ivn = 0; ivn < nivs; ++ivn)
9713 : : {
9714 : 15514 : gimple_seq stmts = NULL;
9715 : 15514 : bool invariant = true;
9716 : 15514 : if (nunits.is_constant (&const_nunits))
9717 : : {
9718 : 15514 : tree_vector_builder step_elts (step_vectype, const_nunits, 1);
9719 : 15514 : tree_vector_builder init_elts (vectype, const_nunits, 1);
9720 : 15514 : tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
9721 : 101820 : for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
9722 : : {
9723 : : /* The scalar steps of the IVs. */
9724 : 86306 : tree elt = steps[(ivn*const_nunits + eltn) % group_size];
9725 : 86306 : elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
9726 : 86306 : step_elts.quick_push (elt);
9727 : 86306 : if (!init_node)
9728 : : {
9729 : : /* The scalar inits of the IVs if not vectorized. */
9730 : 85344 : elt = inits[(ivn*const_nunits + eltn) % group_size];
9731 : 85344 : if (!useless_type_conversion_p (TREE_TYPE (vectype),
9732 : 85344 : TREE_TYPE (elt)))
9733 : 220 : elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
9734 : 220 : TREE_TYPE (vectype), elt);
9735 : 85344 : init_elts.quick_push (elt);
9736 : : }
9737 : : /* The number of steps to add to the initial values. */
9738 : 86306 : unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
9739 : 172612 : mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
9740 : 172514 : ? build_real_from_wide (stept, mul_elt,
9741 : : UNSIGNED)
9742 : 172514 : : build_int_cstu (stept, mul_elt));
9743 : : }
9744 : 15514 : vec_step = gimple_build_vector (&init_stmts, &step_elts);
9745 : 15514 : step_mul = gimple_build_vector (&init_stmts, &mul_elts);
9746 : 15514 : if (!init_node)
9747 : 15320 : vec_init = gimple_build_vector (&init_stmts, &init_elts);
9748 : 15514 : }
9749 : : else
9750 : : {
9751 : : if (init_node)
9752 : : ;
9753 : : else if (INTEGRAL_TYPE_P (TREE_TYPE (steps[0])))
9754 : : {
9755 : : new_name = gimple_convert (&init_stmts, stept, inits[0]);
9756 : : /* Build the initial value directly as a VEC_SERIES_EXPR. */
9757 : : vec_init = gimple_build (&init_stmts, VEC_SERIES_EXPR,
9758 : : step_vectype, new_name, steps[0]);
9759 : : if (!useless_type_conversion_p (vectype, step_vectype))
9760 : : vec_init = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
9761 : : vectype, vec_init);
9762 : : }
9763 : : else
9764 : : {
9765 : : /* Build:
9766 : : [base, base, base, ...]
9767 : : + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
9768 : : gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (steps[0])));
9769 : : gcc_assert (flag_associative_math);
9770 : : gcc_assert (index_vectype != NULL_TREE);
9771 : :
9772 : : tree index = build_index_vector (index_vectype, 0, 1);
9773 : : new_name = gimple_convert (&init_stmts, TREE_TYPE (steps[0]),
9774 : : inits[0]);
9775 : : tree base_vec = gimple_build_vector_from_val (&init_stmts,
9776 : : step_vectype,
9777 : : new_name);
9778 : : tree step_vec = gimple_build_vector_from_val (&init_stmts,
9779 : : step_vectype,
9780 : : steps[0]);
9781 : : vec_init = gimple_build (&init_stmts, FLOAT_EXPR,
9782 : : step_vectype, index);
9783 : : vec_init = gimple_build (&init_stmts, MULT_EXPR,
9784 : : step_vectype, vec_init, step_vec);
9785 : : vec_init = gimple_build (&init_stmts, PLUS_EXPR,
9786 : : step_vectype, vec_init, base_vec);
9787 : : if (!useless_type_conversion_p (vectype, step_vectype))
9788 : : vec_init = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
9789 : : vectype, vec_init);
9790 : : }
9791 : : /* iv_loop is nested in the loop to be vectorized. Generate:
9792 : : vec_step = [S, S, S, S] */
9793 : : t = unshare_expr (steps[0]);
9794 : : gcc_assert (CONSTANT_CLASS_P (t)
9795 : : || TREE_CODE (t) == SSA_NAME);
9796 : : vec_step = gimple_build_vector_from_val (&init_stmts,
9797 : : step_vectype, t);
9798 : : }
9799 : 15514 : vec_steps.safe_push (vec_step);
9800 : 15514 : if (peel_mul)
9801 : : {
9802 : 0 : if (!step_mul)
9803 : : {
9804 : 0 : gcc_assert (!nunits.is_constant ());
9805 : : step_mul = gimple_build (&init_stmts,
9806 : : MINUS_EXPR, step_vectype,
9807 : : build_zero_cst (step_vectype), peel_mul);
9808 : : }
9809 : : else
9810 : 0 : step_mul = gimple_build (&init_stmts,
9811 : : MINUS_EXPR, step_vectype,
9812 : : step_mul, peel_mul);
9813 : : }
9814 : :
9815 : : /* Create the induction-phi that defines the induction-operand. */
9816 : 15514 : vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
9817 : : "vec_iv_");
9818 : 15514 : induction_phi = create_phi_node (vec_dest, iv_loop->header);
9819 : 15514 : induc_def = PHI_RESULT (induction_phi);
9820 : :
9821 : : /* Create the iv update inside the loop */
9822 : 15514 : tree up = vec_step;
9823 : 15514 : if (lupdate_mul)
9824 : : {
9825 : 15320 : if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
9826 : : {
9827 : : /* When we're using loop_len produced by SELEC_VL, the
9828 : : non-final iterations are not always processing VF
9829 : : elements. So vectorize induction variable instead of
9830 : :
9831 : : _21 = vect_vec_iv_.6_22 + { VF, ... };
9832 : :
9833 : : We should generate:
9834 : :
9835 : : _35 = .SELECT_VL (ivtmp_33, VF);
9836 : : vect_cst__22 = [vec_duplicate_expr] _35;
9837 : : _21 = vect_vec_iv_.6_22 + vect_cst__22; */
9838 : 0 : vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
9839 : 0 : tree len = vect_get_loop_len (loop_vinfo, NULL, lens, 1,
9840 : : vectype, 0, 0);
9841 : 0 : if (SCALAR_FLOAT_TYPE_P (stept))
9842 : 0 : expr = gimple_build (&stmts, FLOAT_EXPR, stept, len);
9843 : : else
9844 : 0 : expr = gimple_convert (&stmts, stept, len);
9845 : 0 : lupdate_mul = gimple_build_vector_from_val (&stmts, step_vectype,
9846 : : expr);
9847 : 0 : up = gimple_build (&stmts, MULT_EXPR,
9848 : : step_vectype, vec_step, lupdate_mul);
9849 : : }
9850 : : else
9851 : 15320 : up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9852 : : vec_step, lupdate_mul);
9853 : : }
9854 : 15514 : vec_def = gimple_convert (&stmts, step_vectype, induc_def);
9855 : 15514 : vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, up);
9856 : 15514 : vec_def = gimple_convert (&stmts, vectype, vec_def);
9857 : 15514 : insert_iv_increment (&incr_si, insert_after, stmts);
9858 : 15514 : add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9859 : : UNKNOWN_LOCATION);
9860 : :
9861 : 15514 : if (init_node)
9862 : 194 : vec_init = vect_get_slp_vect_def (init_node, ivn);
9863 : 15514 : if (!nested_in_vect_loop
9864 : 15514 : && step_mul
9865 : 15514 : && !integer_zerop (step_mul))
9866 : : {
9867 : 14886 : gcc_assert (invariant);
9868 : 14886 : vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
9869 : 14886 : up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9870 : : vec_step, step_mul);
9871 : 14886 : vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
9872 : : vec_def, up);
9873 : 14886 : vec_init = gimple_convert (&init_stmts, vectype, vec_def);
9874 : : }
9875 : :
9876 : : /* Set the arguments of the phi node: */
9877 : 15514 : add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9878 : :
9879 : 15514 : slp_node->push_vec_def (induction_phi);
9880 : : }
9881 : 14948 : if (!nested_in_vect_loop)
9882 : : {
9883 : : /* Fill up to the number of vectors we need for the whole group. */
9884 : 14786 : if (nunits.is_constant (&const_nunits))
9885 : 14786 : nivs = least_common_multiple (group_size, const_nunits) / const_nunits;
9886 : : else
9887 : : nivs = 1;
9888 : 14786 : vec_steps.reserve (nivs-ivn);
9889 : 29593 : for (; ivn < nivs; ++ivn)
9890 : : {
9891 : 21 : slp_node->push_vec_def (SLP_TREE_VEC_DEFS (slp_node)[0]);
9892 : 21 : vec_steps.quick_push (vec_steps[0]);
9893 : : }
9894 : : }
9895 : :
9896 : : /* Re-use IVs when we can. We are generating further vector
9897 : : stmts by adding VF' * stride to the IVs generated above. */
9898 : 14948 : if (ivn < nvects)
9899 : : {
9900 : 3305 : if (nunits.is_constant (&const_nunits))
9901 : : {
9902 : 3305 : unsigned vfp = (least_common_multiple (group_size, const_nunits)
9903 : 3305 : / group_size);
9904 : 3305 : lupdate_mul
9905 : 3305 : = build_vector_from_val (step_vectype,
9906 : 3305 : SCALAR_FLOAT_TYPE_P (stept)
9907 : 8 : ? build_real_from_wide (stept,
9908 : 8 : vfp, UNSIGNED)
9909 : 6602 : : build_int_cstu (stept, vfp));
9910 : : }
9911 : : else
9912 : : {
9913 : : if (SCALAR_FLOAT_TYPE_P (stept))
9914 : : {
9915 : : tree tem = build_int_cst (integer_type_node, nunits);
9916 : : lupdate_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept, tem);
9917 : : }
9918 : : else
9919 : : lupdate_mul = build_int_cst (stept, nunits);
9920 : : lupdate_mul = gimple_build_vector_from_val (&init_stmts, step_vectype,
9921 : : lupdate_mul);
9922 : : }
9923 : 10684 : for (; ivn < nvects; ++ivn)
9924 : : {
9925 : 7379 : gimple *iv
9926 : 7379 : = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node)[ivn - nivs]);
9927 : 7379 : tree def = gimple_get_lhs (iv);
9928 : 7379 : if (ivn < 2*nivs)
9929 : 3397 : vec_steps[ivn - nivs]
9930 : 3397 : = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9931 : 3397 : vec_steps[ivn - nivs], lupdate_mul);
9932 : 7379 : gimple_seq stmts = NULL;
9933 : 7379 : def = gimple_convert (&stmts, step_vectype, def);
9934 : 22137 : def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
9935 : 7379 : def, vec_steps[ivn % nivs]);
9936 : 7379 : def = gimple_convert (&stmts, vectype, def);
9937 : 7379 : if (gimple_code (iv) == GIMPLE_PHI)
9938 : 3397 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9939 : : else
9940 : : {
9941 : 3982 : gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
9942 : 3982 : gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
9943 : : }
9944 : 7379 : slp_node->push_vec_def (def);
9945 : : }
9946 : : }
9947 : :
9948 : 14948 : new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
9949 : 14948 : gcc_assert (!new_bb);
9950 : :
9951 : 14948 : return true;
9952 : 14948 : }
9953 : :
9954 : : /* Function vectorizable_live_operation_1.
9955 : :
9956 : : helper function for vectorizable_live_operation. */
9957 : :
9958 : : static tree
9959 : 2680 : vectorizable_live_operation_1 (loop_vec_info loop_vinfo, basic_block exit_bb,
9960 : : tree vectype, slp_tree slp_node,
9961 : : tree bitsize, tree bitstart, tree vec_lhs,
9962 : : tree lhs_type, gimple_stmt_iterator *exit_gsi)
9963 : : {
9964 : 2680 : gcc_assert (single_pred_p (exit_bb) || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
9965 : :
9966 : 2680 : tree vec_lhs_phi = copy_ssa_name (vec_lhs);
9967 : 2680 : gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
9968 : 5360 : for (unsigned i = 0; i < gimple_phi_num_args (phi); i++)
9969 : 2680 : SET_PHI_ARG_DEF (phi, i, vec_lhs);
9970 : :
9971 : 2680 : gimple_seq stmts = NULL;
9972 : 2680 : tree new_tree;
9973 : :
9974 : : /* If bitstart is 0 then we can use a BIT_FIELD_REF */
9975 : 2680 : if (integer_zerop (bitstart))
9976 : : {
9977 : 161 : tree scalar_res = gimple_build (&stmts, BIT_FIELD_REF, TREE_TYPE (vectype),
9978 : : vec_lhs_phi, bitsize, bitstart);
9979 : :
9980 : : /* Convert the extracted vector element to the scalar type. */
9981 : 161 : new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
9982 : : }
9983 : 2519 : else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
9984 : : {
9985 : : /* Emit:
9986 : :
9987 : : SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN - (BIAS + 1)>
9988 : :
9989 : : where VEC_LHS is the vectorized live-out result, LEN is the length of
9990 : : the vector, BIAS is the load-store bias. The bias should not be used
9991 : : at all since we are not using load/store operations, but LEN will be
9992 : : REALLEN + BIAS, so subtract it to get to the correct position. */
9993 : 0 : gcc_assert (SLP_TREE_LANES (slp_node) == 1);
9994 : 0 : gimple_seq tem = NULL;
9995 : 0 : gimple_stmt_iterator gsi = gsi_last (tem);
9996 : 0 : tree len = vect_get_loop_len (loop_vinfo, &gsi,
9997 : : &LOOP_VINFO_LENS (loop_vinfo),
9998 : : 1, vectype, 0, 1);
9999 : 0 : gimple_seq_add_seq (&stmts, tem);
10000 : :
10001 : : /* BIAS + 1. */
10002 : 0 : signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
10003 : 0 : tree bias_plus_one
10004 : 0 : = int_const_binop (PLUS_EXPR,
10005 : 0 : build_int_cst (TREE_TYPE (len), biasval),
10006 : 0 : build_one_cst (TREE_TYPE (len)));
10007 : :
10008 : : /* LAST_INDEX = LEN - (BIAS + 1). */
10009 : 0 : tree last_index = gimple_build (&stmts, MINUS_EXPR, TREE_TYPE (len),
10010 : : len, bias_plus_one);
10011 : :
10012 : : /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN - (BIAS + 1)>. */
10013 : 0 : tree scalar_res
10014 : 0 : = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
10015 : : vec_lhs_phi, last_index);
10016 : :
10017 : : /* Convert the extracted vector element to the scalar type. */
10018 : 0 : new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10019 : : }
10020 : 2519 : else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
10021 : : {
10022 : : /* Emit:
10023 : :
10024 : : SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
10025 : :
10026 : : where VEC_LHS is the vectorized live-out result and MASK is
10027 : : the loop mask for the final iteration. */
10028 : 0 : gcc_assert (SLP_TREE_LANES (slp_node) == 1);
10029 : 0 : tree scalar_type = TREE_TYPE (vectype);
10030 : 0 : gimple_seq tem = NULL;
10031 : 0 : gimple_stmt_iterator gsi = gsi_last (tem);
10032 : 0 : tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
10033 : : &LOOP_VINFO_MASKS (loop_vinfo),
10034 : : 1, vectype, 0);
10035 : 0 : tree scalar_res;
10036 : 0 : gimple_seq_add_seq (&stmts, tem);
10037 : :
10038 : 0 : scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
10039 : : mask, vec_lhs_phi);
10040 : :
10041 : : /* Convert the extracted vector element to the scalar type. */
10042 : 0 : new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10043 : : }
10044 : : else
10045 : : {
10046 : 2519 : tree bftype = TREE_TYPE (vectype);
10047 : 2519 : if (VECTOR_BOOLEAN_TYPE_P (vectype))
10048 : 85 : bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10049 : 2519 : new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
10050 : 2519 : new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10051 : : &stmts, true, NULL_TREE);
10052 : : }
10053 : :
10054 : 2680 : *exit_gsi = gsi_after_labels (exit_bb);
10055 : 2680 : if (stmts)
10056 : 2680 : gsi_insert_seq_before (exit_gsi, stmts, GSI_SAME_STMT);
10057 : :
10058 : 2680 : return new_tree;
10059 : : }
10060 : :
10061 : : /* Function vectorizable_live_operation.
10062 : :
10063 : : STMT_INFO computes a value that is used outside the loop. Check if
10064 : : it can be supported. */
10065 : :
10066 : : bool
10067 : 234390 : vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
10068 : : slp_tree slp_node, slp_instance slp_node_instance,
10069 : : int slp_index, bool vec_stmt_p,
10070 : : stmt_vector_for_cost *cost_vec)
10071 : : {
10072 : 234390 : loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
10073 : 234390 : imm_use_iterator imm_iter;
10074 : 234390 : tree lhs, lhs_type, bitsize;
10075 : 234390 : tree vectype = SLP_TREE_VECTYPE (slp_node);
10076 : 234390 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10077 : 234390 : gimple *use_stmt;
10078 : 234390 : use_operand_p use_p;
10079 : 234390 : auto_vec<tree> vec_oprnds;
10080 : 234390 : int vec_entry = 0;
10081 : 234390 : poly_uint64 vec_index = 0;
10082 : :
10083 : 234390 : gcc_assert (STMT_VINFO_LIVE_P (stmt_info)
10084 : : || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10085 : :
10086 : : /* If a stmt of a reduction is live, vectorize it via
10087 : : vect_create_epilog_for_reduction. vectorizable_reduction assessed
10088 : : validity so just trigger the transform here. */
10089 : 234390 : if (vect_is_reduction (slp_node))
10090 : : {
10091 : 56953 : if (!vec_stmt_p)
10092 : : return true;
10093 : : /* For SLP reductions we vectorize the epilogue for all involved stmts
10094 : : together. For SLP reduction chains we only get here once. */
10095 : 23270 : if (SLP_INSTANCE_KIND (slp_node_instance) == slp_inst_kind_reduc_group
10096 : 23027 : && slp_index != 0)
10097 : : return true;
10098 : 22807 : vect_reduc_info reduc_info = info_for_reduction (loop_vinfo, slp_node);
10099 : 22807 : if (VECT_REDUC_INFO_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
10100 : 22807 : || VECT_REDUC_INFO_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
10101 : : return true;
10102 : :
10103 : 21977 : if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
10104 : 21977 : || !LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
10105 : 21973 : vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
10106 : : slp_node_instance,
10107 : : LOOP_VINFO_IV_EXIT (loop_vinfo));
10108 : :
10109 : : /* If early break we only have to materialize the reduction on the merge
10110 : : block, but we have to find an alternate exit first. */
10111 : 21977 : if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
10112 : : {
10113 : 23 : slp_tree phis_node = slp_node_instance->reduc_phis;
10114 : 23 : stmt_info = SLP_TREE_REPRESENTATIVE (phis_node);
10115 : 69 : for (auto exit : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
10116 : 23 : if (exit != LOOP_VINFO_IV_EXIT (loop_vinfo))
10117 : : {
10118 : 23 : vect_create_epilog_for_reduction (loop_vinfo, stmt_info,
10119 : : phis_node, slp_node_instance,
10120 : : exit);
10121 : 23 : break;
10122 : 23 : }
10123 : 23 : if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
10124 : 4 : vect_create_epilog_for_reduction (loop_vinfo, stmt_info,
10125 : : phis_node, slp_node_instance,
10126 : : LOOP_VINFO_IV_EXIT (loop_vinfo));
10127 : : }
10128 : :
10129 : 21977 : return true;
10130 : : }
10131 : :
10132 : : /* If STMT is not relevant and it is a simple assignment and its inputs are
10133 : : invariant then it can remain in place, unvectorized. The original last
10134 : : scalar value that it computes will be used. */
10135 : 177437 : if (!STMT_VINFO_RELEVANT_P (stmt_info))
10136 : : {
10137 : 0 : gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
10138 : 0 : if (dump_enabled_p ())
10139 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
10140 : : "statement is simple and uses invariant. Leaving in "
10141 : : "place.\n");
10142 : 0 : return true;
10143 : : }
10144 : :
10145 : 177437 : gcc_assert (slp_index >= 0);
10146 : :
10147 : : /* Get the last occurrence of the scalar index from the concatenation of
10148 : : all the slp vectors. Calculate which slp vector it is and the index
10149 : : within. */
10150 : 177437 : int num_scalar = SLP_TREE_LANES (slp_node);
10151 : 177437 : int num_vec = vect_get_num_copies (vinfo, slp_node);
10152 : 177437 : poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
10153 : :
10154 : : /* Calculate which vector contains the result, and which lane of
10155 : : that vector we need. */
10156 : 177437 : if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
10157 : : {
10158 : : if (dump_enabled_p ())
10159 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10160 : : "Cannot determine which vector holds the"
10161 : : " final result.\n");
10162 : : return false;
10163 : : }
10164 : :
10165 : 177437 : if (!vec_stmt_p)
10166 : : {
10167 : : /* No transformation required. */
10168 : 138445 : if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
10169 : : {
10170 : 26901 : if (SLP_TREE_LANES (slp_node) != 1)
10171 : : {
10172 : 15 : if (dump_enabled_p ())
10173 : 15 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10174 : : "can't operate on partial vectors "
10175 : : "because an SLP statement is live after "
10176 : : "the loop.\n");
10177 : 15 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10178 : : }
10179 : 26886 : else if (num_vec > 1)
10180 : : {
10181 : 15853 : if (dump_enabled_p ())
10182 : 46 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10183 : : "can't operate on partial vectors "
10184 : : "because ncopies is greater than 1.\n");
10185 : 15853 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10186 : : }
10187 : : else
10188 : : {
10189 : 11033 : if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
10190 : : OPTIMIZE_FOR_SPEED))
10191 : 0 : vect_record_loop_mask (loop_vinfo,
10192 : : &LOOP_VINFO_MASKS (loop_vinfo),
10193 : : 1, vectype, NULL);
10194 : 11033 : else if (can_vec_extract_var_idx_p (
10195 : 11033 : TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
10196 : 0 : vect_record_loop_len (loop_vinfo,
10197 : : &LOOP_VINFO_LENS (loop_vinfo),
10198 : : 1, vectype, 1);
10199 : : else
10200 : : {
10201 : 11033 : if (dump_enabled_p ())
10202 : 517 : dump_printf_loc (
10203 : 517 : MSG_MISSED_OPTIMIZATION, vect_location,
10204 : : "can't operate on partial vectors "
10205 : : "because the target doesn't support extract "
10206 : : "last reduction.\n");
10207 : 11033 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10208 : : }
10209 : : }
10210 : : }
10211 : : /* ??? Enable for loop costing as well. */
10212 : 26901 : if (!loop_vinfo)
10213 : 89069 : record_stmt_cost (cost_vec, 1, vec_to_scalar, slp_node,
10214 : : 0, vect_epilogue);
10215 : 138445 : return true;
10216 : : }
10217 : :
10218 : : /* Use the lhs of the original scalar statement. */
10219 : 38992 : gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
10220 : 38992 : if (dump_enabled_p ())
10221 : 928 : dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
10222 : : "stmt %G", stmt);
10223 : :
10224 : 38992 : lhs = gimple_get_lhs (stmt);
10225 : 38992 : lhs_type = TREE_TYPE (lhs);
10226 : :
10227 : 38992 : bitsize = vector_element_bits_tree (vectype);
10228 : :
10229 : : /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
10230 : 38992 : gcc_assert (!loop_vinfo
10231 : : || ((!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
10232 : : && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10233 : : || SLP_TREE_LANES (slp_node) == 1));
10234 : :
10235 : : /* Get the correct slp vectorized stmt. */
10236 : 38992 : tree vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
10237 : 38992 : gimple *vec_stmt = SSA_NAME_DEF_STMT (vec_lhs);
10238 : :
10239 : : /* In case we need to early break vectorize also get the first stmt. */
10240 : 38992 : tree vec_lhs0 = SLP_TREE_VEC_DEFS (slp_node)[0];
10241 : :
10242 : : /* Get entry to use. */
10243 : 38992 : tree bitstart = bitsize_int (vec_index);
10244 : 38992 : bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
10245 : :
10246 : 38992 : if (loop_vinfo)
10247 : : {
10248 : : /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
10249 : : requirement, insert one phi node for it. It looks like:
10250 : : loop;
10251 : : BB:
10252 : : # lhs' = PHI <lhs>
10253 : : ==>
10254 : : loop;
10255 : : BB:
10256 : : # vec_lhs' = PHI <vec_lhs>
10257 : : new_tree = lane_extract <vec_lhs', ...>;
10258 : : lhs' = new_tree; */
10259 : :
10260 : 2753 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10261 : : /* Check if we have a loop where the chosen exit is not the main exit,
10262 : : in these cases for an early break we restart the iteration the vector code
10263 : : did. For the live values we want the value at the start of the iteration
10264 : : rather than at the end. */
10265 : 2753 : edge main_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
10266 : 2753 : bool all_exits_as_early_p = LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo);
10267 : 14380 : FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10268 : 8874 : if (!is_gimple_debug (use_stmt)
10269 : 8874 : && !flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
10270 : 2680 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10271 : : {
10272 : 2680 : edge e = gimple_phi_arg_edge (as_a <gphi *> (use_stmt),
10273 : 2680 : phi_arg_index_from_use (use_p));
10274 : 2680 : gcc_assert (loop_exit_edge_p (loop, e));
10275 : 2680 : bool main_exit_edge = e == main_e;
10276 : 2680 : tree tmp_vec_lhs = vec_lhs;
10277 : 2680 : tree tmp_bitstart = bitstart;
10278 : :
10279 : : /* For early exit where the exit is not in the BB that leads
10280 : : to the latch then we're restarting the iteration in the
10281 : : scalar loop. So get the first live value. */
10282 : 2680 : bool early_break_first_element_p
10283 : 2680 : = all_exits_as_early_p || !main_exit_edge;
10284 : 2680 : if (early_break_first_element_p)
10285 : : {
10286 : 143 : tmp_vec_lhs = vec_lhs0;
10287 : 143 : tmp_bitstart = build_zero_cst (TREE_TYPE (bitstart));
10288 : : }
10289 : :
10290 : 2680 : gimple_stmt_iterator exit_gsi;
10291 : 2680 : tree new_tree
10292 : 2680 : = vectorizable_live_operation_1 (loop_vinfo,
10293 : : e->dest, vectype,
10294 : : slp_node, bitsize,
10295 : : tmp_bitstart, tmp_vec_lhs,
10296 : : lhs_type, &exit_gsi);
10297 : :
10298 : 2680 : auto gsi = gsi_for_stmt (use_stmt);
10299 : 2680 : tree lhs_phi = gimple_phi_result (use_stmt);
10300 : 2680 : remove_phi_node (&gsi, false);
10301 : 2680 : gimple *copy = gimple_build_assign (lhs_phi, new_tree);
10302 : 2680 : gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
10303 : 2680 : break;
10304 : 2753 : }
10305 : :
10306 : : /* There a no further out-of-loop uses of lhs by LC-SSA construction. */
10307 : 11700 : FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10308 : 6194 : gcc_assert (is_gimple_debug (use_stmt)
10309 : 2753 : || flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)));
10310 : : }
10311 : : else
10312 : : {
10313 : : /* For basic-block vectorization simply insert the lane-extraction. */
10314 : 36239 : tree bftype = TREE_TYPE (vectype);
10315 : 36239 : if (VECTOR_BOOLEAN_TYPE_P (vectype))
10316 : 0 : bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10317 : 36239 : tree new_tree = build3 (BIT_FIELD_REF, bftype,
10318 : : vec_lhs, bitsize, bitstart);
10319 : 36239 : gimple_seq stmts = NULL;
10320 : 36239 : new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10321 : : &stmts, true, NULL_TREE);
10322 : 36239 : if (TREE_CODE (new_tree) == SSA_NAME
10323 : 72478 : && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
10324 : 2 : SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
10325 : 36239 : if (is_a <gphi *> (vec_stmt))
10326 : : {
10327 : 2617 : gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
10328 : 2617 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10329 : : }
10330 : : else
10331 : : {
10332 : 33622 : gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
10333 : 33622 : gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
10334 : : }
10335 : :
10336 : : /* Replace use of lhs with newly computed result. If the use stmt is a
10337 : : single arg PHI, just replace all uses of PHI result. It's necessary
10338 : : because lcssa PHI defining lhs may be before newly inserted stmt. */
10339 : 36239 : use_operand_p use_p;
10340 : 36239 : stmt_vec_info use_stmt_info;
10341 : 235504 : FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10342 : 163026 : if (!is_gimple_debug (use_stmt)
10343 : 163026 : && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
10344 : 108688 : || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
10345 : : {
10346 : : /* ??? This can happen when the live lane ends up being
10347 : : rooted in a vector construction code-generated by an
10348 : : external SLP node (and code-generation for that already
10349 : : happened). See gcc.dg/vect/bb-slp-47.c.
10350 : : Doing this is what would happen if that vector CTOR
10351 : : were not code-generated yet so it is not too bad.
10352 : : ??? In fact we'd likely want to avoid this situation
10353 : : in the first place. */
10354 : 63272 : if (TREE_CODE (new_tree) == SSA_NAME
10355 : 63008 : && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10356 : 63008 : && gimple_code (use_stmt) != GIMPLE_PHI
10357 : 118307 : && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
10358 : : use_stmt))
10359 : : {
10360 : 264 : if (dump_enabled_p ())
10361 : 20 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10362 : : "Using original scalar computation for "
10363 : : "live lane because use preceeds vector "
10364 : : "def\n");
10365 : 264 : continue;
10366 : : }
10367 : : /* ??? It can also happen that we end up pulling a def into
10368 : : a loop where replacing out-of-loop uses would require
10369 : : a new LC SSA PHI node. Retain the original scalar in
10370 : : those cases as well. PR98064. */
10371 : 64335 : if (TREE_CODE (new_tree) == SSA_NAME
10372 : 62744 : && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10373 : 62744 : && (gimple_bb (use_stmt)->loop_father
10374 : 62744 : != gimple_bb (vec_stmt)->loop_father)
10375 : 69793 : && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
10376 : 7049 : gimple_bb (use_stmt)->loop_father))
10377 : : {
10378 : 1591 : if (dump_enabled_p ())
10379 : 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10380 : : "Using original scalar computation for "
10381 : : "live lane because there is an out-of-loop "
10382 : : "definition for it\n");
10383 : 1591 : continue;
10384 : : }
10385 : 188053 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10386 : 63450 : SET_USE (use_p, new_tree);
10387 : 61153 : update_stmt (use_stmt);
10388 : 36239 : }
10389 : : }
10390 : :
10391 : : return true;
10392 : 234390 : }
10393 : :
10394 : : /* Given loop represented by LOOP_VINFO, return true if computation of
10395 : : LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
10396 : : otherwise. */
10397 : :
10398 : : static bool
10399 : 60465 : loop_niters_no_overflow (loop_vec_info loop_vinfo)
10400 : : {
10401 : : /* Constant case. */
10402 : 60465 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
10403 : : {
10404 : 35164 : tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
10405 : 35164 : tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
10406 : :
10407 : 35164 : gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
10408 : 35164 : gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
10409 : 35164 : if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
10410 : : return true;
10411 : : }
10412 : :
10413 : 25301 : widest_int max;
10414 : 25301 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10415 : : /* Check the upper bound of loop niters. */
10416 : 25301 : if (get_max_loop_iterations (loop, &max))
10417 : : {
10418 : 25301 : tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
10419 : 25301 : signop sgn = TYPE_SIGN (type);
10420 : 25301 : widest_int type_max = widest_int::from (wi::max_value (type), sgn);
10421 : 25301 : if (max < type_max)
10422 : 25080 : return true;
10423 : 25301 : }
10424 : : return false;
10425 : 25301 : }
10426 : :
10427 : : /* Return a mask type with half the number of elements as OLD_TYPE,
10428 : : given that it should have mode NEW_MODE. */
10429 : :
10430 : : tree
10431 : 3920 : vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
10432 : : {
10433 : 3920 : poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
10434 : 3920 : return build_truth_vector_type_for_mode (nunits, new_mode);
10435 : : }
10436 : :
10437 : : /* Return a mask type with twice as many elements as OLD_TYPE,
10438 : : given that it should have mode NEW_MODE. */
10439 : :
10440 : : tree
10441 : 5911 : vect_double_mask_nunits (tree old_type, machine_mode new_mode)
10442 : : {
10443 : 5911 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
10444 : 5911 : return build_truth_vector_type_for_mode (nunits, new_mode);
10445 : : }
10446 : :
10447 : : /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
10448 : : contain a sequence of NVECTORS masks that each control a vector of type
10449 : : VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
10450 : : these vector masks with the vector version of SCALAR_MASK. */
10451 : :
10452 : : void
10453 : 77794 : vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
10454 : : unsigned int nvectors, tree vectype, tree scalar_mask)
10455 : : {
10456 : 77794 : gcc_assert (nvectors != 0);
10457 : :
10458 : 77794 : if (scalar_mask)
10459 : : {
10460 : 3508 : scalar_cond_masked_key cond (scalar_mask, nvectors);
10461 : 3508 : loop_vinfo->scalar_cond_masked_set.add (cond);
10462 : : }
10463 : :
10464 : 77794 : masks->mask_set.add (std::make_pair (vectype, nvectors));
10465 : 77794 : }
10466 : :
10467 : : /* Given a complete set of masks MASKS, extract mask number INDEX
10468 : : for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10469 : : where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
10470 : :
10471 : : See the comment above vec_loop_masks for more details about the mask
10472 : : arrangement. */
10473 : :
10474 : : tree
10475 : 208 : vect_get_loop_mask (loop_vec_info loop_vinfo,
10476 : : gimple_stmt_iterator *gsi, vec_loop_masks *masks,
10477 : : unsigned int nvectors, tree vectype, unsigned int index)
10478 : : {
10479 : 208 : if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
10480 : : == vect_partial_vectors_while_ult)
10481 : : {
10482 : 0 : rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
10483 : 0 : tree mask_type = rgm->type;
10484 : :
10485 : : /* Populate the rgroup's mask array, if this is the first time we've
10486 : : used it. */
10487 : 0 : if (rgm->controls.is_empty ())
10488 : : {
10489 : 0 : rgm->controls.safe_grow_cleared (nvectors, true);
10490 : 0 : for (unsigned int i = 0; i < nvectors; ++i)
10491 : : {
10492 : 0 : tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
10493 : : /* Provide a dummy definition until the real one is available. */
10494 : 0 : SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
10495 : 0 : rgm->controls[i] = mask;
10496 : : }
10497 : : }
10498 : :
10499 : 0 : tree mask = rgm->controls[index];
10500 : 0 : if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
10501 : 0 : TYPE_VECTOR_SUBPARTS (vectype)))
10502 : : {
10503 : : /* A loop mask for data type X can be reused for data type Y
10504 : : if X has N times more elements than Y and if Y's elements
10505 : : are N times bigger than X's. In this case each sequence
10506 : : of N elements in the loop mask will be all-zero or all-one.
10507 : : We can then view-convert the mask so that each sequence of
10508 : : N elements is replaced by a single element. */
10509 : 0 : gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
10510 : : TYPE_VECTOR_SUBPARTS (vectype)));
10511 : 0 : gimple_seq seq = NULL;
10512 : 0 : mask_type = truth_type_for (vectype);
10513 : 0 : mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
10514 : 0 : if (seq)
10515 : 0 : gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10516 : : }
10517 : 0 : return mask;
10518 : : }
10519 : 208 : else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
10520 : : == vect_partial_vectors_avx512)
10521 : : {
10522 : : /* The number of scalars per iteration and the number of vectors are
10523 : : both compile-time constants. */
10524 : 208 : unsigned int nscalars_per_iter
10525 : 208 : = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10526 : 208 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10527 : :
10528 : 208 : rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];
10529 : :
10530 : : /* The stored nV is dependent on the mask type produced. */
10531 : 208 : gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10532 : : TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
10533 : : == rgm->factor);
10534 : 208 : nvectors = rgm->factor;
10535 : :
10536 : : /* Populate the rgroup's mask array, if this is the first time we've
10537 : : used it. */
10538 : 208 : if (rgm->controls.is_empty ())
10539 : : {
10540 : 20 : rgm->controls.safe_grow_cleared (nvectors, true);
10541 : 106 : for (unsigned int i = 0; i < nvectors; ++i)
10542 : : {
10543 : 86 : tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
10544 : : /* Provide a dummy definition until the real one is available. */
10545 : 86 : SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
10546 : 86 : rgm->controls[i] = mask;
10547 : : }
10548 : : }
10549 : 208 : if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
10550 : : TYPE_VECTOR_SUBPARTS (vectype)))
10551 : 160 : return rgm->controls[index];
10552 : :
10553 : : /* Split the vector if needed. Since we are dealing with integer mode
10554 : : masks with AVX512 we can operate on the integer representation
10555 : : performing the whole vector shifting. */
10556 : 48 : unsigned HOST_WIDE_INT factor;
10557 : 48 : bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
10558 : 48 : TYPE_VECTOR_SUBPARTS (vectype), &factor);
10559 : 0 : gcc_assert (ok);
10560 : 48 : gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
10561 : 48 : tree mask_type = truth_type_for (vectype);
10562 : 48 : gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
10563 : 48 : unsigned vi = index / factor;
10564 : 48 : unsigned vpart = index % factor;
10565 : 48 : tree vec = rgm->controls[vi];
10566 : 48 : gimple_seq seq = NULL;
10567 : 48 : vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
10568 : 48 : lang_hooks.types.type_for_mode
10569 : 48 : (TYPE_MODE (rgm->type), 1), vec);
10570 : : /* For integer mode masks simply shift the right bits into position. */
10571 : 48 : if (vpart != 0)
10572 : 40 : vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
10573 : : build_int_cst (integer_type_node,
10574 : 80 : (TYPE_VECTOR_SUBPARTS (vectype)
10575 : 40 : * vpart)));
10576 : 48 : vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
10577 : 48 : (TYPE_MODE (mask_type), 1), vec);
10578 : 48 : vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
10579 : 48 : if (seq)
10580 : 48 : gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10581 : 48 : return vec;
10582 : : }
10583 : : else
10584 : 0 : gcc_unreachable ();
10585 : : }
10586 : :
10587 : : /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
10588 : : lengths for controlling an operation on VECTYPE. The operation splits
10589 : : each element of VECTYPE into FACTOR separate subelements, measuring the
10590 : : length as a number of these subelements. */
10591 : :
10592 : : void
10593 : 0 : vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
10594 : : unsigned int nvectors, tree vectype, unsigned int factor)
10595 : : {
10596 : 0 : gcc_assert (nvectors != 0);
10597 : 0 : if (lens->length () < nvectors)
10598 : 0 : lens->safe_grow_cleared (nvectors, true);
10599 : 0 : rgroup_controls *rgl = &(*lens)[nvectors - 1];
10600 : :
10601 : : /* The number of scalars per iteration, scalar occupied bytes and
10602 : : the number of vectors are both compile-time constants. */
10603 : 0 : unsigned int nscalars_per_iter
10604 : 0 : = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10605 : 0 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10606 : :
10607 : 0 : if (rgl->max_nscalars_per_iter < nscalars_per_iter)
10608 : : {
10609 : : /* For now, we only support cases in which all loads and stores fall back
10610 : : to VnQI or none do. */
10611 : 0 : gcc_assert (!rgl->max_nscalars_per_iter
10612 : : || (rgl->factor == 1 && factor == 1)
10613 : : || (rgl->max_nscalars_per_iter * rgl->factor
10614 : : == nscalars_per_iter * factor));
10615 : 0 : rgl->max_nscalars_per_iter = nscalars_per_iter;
10616 : 0 : rgl->type = vectype;
10617 : 0 : rgl->factor = factor;
10618 : : }
10619 : 0 : }
10620 : :
10621 : : /* Given a complete set of lengths LENS, extract length number INDEX
10622 : : for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10623 : : where 0 <= INDEX < NVECTORS. Return a value that contains FACTOR
10624 : : multipled by the number of elements that should be processed.
10625 : : Insert any set-up statements before GSI. */
10626 : :
10627 : : tree
10628 : 0 : vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
10629 : : vec_loop_lens *lens, unsigned int nvectors, tree vectype,
10630 : : unsigned int index, unsigned int factor)
10631 : : {
10632 : 0 : rgroup_controls *rgl = &(*lens)[nvectors - 1];
10633 : 0 : bool use_bias_adjusted_len =
10634 : 0 : LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
10635 : :
10636 : : /* Populate the rgroup's len array, if this is the first time we've
10637 : : used it. */
10638 : 0 : if (rgl->controls.is_empty ())
10639 : : {
10640 : 0 : rgl->controls.safe_grow_cleared (nvectors, true);
10641 : 0 : for (unsigned int i = 0; i < nvectors; ++i)
10642 : : {
10643 : 0 : tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
10644 : 0 : gcc_assert (len_type != NULL_TREE);
10645 : :
10646 : 0 : tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
10647 : :
10648 : : /* Provide a dummy definition until the real one is available. */
10649 : 0 : SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
10650 : 0 : rgl->controls[i] = len;
10651 : :
10652 : 0 : if (use_bias_adjusted_len)
10653 : : {
10654 : 0 : gcc_assert (i == 0);
10655 : 0 : tree adjusted_len =
10656 : 0 : make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
10657 : 0 : SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
10658 : 0 : rgl->bias_adjusted_ctrl = adjusted_len;
10659 : : }
10660 : : }
10661 : : }
10662 : :
10663 : 0 : if (use_bias_adjusted_len)
10664 : 0 : return rgl->bias_adjusted_ctrl;
10665 : :
10666 : 0 : tree loop_len = rgl->controls[index];
10667 : 0 : if (rgl->factor == 1 && factor == 1)
10668 : : {
10669 : 0 : poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
10670 : 0 : poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
10671 : 0 : if (maybe_ne (nunits1, nunits2))
10672 : : {
10673 : : /* A loop len for data type X can be reused for data type Y
10674 : : if X has N times more elements than Y and if Y's elements
10675 : : are N times bigger than X's. */
10676 : 0 : gcc_assert (multiple_p (nunits1, nunits2));
10677 : 0 : factor = exact_div (nunits1, nunits2).to_constant ();
10678 : 0 : tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
10679 : 0 : gimple_seq seq = NULL;
10680 : 0 : loop_len = gimple_build (&seq, EXACT_DIV_EXPR, iv_type, loop_len,
10681 : 0 : build_int_cst (iv_type, factor));
10682 : 0 : if (seq)
10683 : 0 : gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10684 : : }
10685 : : }
10686 : : return loop_len;
10687 : : }
10688 : :
10689 : : /* Generate the tree for the loop len mask and return it. Given the lens,
10690 : : nvectors, vectype, index and factor to gen the len mask as below.
10691 : :
10692 : : tree len_mask = VCOND_MASK_LEN (compare_mask, ones, zero, len, bias)
10693 : : */
10694 : : tree
10695 : 0 : vect_gen_loop_len_mask (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
10696 : : gimple_stmt_iterator *cond_gsi, vec_loop_lens *lens,
10697 : : unsigned int nvectors, tree vectype, tree stmt,
10698 : : unsigned int index, unsigned int factor)
10699 : : {
10700 : 0 : tree all_one_mask = build_all_ones_cst (vectype);
10701 : 0 : tree all_zero_mask = build_zero_cst (vectype);
10702 : 0 : tree len = vect_get_loop_len (loop_vinfo, gsi, lens, nvectors, vectype, index,
10703 : : factor);
10704 : 0 : tree bias = build_int_cst (intQI_type_node,
10705 : 0 : LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo));
10706 : 0 : tree len_mask = make_temp_ssa_name (TREE_TYPE (stmt), NULL, "vec_len_mask");
10707 : 0 : gcall *call = gimple_build_call_internal (IFN_VCOND_MASK_LEN, 5, stmt,
10708 : : all_one_mask, all_zero_mask, len,
10709 : : bias);
10710 : 0 : gimple_call_set_lhs (call, len_mask);
10711 : 0 : gsi_insert_before (cond_gsi, call, GSI_SAME_STMT);
10712 : :
10713 : 0 : return len_mask;
10714 : : }
10715 : :
10716 : : /* Scale profiling counters by estimation for LOOP which is vectorized
10717 : : by factor VF.
10718 : : If FLAT is true, the loop we started with had unrealistically flat
10719 : : profile. */
10720 : :
10721 : : static void
10722 : 60465 : scale_profile_for_vect_loop (class loop *loop, edge exit_e, unsigned vf, bool flat)
10723 : : {
10724 : : /* For flat profiles do not scale down proportionally by VF and only
10725 : : cap by known iteration count bounds. */
10726 : 60465 : if (flat)
10727 : : {
10728 : 34054 : if (dump_file && (dump_flags & TDF_DETAILS))
10729 : 5075 : fprintf (dump_file,
10730 : : "Vectorized loop profile seems flat; not scaling iteration "
10731 : : "count down by the vectorization factor %i\n", vf);
10732 : 34054 : scale_loop_profile (loop, profile_probability::always (),
10733 : : get_likely_max_loop_iterations_int (loop));
10734 : 34054 : return;
10735 : : }
10736 : : /* Loop body executes VF fewer times and exit increases VF times. */
10737 : 26411 : profile_count entry_count = loop_preheader_edge (loop)->count ();
10738 : :
10739 : : /* If we have unreliable loop profile avoid dropping entry
10740 : : count below header count. This can happen since loops
10741 : : has unrealistically low trip counts. */
10742 : 26411 : while (vf > 1
10743 : 27692 : && loop->header->count > entry_count
10744 : 56255 : && loop->header->count < entry_count * vf)
10745 : : {
10746 : 2152 : if (dump_file && (dump_flags & TDF_DETAILS))
10747 : 149 : fprintf (dump_file,
10748 : : "Vectorization factor %i seems too large for profile "
10749 : : "prevoiusly believed to be consistent; reducing.\n", vf);
10750 : 2152 : vf /= 2;
10751 : : }
10752 : :
10753 : 26411 : if (entry_count.nonzero_p ())
10754 : 26411 : set_edge_probability_and_rescale_others
10755 : 26411 : (exit_e,
10756 : 26411 : entry_count.probability_in (loop->header->count / vf));
10757 : : /* Avoid producing very large exit probability when we do not have
10758 : : sensible profile. */
10759 : 0 : else if (exit_e->probability < profile_probability::always () / (vf * 2))
10760 : 0 : set_edge_probability_and_rescale_others (exit_e, exit_e->probability * vf);
10761 : 26411 : loop->latch->count = single_pred_edge (loop->latch)->count ();
10762 : :
10763 : 26411 : scale_loop_profile (loop, profile_probability::always () / vf,
10764 : : get_likely_max_loop_iterations_int (loop));
10765 : : }
10766 : :
10767 : : /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
10768 : : original loop that has now been vectorized.
10769 : :
10770 : : The inits of the data_references need to be advanced with the number of
10771 : : iterations of the main loop. This has been computed in vect_do_peeling and
10772 : : is stored in parameter ADVANCE.
10773 : :
10774 : : Since the loop_vec_info of this EPILOGUE was constructed for the original
10775 : : loop, its stmt_vec_infos all point to the original statements. These need
10776 : : to be updated to point to their corresponding copies.
10777 : :
10778 : : The data_reference's connections also need to be updated. Their
10779 : : corresponding dr_vec_info need to be reconnected to the EPILOGUE's
10780 : : stmt_vec_infos, their statements need to point to their corresponding
10781 : : copy. */
10782 : :
10783 : : static void
10784 : 6819 : update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
10785 : : {
10786 : 6819 : loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
10787 : 6819 : hash_map<tree,tree> mapping;
10788 : 6819 : gimple *orig_stmt, *new_stmt;
10789 : 6819 : gimple_stmt_iterator epilogue_gsi;
10790 : 6819 : gphi_iterator epilogue_phi_gsi;
10791 : 6819 : stmt_vec_info stmt_vinfo = NULL, related_vinfo;
10792 : 6819 : basic_block *epilogue_bbs = get_loop_body (epilogue);
10793 : 6819 : unsigned i;
10794 : :
10795 : 6819 : free (LOOP_VINFO_BBS (epilogue_vinfo));
10796 : 6819 : LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
10797 : 6819 : LOOP_VINFO_NBBS (epilogue_vinfo) = epilogue->num_nodes;
10798 : :
10799 : : /* The EPILOGUE loop is a copy of the original loop so they share the same
10800 : : gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
10801 : : point to the copied statements. */
10802 : 20457 : for (unsigned i = 0; i < epilogue->num_nodes; ++i)
10803 : : {
10804 : 13638 : for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
10805 : 35108 : !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
10806 : : {
10807 : 21470 : new_stmt = epilogue_phi_gsi.phi ();
10808 : :
10809 : 21470 : gcc_assert (gimple_uid (new_stmt) > 0);
10810 : 21470 : stmt_vinfo
10811 : 21470 : = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
10812 : :
10813 : 21470 : STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
10814 : : }
10815 : :
10816 : 27276 : for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
10817 : 135841 : !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
10818 : : {
10819 : 122203 : new_stmt = gsi_stmt (epilogue_gsi);
10820 : 122203 : if (is_gimple_debug (new_stmt))
10821 : 21878 : continue;
10822 : :
10823 : 100325 : gcc_assert (gimple_uid (new_stmt) > 0);
10824 : 100325 : stmt_vinfo
10825 : 100325 : = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
10826 : :
10827 : 100325 : STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
10828 : :
10829 : 100325 : related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
10830 : 100325 : if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
10831 : : {
10832 : 1876 : gimple *stmt = STMT_VINFO_STMT (related_vinfo);
10833 : : /* Set BB such that the assert in
10834 : : 'get_initial_defs_for_reduction' is able to determine that
10835 : : the BB of the related stmt is inside this loop. */
10836 : 1876 : gimple_set_bb (stmt,
10837 : : gimple_bb (new_stmt));
10838 : 1876 : related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
10839 : 1876 : gcc_assert (related_vinfo == NULL
10840 : : || related_vinfo == stmt_vinfo);
10841 : : }
10842 : : }
10843 : : }
10844 : :
10845 : 6819 : struct data_reference *dr;
10846 : 6819 : vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
10847 : 29064 : FOR_EACH_VEC_ELT (datarefs, i, dr)
10848 : : {
10849 : 22245 : orig_stmt = DR_STMT (dr);
10850 : 22245 : gcc_assert (gimple_uid (orig_stmt) > 0);
10851 : 22245 : stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
10852 : 22245 : DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
10853 : : }
10854 : :
10855 : : /* Advance data_reference's with the number of iterations of the previous
10856 : : loop and its prologue. */
10857 : 6819 : vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
10858 : :
10859 : : /* Remember the advancement made. */
10860 : 6819 : LOOP_VINFO_DRS_ADVANCED_BY (epilogue_vinfo) = advance;
10861 : 6819 : }
10862 : :
10863 : : /* When vectorizing early break statements instructions that happen before
10864 : : the early break in the current BB need to be moved to after the early
10865 : : break. This function deals with that and assumes that any validity
10866 : : checks has already been performed.
10867 : :
10868 : : While moving the instructions if it encounters a VUSE or VDEF it then
10869 : : corrects the VUSES as it moves the statements along. GDEST is the location
10870 : : in which to insert the new statements. */
10871 : :
10872 : : static void
10873 : 1482 : move_early_exit_stmts (loop_vec_info loop_vinfo)
10874 : : {
10875 : 1482 : DUMP_VECT_SCOPE ("move_early_exit_stmts");
10876 : :
10877 : 1482 : if (LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).is_empty ())
10878 : 1270 : return;
10879 : :
10880 : : /* Move all stmts that need moving. */
10881 : 212 : basic_block dest_bb = LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo);
10882 : 212 : gimple_stmt_iterator dest_gsi = gsi_after_labels (dest_bb);
10883 : :
10884 : 212 : tree last_seen_vuse = NULL_TREE;
10885 : 523 : for (gimple *stmt : LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo))
10886 : : {
10887 : : /* We have to update crossed degenerate virtual PHIs. Simply
10888 : : elide them. */
10889 : 311 : if (gphi *vphi = dyn_cast <gphi *> (stmt))
10890 : : {
10891 : 7 : tree vdef = gimple_phi_result (vphi);
10892 : 7 : tree vuse = gimple_phi_arg_def (vphi, 0);
10893 : 7 : imm_use_iterator iter;
10894 : 7 : use_operand_p use_p;
10895 : 7 : gimple *use_stmt;
10896 : 30 : FOR_EACH_IMM_USE_STMT (use_stmt, iter, vdef)
10897 : : {
10898 : 48 : FOR_EACH_IMM_USE_ON_STMT (use_p, iter)
10899 : 16 : SET_USE (use_p, vuse);
10900 : 7 : }
10901 : 7 : auto gsi = gsi_for_stmt (stmt);
10902 : 7 : remove_phi_node (&gsi, true);
10903 : 7 : last_seen_vuse = vuse;
10904 : 7 : continue;
10905 : 7 : }
10906 : :
10907 : : /* Check to see if statement is still required for vect or has been
10908 : : elided. */
10909 : 304 : auto stmt_info = loop_vinfo->lookup_stmt (stmt);
10910 : 304 : if (!stmt_info)
10911 : 0 : continue;
10912 : :
10913 : 304 : if (dump_enabled_p ())
10914 : 153 : dump_printf_loc (MSG_NOTE, vect_location, "moving stmt %G", stmt);
10915 : :
10916 : 304 : gimple_stmt_iterator stmt_gsi = gsi_for_stmt (stmt);
10917 : 304 : gsi_move_before (&stmt_gsi, &dest_gsi, GSI_NEW_STMT);
10918 : 608 : last_seen_vuse = gimple_vuse (stmt);
10919 : : }
10920 : :
10921 : : /* Update all the stmts with their new reaching VUSES. */
10922 : 654 : for (auto p : LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo))
10923 : : {
10924 : 184 : if (dump_enabled_p ())
10925 : 148 : dump_printf_loc (MSG_NOTE, vect_location,
10926 : : "updating vuse to %T for load %G",
10927 : : last_seen_vuse, p);
10928 : 184 : gimple_set_vuse (p, last_seen_vuse);
10929 : 184 : update_stmt (p);
10930 : : }
10931 : :
10932 : : /* And update the LC PHIs on exits. */
10933 : 1066 : for (edge e : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
10934 : 430 : if (!dominated_by_p (CDI_DOMINATORS, e->src, dest_bb))
10935 : 226 : if (gphi *phi = get_virtual_phi (e->dest))
10936 : 438 : SET_PHI_ARG_DEF_ON_EDGE (phi, e, last_seen_vuse);
10937 : : }
10938 : :
10939 : : /* Generate adjustment code for early break scalar IVs filling in the value
10940 : : we created earlier on for LOOP_VINFO_EARLY_BRK_NITERS_VAR. */
10941 : :
10942 : : static void
10943 : 1482 : vect_update_ivs_after_vectorizer_for_early_breaks (loop_vec_info loop_vinfo)
10944 : : {
10945 : 1482 : DUMP_VECT_SCOPE ("vect_update_ivs_after_vectorizer_for_early_breaks");
10946 : :
10947 : 1482 : if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
10948 : 0 : return;
10949 : :
10950 : 1482 : gcc_assert (LOOP_VINFO_EARLY_BRK_NITERS_VAR (loop_vinfo));
10951 : :
10952 : 1482 : tree phi_var = LOOP_VINFO_EARLY_BRK_NITERS_VAR (loop_vinfo);
10953 : 1482 : tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
10954 : 1482 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
10955 : 1482 : tree ty_var = TREE_TYPE (phi_var);
10956 : 1482 : auto loop = LOOP_VINFO_LOOP (loop_vinfo);
10957 : 1482 : tree induc_var = niters_skip ? copy_ssa_name (phi_var) : phi_var;
10958 : :
10959 : 1482 : auto induction_phi = create_phi_node (induc_var, loop->header);
10960 : 1482 : tree induc_def = PHI_RESULT (induction_phi);
10961 : :
10962 : : /* Create the iv update inside the loop. */
10963 : 1482 : gimple_seq init_stmts = NULL;
10964 : 1482 : gimple_seq stmts = NULL;
10965 : 1482 : gimple_seq iv_stmts = NULL;
10966 : 1482 : tree tree_vf = build_int_cst (ty_var, vf);
10967 : :
10968 : : /* For loop len targets we have to use .SELECT_VL (ivtmp_33, VF); instead of
10969 : : just += VF as the VF can change in between two loop iterations. */
10970 : 1482 : if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
10971 : : {
10972 : 0 : vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
10973 : 0 : tree_vf = vect_get_loop_len (loop_vinfo, NULL, lens, 1,
10974 : : NULL_TREE, 0, 0);
10975 : : }
10976 : :
10977 : 1482 : tree iter_var;
10978 : 1482 : if (POINTER_TYPE_P (ty_var))
10979 : : {
10980 : 0 : tree offset = gimple_convert (&stmts, sizetype, tree_vf);
10981 : 0 : iter_var = gimple_build (&stmts, POINTER_PLUS_EXPR, ty_var, induc_def,
10982 : : gimple_convert (&stmts, sizetype, offset));
10983 : : }
10984 : : else
10985 : : {
10986 : 1482 : tree offset = gimple_convert (&stmts, ty_var, tree_vf);
10987 : 1482 : iter_var = gimple_build (&stmts, PLUS_EXPR, ty_var, induc_def, offset);
10988 : : }
10989 : :
10990 : 1482 : tree init_var = build_zero_cst (ty_var);
10991 : 1482 : if (niters_skip)
10992 : 0 : init_var = gimple_build (&init_stmts, MINUS_EXPR, ty_var, init_var,
10993 : : gimple_convert (&init_stmts, ty_var, niters_skip));
10994 : :
10995 : 1482 : add_phi_arg (induction_phi, iter_var,
10996 : : loop_latch_edge (loop), UNKNOWN_LOCATION);
10997 : 1482 : add_phi_arg (induction_phi, init_var,
10998 : : loop_preheader_edge (loop), UNKNOWN_LOCATION);
10999 : :
11000 : : /* Find the first insertion point in the BB. */
11001 : 1482 : auto pe = loop_preheader_edge (loop);
11002 : :
11003 : : /* If we've done any peeling, calculate the peeling adjustment needed to the
11004 : : final IV. */
11005 : 1482 : if (niters_skip)
11006 : : {
11007 : 0 : induc_def = gimple_build (&iv_stmts, MAX_EXPR, TREE_TYPE (induc_def),
11008 : : induc_def,
11009 : 0 : build_zero_cst (TREE_TYPE (induc_def)));
11010 : 0 : auto stmt = gimple_build_assign (phi_var, induc_def);
11011 : 0 : gimple_seq_add_stmt_without_update (&iv_stmts, stmt);
11012 : 0 : basic_block exit_bb = NULL;
11013 : : /* Identify the early exit merge block. I wish we had stored this. */
11014 : 0 : for (auto e : get_loop_exit_edges (loop))
11015 : 0 : if (e != LOOP_VINFO_IV_EXIT (loop_vinfo))
11016 : : {
11017 : 0 : exit_bb = e->dest;
11018 : 0 : break;
11019 : 0 : }
11020 : :
11021 : 0 : gcc_assert (exit_bb);
11022 : 0 : auto exit_gsi = gsi_after_labels (exit_bb);
11023 : 0 : gsi_insert_seq_before (&exit_gsi, iv_stmts, GSI_SAME_STMT);
11024 : : }
11025 : : /* Write the init_stmts in the loop-preheader block. */
11026 : 1482 : auto psi = gsi_last_nondebug_bb (pe->src);
11027 : 1482 : gsi_insert_seq_after (&psi, init_stmts, GSI_LAST_NEW_STMT);
11028 : : /* Wite the adjustments in the header block. */
11029 : 1482 : basic_block bb = loop->header;
11030 : 1482 : auto si = gsi_after_labels (bb);
11031 : 1482 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
11032 : : }
11033 : :
11034 : : /* Function vect_transform_loop.
11035 : :
11036 : : The analysis phase has determined that the loop is vectorizable.
11037 : : Vectorize the loop - created vectorized stmts to replace the scalar
11038 : : stmts in the loop, and update the loop exit condition.
11039 : : Returns scalar epilogue loop if any. */
11040 : :
11041 : : class loop *
11042 : 60465 : vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
11043 : : {
11044 : 60465 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11045 : 60465 : class loop *epilogue = NULL;
11046 : 60465 : basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
11047 : 60465 : int nbbs = loop->num_nodes;
11048 : 60465 : int i;
11049 : 60465 : tree niters_vector = NULL_TREE;
11050 : 60465 : tree step_vector = NULL_TREE;
11051 : 60465 : tree niters_vector_mult_vf = NULL_TREE;
11052 : 60465 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11053 : 60465 : unsigned int lowest_vf = constant_lower_bound (vf);
11054 : 60465 : gimple *stmt;
11055 : 60465 : bool check_profitability = false;
11056 : 60465 : unsigned int th;
11057 : 60465 : bool flat = maybe_flat_loop_profile (loop);
11058 : :
11059 : 60465 : DUMP_VECT_SCOPE ("vec_transform_loop");
11060 : :
11061 : 60465 : if (! LOOP_VINFO_EPILOGUE_P (loop_vinfo))
11062 : 53646 : loop_vinfo->shared->check_datarefs ();
11063 : :
11064 : : /* Use the more conservative vectorization threshold. If the number
11065 : : of iterations is constant assume the cost check has been performed
11066 : : by our caller. If the threshold makes all loops profitable that
11067 : : run at least the (estimated) vectorization factor number of times
11068 : : checking is pointless, too. */
11069 : 60465 : th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
11070 : 60465 : if (vect_apply_runtime_profitability_check_p (loop_vinfo))
11071 : : {
11072 : 18341 : if (dump_enabled_p ())
11073 : 172 : dump_printf_loc (MSG_NOTE, vect_location,
11074 : : "Profitability threshold is %d loop iterations.\n",
11075 : : th);
11076 : : check_profitability = true;
11077 : : }
11078 : :
11079 : : /* Make sure there exists a single-predecessor exit bb. Do this before
11080 : : versioning. */
11081 : 60465 : edge e = LOOP_VINFO_IV_EXIT (loop_vinfo);
11082 : 60465 : if (! single_pred_p (e->dest) && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11083 : : {
11084 : 18732 : split_loop_exit_edge (e, true);
11085 : 18732 : if (dump_enabled_p ())
11086 : 2223 : dump_printf (MSG_NOTE, "split exit edge\n");
11087 : : }
11088 : :
11089 : : /* Version the loop first, if required, so the profitability check
11090 : : comes first. */
11091 : :
11092 : 60465 : if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
11093 : : {
11094 : 3704 : class loop *sloop
11095 : 3704 : = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
11096 : 3704 : sloop->force_vectorize = false;
11097 : 3704 : check_profitability = false;
11098 : : }
11099 : :
11100 : : /* Make sure there exists a single-predecessor exit bb also on the
11101 : : scalar loop copy. Do this after versioning but before peeling
11102 : : so CFG structure is fine for both scalar and if-converted loop
11103 : : to make slpeel_duplicate_current_defs_from_edges face matched
11104 : : loop closed PHI nodes on the exit. */
11105 : 60465 : if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
11106 : : {
11107 : 7986 : e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
11108 : 7986 : if (! single_pred_p (e->dest))
11109 : : {
11110 : 7723 : split_loop_exit_edge (e, true);
11111 : 7723 : if (dump_enabled_p ())
11112 : 1121 : dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
11113 : : }
11114 : : }
11115 : :
11116 : 60465 : tree niters = vect_build_loop_niters (loop_vinfo);
11117 : 60465 : LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
11118 : 60465 : tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
11119 : 60465 : bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
11120 : 60465 : tree advance;
11121 : 60465 : drs_init_vec orig_drs_init;
11122 : :
11123 : 60465 : epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
11124 : : &step_vector, &niters_vector_mult_vf, th,
11125 : : check_profitability, niters_no_overflow,
11126 : : &advance);
11127 : 60465 : if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
11128 : 60465 : && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
11129 : : {
11130 : : /* Ifcvt duplicates loop preheader, loop body and produces an basic
11131 : : block after loop exit. We need to scale all that. */
11132 : 89 : basic_block preheader
11133 : 89 : = loop_preheader_edge (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->src;
11134 : 89 : preheader->count
11135 : : = preheader->count.apply_probability
11136 : 89 : (LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11137 : 89 : scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
11138 : : LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11139 : 89 : LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo)->dest->count = preheader->count;
11140 : : }
11141 : :
11142 : 60465 : if (niters_vector == NULL_TREE)
11143 : : {
11144 : 26711 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
11145 : 26711 : && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11146 : 54164 : && known_eq (lowest_vf, vf))
11147 : : {
11148 : 26708 : niters_vector
11149 : 26708 : = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
11150 : 26708 : LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
11151 : 26708 : step_vector = build_one_cst (TREE_TYPE (niters));
11152 : : }
11153 : 748 : else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11154 : 1 : vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
11155 : : &step_vector, niters_no_overflow);
11156 : : else
11157 : : /* vect_do_peeling subtracted the number of peeled prologue
11158 : : iterations from LOOP_VINFO_NITERS. */
11159 : 747 : vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
11160 : : &niters_vector, &step_vector,
11161 : : niters_no_overflow);
11162 : : }
11163 : :
11164 : : /* 1) Make sure the loop header has exactly two entries
11165 : : 2) Make sure we have a preheader basic block. */
11166 : :
11167 : 60465 : gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
11168 : :
11169 : 60465 : split_edge (loop_preheader_edge (loop));
11170 : :
11171 : 60465 : if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11172 : : /* This will deal with any possible peeling. */
11173 : 1 : vect_prepare_for_masked_peels (loop_vinfo);
11174 : :
11175 : : /* Handle any code motion that we need to for early-break vectorization after
11176 : : we've done peeling but just before we start vectorizing. */
11177 : 60465 : if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11178 : : {
11179 : 1482 : vect_update_ivs_after_vectorizer_for_early_breaks (loop_vinfo);
11180 : 1482 : move_early_exit_stmts (loop_vinfo);
11181 : : }
11182 : :
11183 : : /* Remove existing clobber stmts and prefetches. */
11184 : 184631 : for (i = 0; i < nbbs; i++)
11185 : : {
11186 : 124166 : basic_block bb = bbs[i];
11187 : 1068704 : for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);)
11188 : : {
11189 : 820372 : stmt = gsi_stmt (si);
11190 : 820372 : if (gimple_clobber_p (stmt)
11191 : 820372 : || gimple_call_builtin_p (stmt, BUILT_IN_PREFETCH))
11192 : : {
11193 : 88 : unlink_stmt_vdef (stmt);
11194 : 88 : gsi_remove (&si, true);
11195 : 88 : release_defs (stmt);
11196 : : }
11197 : : else
11198 : 820284 : gsi_next (&si);
11199 : : }
11200 : : }
11201 : :
11202 : : /* Schedule the SLP instances. */
11203 : 60465 : if (!loop_vinfo->slp_instances.is_empty ())
11204 : : {
11205 : 60465 : DUMP_VECT_SCOPE ("scheduling SLP instances");
11206 : 60465 : vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
11207 : : }
11208 : :
11209 : : /* Generate the loop invariant statements. */
11210 : 60465 : if (!gimple_seq_empty_p (LOOP_VINFO_INV_PATTERN_DEF_SEQ (loop_vinfo)))
11211 : : {
11212 : 73 : if (dump_enabled_p ())
11213 : 30 : dump_printf_loc (MSG_NOTE, vect_location,
11214 : : "------>generating loop invariant statements\n");
11215 : 73 : gimple_stmt_iterator gsi;
11216 : 73 : gsi = gsi_after_labels (loop_preheader_edge (loop)->src);
11217 : 73 : gsi_insert_seq_before (&gsi, LOOP_VINFO_INV_PATTERN_DEF_SEQ (loop_vinfo),
11218 : : GSI_CONTINUE_LINKING);
11219 : : }
11220 : :
11221 : : /* Stub out scalar statements that must not survive vectorization and
11222 : : were not picked as relevant in any SLP instance.
11223 : : Doing this here helps with grouped statements, or statements that
11224 : : are involved in patterns. */
11225 : 184631 : for (i = 0; i < nbbs; i++)
11226 : : {
11227 : 124166 : basic_block bb = bbs[i];
11228 : 124166 : stmt_vec_info stmt_info;
11229 : 248332 : for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
11230 : 1627753 : !gsi_end_p (gsi); gsi_next (&gsi))
11231 : : {
11232 : 1503587 : gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
11233 : 6231 : if (!call || !gimple_call_internal_p (call))
11234 : 1498506 : continue;
11235 : 5081 : internal_fn ifn = gimple_call_internal_fn (call);
11236 : 5081 : if (ifn == IFN_MASK_LOAD)
11237 : : {
11238 : 660 : tree lhs = gimple_get_lhs (call);
11239 : 660 : if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11240 : : {
11241 : 0 : tree zero = build_zero_cst (TREE_TYPE (lhs));
11242 : 0 : gimple *new_stmt = gimple_build_assign (lhs, zero);
11243 : 0 : gsi_replace (&gsi, new_stmt, true);
11244 : : }
11245 : : }
11246 : 4421 : else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
11247 : : {
11248 : 2295 : tree lhs = gimple_get_lhs (call);
11249 : 2295 : if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11250 : : {
11251 : 0 : tree else_arg
11252 : 0 : = gimple_call_arg (call, gimple_call_num_args (call) - 1);
11253 : 0 : gimple *new_stmt = gimple_build_assign (lhs, else_arg);
11254 : 0 : gsi_replace (&gsi, new_stmt, true);
11255 : : }
11256 : : }
11257 : 2126 : else if (ifn == IFN_MASK_CALL
11258 : 4 : && (stmt_info = loop_vinfo->lookup_stmt (call))
11259 : 4 : && !STMT_VINFO_RELEVANT_P (stmt_info)
11260 : 2130 : && !STMT_VINFO_LIVE_P (stmt_info))
11261 : : {
11262 : 4 : gcc_assert (!gimple_call_lhs (stmt_info->stmt));
11263 : 4 : loop_vinfo->remove_stmt (stmt_info);
11264 : : }
11265 : : }
11266 : : }
11267 : :
11268 : : /* The vectorization factor is always > 1, so if we use an IV increment of 1.
11269 : : a zero NITERS becomes a nonzero NITERS_VECTOR. */
11270 : 60465 : if (integer_onep (step_vector))
11271 : 60447 : niters_no_overflow = true;
11272 : 60465 : vect_set_loop_condition (loop, LOOP_VINFO_IV_EXIT (loop_vinfo), loop_vinfo,
11273 : : niters_vector, step_vector, niters_vector_mult_vf,
11274 : 60465 : !niters_no_overflow);
11275 : :
11276 : 60465 : unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
11277 : :
11278 : : /* True if the final iteration might not handle a full vector's
11279 : : worth of scalar iterations. */
11280 : 120930 : bool final_iter_may_be_partial
11281 : 60465 : = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11282 : 60465 : || LOOP_VINFO_EARLY_BREAKS (loop_vinfo);
11283 : :
11284 : : /* +1 to convert latch counts to loop iteration counts. */
11285 : 60465 : int bias_for_lowest = 1;
11286 : :
11287 : : /* When we are peeling for gaps then we take away one scalar iteration
11288 : : from the vector loop. Thus we can adjust the upper bound by one
11289 : : scalar iteration. But only when we know the bound applies to the
11290 : : IV exit test which might not be true when we have multiple exits. */
11291 : 60465 : if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11292 : 117613 : bias_for_lowest -= LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
11293 : :
11294 : 60465 : int bias_for_assumed = bias_for_lowest;
11295 : 60465 : int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
11296 : 60465 : if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
11297 : : {
11298 : : /* When the amount of peeling is known at compile time, the first
11299 : : iteration will have exactly alignment_npeels active elements.
11300 : : In the worst case it will have at least one. */
11301 : 1 : int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
11302 : 1 : bias_for_lowest += lowest_vf - min_first_active;
11303 : 1 : bias_for_assumed += assumed_vf - min_first_active;
11304 : : }
11305 : : /* In these calculations the "- 1" converts loop iteration counts
11306 : : back to latch counts. */
11307 : 60465 : if (loop->any_upper_bound)
11308 : : {
11309 : 60465 : loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
11310 : 60465 : loop->nb_iterations_upper_bound
11311 : 60465 : = (final_iter_may_be_partial
11312 : 61965 : ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
11313 : 3000 : lowest_vf) - 1
11314 : 58965 : : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
11315 : 117930 : lowest_vf) - 1);
11316 : 60465 : if (main_vinfo
11317 : : /* Both peeling for alignment and peeling for gaps can end up
11318 : : with the scalar epilogue running for more than VF-1 iterations. */
11319 : 6819 : && !main_vinfo->peeling_for_alignment
11320 : 6771 : && !main_vinfo->peeling_for_gaps)
11321 : : {
11322 : 6608 : unsigned int bound;
11323 : 6608 : poly_uint64 main_iters
11324 : 6608 : = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
11325 : : LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
11326 : 6608 : main_iters
11327 : 6608 : = upper_bound (main_iters,
11328 : 6608 : LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
11329 : 13216 : if (can_div_away_from_zero_p (main_iters,
11330 : 6608 : LOOP_VINFO_VECT_FACTOR (loop_vinfo),
11331 : : &bound))
11332 : 6608 : loop->nb_iterations_upper_bound
11333 : 6608 : = wi::umin ((bound_wide_int) (bound - 1),
11334 : 6608 : loop->nb_iterations_upper_bound);
11335 : : }
11336 : : }
11337 : 60465 : if (loop->any_likely_upper_bound)
11338 : 60465 : loop->nb_iterations_likely_upper_bound
11339 : 60465 : = (final_iter_may_be_partial
11340 : 61965 : ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
11341 : 1500 : + bias_for_lowest, lowest_vf) - 1
11342 : 58965 : : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
11343 : 60465 : + bias_for_lowest, lowest_vf) - 1);
11344 : 60465 : if (loop->any_estimate)
11345 : 34798 : loop->nb_iterations_estimate
11346 : 34798 : = (final_iter_may_be_partial
11347 : 35598 : ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
11348 : 1600 : assumed_vf) - 1
11349 : 33998 : : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
11350 : 68796 : assumed_vf) - 1);
11351 : 60465 : scale_profile_for_vect_loop (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
11352 : : assumed_vf, flat);
11353 : :
11354 : 60465 : if (dump_enabled_p ())
11355 : : {
11356 : 10476 : if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
11357 : : {
11358 : 9095 : dump_printf_loc (MSG_NOTE, vect_location,
11359 : : "LOOP VECTORIZED\n");
11360 : 9095 : if (loop->inner)
11361 : 286 : dump_printf_loc (MSG_NOTE, vect_location,
11362 : : "OUTER LOOP VECTORIZED\n");
11363 : 9095 : dump_printf (MSG_NOTE, "\n");
11364 : : }
11365 : : else
11366 : 1381 : dump_printf_loc (MSG_NOTE, vect_location,
11367 : : "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
11368 : 1381 : GET_MODE_NAME (loop_vinfo->vector_mode));
11369 : : }
11370 : :
11371 : : /* Loops vectorized with a variable factor won't benefit from
11372 : : unrolling/peeling. */
11373 : 60465 : if (!vf.is_constant ())
11374 : : {
11375 : : loop->unroll = 1;
11376 : : if (dump_enabled_p ())
11377 : : dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
11378 : : " variable-length vectorization factor\n");
11379 : : }
11380 : :
11381 : : /* When we have unrolled the loop due to a user requested value we should
11382 : : leave it up to the RTL unroll heuristics to determine if it's still worth
11383 : : while to unroll more. */
11384 : 60465 : if (LOOP_VINFO_USER_UNROLL (loop_vinfo))
11385 : 44 : loop->unroll = 0;
11386 : :
11387 : : /* Free SLP instances here because otherwise stmt reference counting
11388 : : won't work. */
11389 : : slp_instance instance;
11390 : 148845 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
11391 : 88380 : vect_free_slp_instance (instance);
11392 : 60465 : LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
11393 : : /* Clear-up safelen field since its value is invalid after vectorization
11394 : : since vectorized loop can have loop-carried dependencies. */
11395 : 60465 : loop->safelen = 0;
11396 : :
11397 : 60465 : if (epilogue)
11398 : : {
11399 : : /* Accumulate past advancements made. */
11400 : 6819 : if (LOOP_VINFO_DRS_ADVANCED_BY (loop_vinfo))
11401 : 89 : advance = fold_build2 (PLUS_EXPR, TREE_TYPE (advance),
11402 : : LOOP_VINFO_DRS_ADVANCED_BY (loop_vinfo),
11403 : : advance);
11404 : 6819 : update_epilogue_loop_vinfo (epilogue, advance);
11405 : :
11406 : 6819 : epilogue->simduid = loop->simduid;
11407 : 6819 : epilogue->force_vectorize = loop->force_vectorize;
11408 : 6819 : epilogue->dont_vectorize = false;
11409 : : }
11410 : :
11411 : 60465 : return epilogue;
11412 : 60465 : }
11413 : :
11414 : : /* The code below is trying to perform simple optimization - revert
11415 : : if-conversion for masked stores, i.e. if the mask of a store is zero
11416 : : do not perform it and all stored value producers also if possible.
11417 : : For example,
11418 : : for (i=0; i<n; i++)
11419 : : if (c[i])
11420 : : {
11421 : : p1[i] += 1;
11422 : : p2[i] = p3[i] +2;
11423 : : }
11424 : : this transformation will produce the following semi-hammock:
11425 : :
11426 : : if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
11427 : : {
11428 : : vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
11429 : : vect__12.22_172 = vect__11.19_170 + vect_cst__171;
11430 : : MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
11431 : : vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
11432 : : vect__19.28_184 = vect__18.25_182 + vect_cst__183;
11433 : : MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
11434 : : }
11435 : : */
11436 : :
11437 : : void
11438 : 497 : optimize_mask_stores (class loop *loop)
11439 : : {
11440 : 497 : basic_block *bbs = get_loop_body (loop);
11441 : 497 : unsigned nbbs = loop->num_nodes;
11442 : 497 : unsigned i;
11443 : 497 : basic_block bb;
11444 : 497 : class loop *bb_loop;
11445 : 497 : gimple_stmt_iterator gsi;
11446 : 497 : gimple *stmt;
11447 : 497 : auto_vec<gimple *> worklist;
11448 : 497 : auto_purge_vect_location sentinel;
11449 : :
11450 : 497 : vect_location = find_loop_location (loop);
11451 : : /* Pick up all masked stores in loop if any. */
11452 : 1988 : for (i = 0; i < nbbs; i++)
11453 : : {
11454 : 994 : bb = bbs[i];
11455 : 16368 : for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
11456 : 14380 : gsi_next (&gsi))
11457 : : {
11458 : 14380 : stmt = gsi_stmt (gsi);
11459 : 14380 : if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
11460 : 651 : worklist.safe_push (stmt);
11461 : : }
11462 : : }
11463 : :
11464 : 497 : free (bbs);
11465 : 497 : if (worklist.is_empty ())
11466 : 68 : return;
11467 : :
11468 : : /* Loop has masked stores. */
11469 : 1063 : while (!worklist.is_empty ())
11470 : : {
11471 : 634 : gimple *last, *last_store;
11472 : 634 : edge e, efalse;
11473 : 634 : tree mask;
11474 : 634 : basic_block store_bb, join_bb;
11475 : 634 : gimple_stmt_iterator gsi_to;
11476 : 634 : tree vdef, new_vdef;
11477 : 634 : gphi *phi;
11478 : 634 : tree vectype;
11479 : 634 : tree zero;
11480 : :
11481 : 634 : last = worklist.pop ();
11482 : 634 : mask = gimple_call_arg (last, 2);
11483 : 634 : bb = gimple_bb (last);
11484 : : /* Create then_bb and if-then structure in CFG, then_bb belongs to
11485 : : the same loop as if_bb. It could be different to LOOP when two
11486 : : level loop-nest is vectorized and mask_store belongs to the inner
11487 : : one. */
11488 : 634 : e = split_block (bb, last);
11489 : 634 : bb_loop = bb->loop_father;
11490 : 634 : gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
11491 : 634 : join_bb = e->dest;
11492 : 634 : store_bb = create_empty_bb (bb);
11493 : 634 : add_bb_to_loop (store_bb, bb_loop);
11494 : 634 : e->flags = EDGE_TRUE_VALUE;
11495 : 634 : efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
11496 : : /* Put STORE_BB to likely part. */
11497 : 634 : efalse->probability = profile_probability::likely ();
11498 : 634 : e->probability = efalse->probability.invert ();
11499 : 634 : store_bb->count = efalse->count ();
11500 : 634 : make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
11501 : 634 : if (dom_info_available_p (CDI_DOMINATORS))
11502 : 634 : set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
11503 : 634 : if (dump_enabled_p ())
11504 : 301 : dump_printf_loc (MSG_NOTE, vect_location,
11505 : : "Create new block %d to sink mask stores.",
11506 : : store_bb->index);
11507 : : /* Create vector comparison with boolean result. */
11508 : 634 : vectype = TREE_TYPE (mask);
11509 : 634 : zero = build_zero_cst (vectype);
11510 : 634 : stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
11511 : 634 : gsi = gsi_last_bb (bb);
11512 : 634 : gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
11513 : : /* Create new PHI node for vdef of the last masked store:
11514 : : .MEM_2 = VDEF <.MEM_1>
11515 : : will be converted to
11516 : : .MEM.3 = VDEF <.MEM_1>
11517 : : and new PHI node will be created in join bb
11518 : : .MEM_2 = PHI <.MEM_1, .MEM_3>
11519 : : */
11520 : 634 : vdef = gimple_vdef (last);
11521 : 634 : new_vdef = make_ssa_name (gimple_vop (cfun), last);
11522 : 634 : gimple_set_vdef (last, new_vdef);
11523 : 634 : phi = create_phi_node (vdef, join_bb);
11524 : 634 : add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
11525 : :
11526 : : /* Put all masked stores with the same mask to STORE_BB if possible. */
11527 : 668 : while (true)
11528 : : {
11529 : 651 : gimple_stmt_iterator gsi_from;
11530 : 651 : gimple *stmt1 = NULL;
11531 : :
11532 : : /* Move masked store to STORE_BB. */
11533 : 651 : last_store = last;
11534 : 651 : gsi = gsi_for_stmt (last);
11535 : 651 : gsi_from = gsi;
11536 : : /* Shift GSI to the previous stmt for further traversal. */
11537 : 651 : gsi_prev (&gsi);
11538 : 651 : gsi_to = gsi_start_bb (store_bb);
11539 : 651 : gsi_move_before (&gsi_from, &gsi_to);
11540 : : /* Setup GSI_TO to the non-empty block start. */
11541 : 651 : gsi_to = gsi_start_bb (store_bb);
11542 : 651 : if (dump_enabled_p ())
11543 : 317 : dump_printf_loc (MSG_NOTE, vect_location,
11544 : : "Move stmt to created bb\n%G", last);
11545 : : /* Move all stored value producers if possible. */
11546 : 4451 : while (!gsi_end_p (gsi))
11547 : : {
11548 : 4450 : tree lhs;
11549 : 4450 : imm_use_iterator imm_iter;
11550 : 4450 : use_operand_p use_p;
11551 : 4450 : bool res;
11552 : :
11553 : : /* Skip debug statements. */
11554 : 4450 : if (is_gimple_debug (gsi_stmt (gsi)))
11555 : : {
11556 : 3 : gsi_prev (&gsi);
11557 : 2786 : continue;
11558 : : }
11559 : 4447 : stmt1 = gsi_stmt (gsi);
11560 : : /* Do not consider statements writing to memory or having
11561 : : volatile operand. */
11562 : 8774 : if (gimple_vdef (stmt1)
11563 : 8774 : || gimple_has_volatile_ops (stmt1))
11564 : : break;
11565 : 4327 : gsi_from = gsi;
11566 : 4327 : gsi_prev (&gsi);
11567 : 4327 : lhs = gimple_get_lhs (stmt1);
11568 : 4327 : if (!lhs)
11569 : : break;
11570 : :
11571 : : /* LHS of vectorized stmt must be SSA_NAME. */
11572 : 4327 : if (TREE_CODE (lhs) != SSA_NAME)
11573 : : break;
11574 : :
11575 : 4327 : if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11576 : : {
11577 : : /* Remove dead scalar statement. */
11578 : 3075 : if (has_zero_uses (lhs))
11579 : : {
11580 : 2783 : gsi_remove (&gsi_from, true);
11581 : 2783 : release_defs (stmt1);
11582 : 2783 : continue;
11583 : : }
11584 : : }
11585 : :
11586 : : /* Check that LHS does not have uses outside of STORE_BB. */
11587 : 1544 : res = true;
11588 : 4193 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
11589 : : {
11590 : 1635 : gimple *use_stmt;
11591 : 1635 : use_stmt = USE_STMT (use_p);
11592 : 1635 : if (is_gimple_debug (use_stmt))
11593 : 0 : continue;
11594 : 1635 : if (gimple_bb (use_stmt) != store_bb)
11595 : : {
11596 : : res = false;
11597 : : break;
11598 : : }
11599 : 1544 : }
11600 : 1544 : if (!res)
11601 : : break;
11602 : :
11603 : 1014 : if (gimple_vuse (stmt1)
11604 : 1450 : && gimple_vuse (stmt1) != gimple_vuse (last_store))
11605 : : break;
11606 : :
11607 : : /* Can move STMT1 to STORE_BB. */
11608 : 1014 : if (dump_enabled_p ())
11609 : 533 : dump_printf_loc (MSG_NOTE, vect_location,
11610 : : "Move stmt to created bb\n%G", stmt1);
11611 : 1014 : gsi_move_before (&gsi_from, &gsi_to);
11612 : : /* Shift GSI_TO for further insertion. */
11613 : 2028 : gsi_prev (&gsi_to);
11614 : : }
11615 : : /* Put other masked stores with the same mask to STORE_BB. */
11616 : 651 : if (worklist.is_empty ()
11617 : 222 : || gimple_call_arg (worklist.last (), 2) != mask
11618 : 17 : || worklist.last () != stmt1)
11619 : : break;
11620 : 17 : last = worklist.pop ();
11621 : 17 : }
11622 : 1268 : add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
11623 : : }
11624 : 497 : }
11625 : :
11626 : : /* Decide whether it is possible to use a zero-based induction variable
11627 : : when vectorizing LOOP_VINFO with partial vectors. If it is, return
11628 : : the value that the induction variable must be able to hold in order
11629 : : to ensure that the rgroups eventually have no active vector elements.
11630 : : Return -1 otherwise. */
11631 : :
11632 : : widest_int
11633 : 33640 : vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
11634 : : {
11635 : 33640 : tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
11636 : 33640 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11637 : 33640 : unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
11638 : :
11639 : : /* Calculate the value that the induction variable must be able
11640 : : to hit in order to ensure that we end the loop with an all-false mask.
11641 : : This involves adding the maximum number of inactive trailing scalar
11642 : : iterations. */
11643 : 33640 : widest_int iv_limit = -1;
11644 : 33640 : if (max_loop_iterations (loop, &iv_limit))
11645 : : {
11646 : 33640 : if (niters_skip)
11647 : : {
11648 : : /* Add the maximum number of skipped iterations to the
11649 : : maximum iteration count. */
11650 : 0 : if (TREE_CODE (niters_skip) == INTEGER_CST)
11651 : 0 : iv_limit += wi::to_widest (niters_skip);
11652 : : else
11653 : 0 : iv_limit += max_vf - 1;
11654 : : }
11655 : 33640 : else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
11656 : : /* Make a conservatively-correct assumption. */
11657 : 336 : iv_limit += max_vf - 1;
11658 : :
11659 : : /* IV_LIMIT is the maximum number of latch iterations, which is also
11660 : : the maximum in-range IV value. Round this value down to the previous
11661 : : vector alignment boundary and then add an extra full iteration. */
11662 : 33640 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11663 : 33640 : iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
11664 : : }
11665 : 33640 : return iv_limit;
11666 : : }
11667 : :
11668 : : /* For the given rgroup_controls RGC, check whether an induction variable
11669 : : would ever hit a value that produces a set of all-false masks or zero
11670 : : lengths before wrapping around. Return true if it's possible to wrap
11671 : : around before hitting the desirable value, otherwise return false. */
11672 : :
11673 : : bool
11674 : 0 : vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
11675 : : {
11676 : 0 : widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
11677 : :
11678 : 0 : if (iv_limit == -1)
11679 : : return true;
11680 : :
11681 : 0 : tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
11682 : 0 : unsigned int compare_precision = TYPE_PRECISION (compare_type);
11683 : 0 : unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
11684 : :
11685 : 0 : if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
11686 : : return true;
11687 : :
11688 : : return false;
11689 : 0 : }
|