Branch data Line data Source code
1 : : /* Loop Vectorization
2 : : Copyright (C) 2003-2025 Free Software Foundation, Inc.
3 : : Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 : : Ira Rosen <irar@il.ibm.com>
5 : :
6 : : This file is part of GCC.
7 : :
8 : : GCC is free software; you can redistribute it and/or modify it under
9 : : the terms of the GNU General Public License as published by the Free
10 : : Software Foundation; either version 3, or (at your option) any later
11 : : version.
12 : :
13 : : GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 : : WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 : : FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 : : for more details.
17 : :
18 : : You should have received a copy of the GNU General Public License
19 : : along with GCC; see the file COPYING3. If not see
20 : : <http://www.gnu.org/licenses/>. */
21 : :
22 : : #define INCLUDE_ALGORITHM
23 : : #include "config.h"
24 : : #include "system.h"
25 : : #include "coretypes.h"
26 : : #include "backend.h"
27 : : #include "target.h"
28 : : #include "rtl.h"
29 : : #include "tree.h"
30 : : #include "gimple.h"
31 : : #include "cfghooks.h"
32 : : #include "tree-pass.h"
33 : : #include "ssa.h"
34 : : #include "optabs-tree.h"
35 : : #include "memmodel.h"
36 : : #include "optabs.h"
37 : : #include "diagnostic-core.h"
38 : : #include "fold-const.h"
39 : : #include "stor-layout.h"
40 : : #include "cfganal.h"
41 : : #include "gimplify.h"
42 : : #include "gimple-iterator.h"
43 : : #include "gimplify-me.h"
44 : : #include "tree-ssa-loop-ivopts.h"
45 : : #include "tree-ssa-loop-manip.h"
46 : : #include "tree-ssa-loop-niter.h"
47 : : #include "tree-ssa-loop.h"
48 : : #include "cfgloop.h"
49 : : #include "tree-scalar-evolution.h"
50 : : #include "tree-vectorizer.h"
51 : : #include "gimple-fold.h"
52 : : #include "cgraph.h"
53 : : #include "tree-cfg.h"
54 : : #include "tree-if-conv.h"
55 : : #include "internal-fn.h"
56 : : #include "tree-vector-builder.h"
57 : : #include "vec-perm-indices.h"
58 : : #include "tree-eh.h"
59 : : #include "case-cfn-macros.h"
60 : : #include "langhooks.h"
61 : :
62 : : /* Loop Vectorization Pass.
63 : :
64 : : This pass tries to vectorize loops.
65 : :
66 : : For example, the vectorizer transforms the following simple loop:
67 : :
68 : : short a[N]; short b[N]; short c[N]; int i;
69 : :
70 : : for (i=0; i<N; i++){
71 : : a[i] = b[i] + c[i];
72 : : }
73 : :
74 : : as if it had been manually vectorized by rewriting the source code into:
75 : :
76 : : typedef int __attribute__((mode(V8HI))) v8hi;
77 : : short a[N]; short b[N]; short c[N]; int i;
78 : : v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
79 : : v8hi va, vb, vc;
80 : :
81 : : for (i=0; i<N/8; i++){
82 : : vb = pb[i];
83 : : vc = pc[i];
84 : : va = vb + vc;
85 : : pa[i] = va;
86 : : }
87 : :
88 : : The main entry to this pass is vectorize_loops(), in which
89 : : the vectorizer applies a set of analyses on a given set of loops,
90 : : followed by the actual vectorization transformation for the loops that
91 : : had successfully passed the analysis phase.
92 : : Throughout this pass we make a distinction between two types of
93 : : data: scalars (which are represented by SSA_NAMES), and memory references
94 : : ("data-refs"). These two types of data require different handling both
95 : : during analysis and transformation. The types of data-refs that the
96 : : vectorizer currently supports are ARRAY_REFS whose base is an array DECL
97 : : (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
98 : : accesses are required to have a simple (consecutive) access pattern.
99 : :
100 : : Analysis phase:
101 : : ===============
102 : : The driver for the analysis phase is vect_analyze_loop().
103 : : It applies a set of analyses, some of which rely on the scalar evolution
104 : : analyzer (scev) developed by Sebastian Pop.
105 : :
106 : : During the analysis phase the vectorizer records some information
107 : : per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
108 : : loop, as well as general information about the loop as a whole, which is
109 : : recorded in a "loop_vec_info" struct attached to each loop.
110 : :
111 : : Transformation phase:
112 : : =====================
113 : : The loop transformation phase scans all the stmts in the loop, and
114 : : creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
115 : : the loop that needs to be vectorized. It inserts the vector code sequence
116 : : just before the scalar stmt S, and records a pointer to the vector code
117 : : in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
118 : : attached to S). This pointer will be used for the vectorization of following
119 : : stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
120 : : otherwise, we rely on dead code elimination for removing it.
121 : :
122 : : For example, say stmt S1 was vectorized into stmt VS1:
123 : :
124 : : VS1: vb = px[i];
125 : : S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
126 : : S2: a = b;
127 : :
128 : : To vectorize stmt S2, the vectorizer first finds the stmt that defines
129 : : the operand 'b' (S1), and gets the relevant vector def 'vb' from the
130 : : vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
131 : : resulting sequence would be:
132 : :
133 : : VS1: vb = px[i];
134 : : S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
135 : : VS2: va = vb;
136 : : S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
137 : :
138 : : Operands that are not SSA_NAMEs are data-refs that appear in
139 : : load/store operations (like 'x[i]' in S1), and are handled differently.
140 : :
141 : : Target modeling:
142 : : =================
143 : : Currently the only target-specific information that is used is the
144 : : size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
145 : : Targets that can support different vector sizes will, for now, need
146 : : to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
147 : : flexibility will be added in the future.
148 : :
149 : : Since we only vectorize operations whose vector form can be
150 : : expressed using existing tree codes, to verify that an operation is
151 : : supported, the vectorizer checks the relevant optab at the relevant
152 : : machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
153 : : the value found is CODE_FOR_nothing, then there's no target support, and
154 : : we can't vectorize the stmt.
155 : :
156 : : For additional information on this project see:
157 : : http://gcc.gnu.org/projects/tree-ssa/vectorization.html
158 : : */
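A minimal sketch of the optab support check described under "Target modeling" above (illustrative only: example_add_supported_p is a made-up name, and V8HImode assumes a target that provides that mode):

    static bool
    example_add_supported_p (void)
    {
      /* An addition of eight shorts can be vectorized only if the target
         provides an instruction for add_optab in V8HImode; otherwise the
         optab entry is CODE_FOR_nothing.  */
      return optab_handler (add_optab, V8HImode) != CODE_FOR_nothing;
    }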
159 : :
160 : : static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
161 : : unsigned *);
162 : : static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
163 : : bool *, bool *, bool);
164 : :
165 : : /* Subroutine of vect_determine_vf_for_stmt that handles only one
166 : : statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
167 : : may already be set for general statements (not just data refs). */
168 : :
169 : : static opt_result
170 : 3119287 : vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
171 : : bool vectype_maybe_set_p,
172 : : poly_uint64 *vf)
173 : : {
174 : 3119287 : gimple *stmt = stmt_info->stmt;
175 : :
176 : 3119287 : if ((!STMT_VINFO_RELEVANT_P (stmt_info)
177 : 1509215 : && !STMT_VINFO_LIVE_P (stmt_info))
178 : 3119382 : || gimple_clobber_p (stmt))
179 : : {
180 : 1509120 : if (dump_enabled_p ())
181 : 107292 : dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
182 : 1509120 : return opt_result::success ();
183 : : }
184 : :
185 : 1610167 : tree stmt_vectype, nunits_vectype;
186 : 1610167 : opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
187 : : &stmt_vectype,
188 : : &nunits_vectype);
189 : 1610167 : if (!res)
190 : 2353 : return res;
191 : :
192 : 1607814 : if (stmt_vectype)
193 : : {
194 : 1607788 : if (STMT_VINFO_VECTYPE (stmt_info))
195 : : /* The only case in which a vectype has already been set is for stmts
196 : : that contain a data ref, or for "pattern-stmts" (stmts generated
197 : : by the vectorizer to represent/replace a certain idiom). */
198 : 963152 : gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
199 : : || vectype_maybe_set_p)
200 : : && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
201 : : else
202 : 644636 : STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
203 : : }
204 : :
205 : 1607814 : if (nunits_vectype)
206 : 1607788 : vect_update_max_nunits (vf, nunits_vectype);
207 : :
208 : 1607814 : return opt_result::success ();
209 : : }
210 : :
211 : : /* Subroutine of vect_determine_vectorization_factor. Set the vector
212 : : types of STMT_INFO and all attached pattern statements and update
213 : : the vectorization factor VF accordingly. Return true on success
214 : : or false if something prevented vectorization. */
215 : :
216 : : static opt_result
217 : 2515856 : vect_determine_vf_for_stmt (vec_info *vinfo,
218 : : stmt_vec_info stmt_info, poly_uint64 *vf)
219 : : {
220 : 2515856 : if (dump_enabled_p ())
221 : 190259 : dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
222 : : stmt_info->stmt);
223 : 2515856 : opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
224 : 2515856 : if (!res)
225 : 2353 : return res;
226 : :
227 : 2513503 : if (STMT_VINFO_IN_PATTERN_P (stmt_info)
228 : 292932 : && STMT_VINFO_RELATED_STMT (stmt_info))
229 : : {
230 : 292932 : gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
231 : 292932 : stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
232 : :
233 : : /* If a pattern statement has def stmts, analyze them too. */
234 : 292932 : for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
235 : 603431 : !gsi_end_p (si); gsi_next (&si))
236 : : {
237 : 310499 : stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
238 : 310499 : if (dump_enabled_p ())
239 : 19327 : dump_printf_loc (MSG_NOTE, vect_location,
240 : : "==> examining pattern def stmt: %G",
241 : : def_stmt_info->stmt);
242 : 310499 : res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
243 : 310499 : if (!res)
244 : 0 : return res;
245 : : }
246 : :
247 : 292932 : if (dump_enabled_p ())
248 : 15112 : dump_printf_loc (MSG_NOTE, vect_location,
249 : : "==> examining pattern statement: %G",
250 : : stmt_info->stmt);
251 : 292932 : res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
252 : 292932 : if (!res)
253 : 0 : return res;
254 : : }
255 : :
256 : 2513503 : return opt_result::success ();
257 : : }
258 : :
259 : : /* Function vect_determine_vectorization_factor
260 : :
261 : : Determine the vectorization factor (VF). VF is the number of data elements
262 : : that are operated upon in parallel in a single iteration of the vectorized
263 : : loop. For example, when vectorizing a loop that operates on 4-byte elements,
264 : : on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
265 : : elements can fit in a single vector register.
266 : :
267 : : Statements in the loop may operate on types of different sizes; this
268 : : function sets VF to the maximum number of vector elements required by
269 : : any statement in the loop (see vect_update_max_nunits), and fails if a
270 : : vector type cannot be found for a statement or the resulting VF is 1.
271 : :
272 : : VF is also the factor by which the loop iterations are strip-mined, e.g.:
273 : : original loop:
274 : : for (i=0; i<N; i++){
275 : : a[i] = b[i] + c[i];
276 : : }
277 : :
278 : : vectorized loop:
279 : : for (i=0; i<N; i+=VF){
280 : : a[i:VF] = b[i:VF] + c[i:VF];
281 : : }
282 : : */
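Tying this back to the example at the top of the file: a short is 2 bytes, so with a 16-byte vector (v8hi) each vector holds 8 elements, VF is 8, and the strip-mined loop runs N/8 iterations.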
283 : :
284 : : static opt_result
285 : 277340 : vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
286 : : {
287 : 277340 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
288 : 277340 : basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
289 : 277340 : unsigned nbbs = loop->num_nodes;
290 : 277340 : poly_uint64 vectorization_factor = 1;
291 : 277340 : tree scalar_type = NULL_TREE;
292 : 277340 : gphi *phi;
293 : 277340 : tree vectype;
294 : 277340 : stmt_vec_info stmt_info;
295 : 277340 : unsigned i;
296 : :
297 : 277340 : DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
298 : :
299 : 916338 : for (i = 0; i < nbbs; i++)
300 : : {
301 : 661384 : basic_block bb = bbs[i];
302 : :
303 : 1326190 : for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
304 : 664806 : gsi_next (&si))
305 : : {
306 : 684839 : phi = si.phi ();
307 : 684839 : stmt_info = loop_vinfo->lookup_stmt (phi);
308 : 684839 : if (dump_enabled_p ())
309 : 47390 : dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
310 : : (gimple *) phi);
311 : :
312 : 684839 : gcc_assert (stmt_info);
313 : :
314 : 684839 : if (STMT_VINFO_RELEVANT_P (stmt_info)
315 : 383389 : || STMT_VINFO_LIVE_P (stmt_info))
316 : : {
317 : 301450 : gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
318 : 301450 : scalar_type = TREE_TYPE (PHI_RESULT (phi));
319 : :
320 : 301450 : if (dump_enabled_p ())
321 : 10119 : dump_printf_loc (MSG_NOTE, vect_location,
322 : : "get vectype for scalar type: %T\n",
323 : : scalar_type);
324 : :
325 : 301450 : vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
326 : 301450 : if (!vectype)
327 : 20033 : return opt_result::failure_at (phi,
328 : : "not vectorized: unsupported "
329 : : "data-type %T\n",
330 : : scalar_type);
331 : 281417 : STMT_VINFO_VECTYPE (stmt_info) = vectype;
332 : :
333 : 281417 : if (dump_enabled_p ())
334 : 10053 : dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
335 : : vectype);
336 : :
337 : 281417 : if (dump_enabled_p ())
338 : : {
339 : 10053 : dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
340 : 10053 : dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
341 : 10053 : dump_printf (MSG_NOTE, "\n");
342 : : }
343 : :
344 : 281417 : vect_update_max_nunits (&vectorization_factor, vectype);
345 : : }
346 : : }
347 : :
348 : 5017392 : for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
349 : 3734690 : gsi_next (&si))
350 : : {
351 : 3737043 : if (is_gimple_debug (gsi_stmt (si)))
352 : 1221187 : continue;
353 : 2515856 : stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
354 : 2515856 : opt_result res
355 : 2515856 : = vect_determine_vf_for_stmt (loop_vinfo,
356 : : stmt_info, &vectorization_factor);
357 : 2515856 : if (!res)
358 : 2353 : return res;
359 : : }
360 : : }
361 : :
362 : : /* TODO: Analyze cost. Decide if worthwhile to vectorize. */
363 : 254954 : if (dump_enabled_p ())
364 : : {
365 : 15240 : dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
366 : 15240 : dump_dec (MSG_NOTE, vectorization_factor);
367 : 15240 : dump_printf (MSG_NOTE, "\n");
368 : : }
369 : :
370 : 254954 : if (known_le (vectorization_factor, 1U))
371 : 33755 : return opt_result::failure_at (vect_location,
372 : : "not vectorized: unsupported data-type\n");
373 : 221199 : LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
374 : 221199 : return opt_result::success ();
375 : : }
376 : :
377 : :
378 : : /* Function vect_is_simple_iv_evolution.
379 : :
380 : : FORNOW: A simple evolution of an induction variable in the loop is
381 : : considered a polynomial evolution. */
382 : :
383 : : static bool
384 : 646576 : vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
385 : : tree * step)
386 : : {
387 : 646576 : tree init_expr;
388 : 646576 : tree step_expr;
389 : 646576 : tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
390 : 646576 : basic_block bb;
391 : :
392 : : /* When there is no evolution in this loop, the evolution function
393 : : is not "simple". */
394 : 646576 : if (evolution_part == NULL_TREE)
395 : : return false;
396 : :
397 : : /* When the evolution is a polynomial of degree >= 2
398 : : the evolution function is not "simple". */
399 : 687747 : if (tree_is_chrec (evolution_part))
400 : : return false;
401 : :
402 : 596743 : step_expr = evolution_part;
403 : 596743 : init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
404 : :
405 : 596743 : if (dump_enabled_p ())
406 : 35809 : dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
407 : : step_expr, init_expr);
408 : :
409 : 596743 : *init = init_expr;
410 : 596743 : *step = step_expr;
411 : :
412 : 596743 : if (TREE_CODE (step_expr) != INTEGER_CST
413 : 47800 : && (TREE_CODE (step_expr) != SSA_NAME
414 : 42601 : || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
415 : 42427 : && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
416 : 6653 : || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
417 : 111 : && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
418 : 111 : || !flag_associative_math)))
419 : 637971 : && (TREE_CODE (step_expr) != REAL_CST
420 : 507 : || !flag_associative_math))
421 : : {
422 : 41171 : if (dump_enabled_p ())
423 : 2661 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
424 : : "step unknown.\n");
425 : 41171 : return false;
426 : : }
427 : :
428 : : return true;
429 : : }
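As an illustration, for a counter updated as i = i + 4 the scalar evolution analyzer returns the chrec {0, +, 4}, so *INIT is 0 and *STEP is 4. An update such as x = x * 2, by contrast, has no affine evolution and is rejected here; it may instead be handled by vect_is_nonlinear_iv_evolution below.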
430 : :
431 : : /* Function vect_is_nonlinear_iv_evolution
432 : :
433 : : Only nonlinear induction of integer type is supported:
434 : : 1. neg
435 : : 2. mul by constant
436 : : 3. lshift/rshift by constant.
437 : :
438 : : For a neg induction, return a fake step of integer -1. */
439 : : static bool
440 : 88719 : vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
441 : : gphi* loop_phi_node, tree *init, tree *step)
442 : : {
443 : 88719 : tree init_expr, ev_expr, result, op1, op2;
444 : 88719 : gimple* def;
445 : :
446 : 88719 : if (gimple_phi_num_args (loop_phi_node) != 2)
447 : : return false;
448 : :
449 : 88719 : init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
450 : 88719 : ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
451 : :
452 : : /* Support nonlinear induction only for integer type. */
453 : 88719 : if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
454 : : return false;
455 : :
456 : 66972 : *init = init_expr;
457 : 66972 : result = PHI_RESULT (loop_phi_node);
458 : :
459 : 66972 : if (TREE_CODE (ev_expr) != SSA_NAME
460 : 64143 : || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
461 : 66972 : || !is_gimple_assign (def))
462 : : return false;
463 : :
464 : 59368 : enum tree_code t_code = gimple_assign_rhs_code (def);
465 : 59368 : switch (t_code)
466 : : {
467 : 1550 : case NEGATE_EXPR:
468 : 1550 : if (gimple_assign_rhs1 (def) != result)
469 : : return false;
470 : 1550 : *step = build_int_cst (TREE_TYPE (init_expr), -1);
471 : 1550 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
472 : 1550 : break;
473 : :
474 : 9480 : case RSHIFT_EXPR:
475 : 9480 : case LSHIFT_EXPR:
476 : 9480 : case MULT_EXPR:
477 : 9480 : op1 = gimple_assign_rhs1 (def);
478 : 9480 : op2 = gimple_assign_rhs2 (def);
479 : 9480 : if (TREE_CODE (op2) != INTEGER_CST
480 : 6041 : || op1 != result)
481 : : return false;
482 : 5924 : *step = op2;
483 : 5924 : if (t_code == LSHIFT_EXPR)
484 : 186 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
485 : 5738 : else if (t_code == RSHIFT_EXPR)
486 : 5129 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
487 : : /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
488 : : else
489 : 609 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
490 : : break;
491 : :
492 : : default:
493 : : return false;
494 : : }
495 : :
496 : 7474 : STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
497 : 7474 : STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
498 : :
499 : 7474 : return true;
500 : : }
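A hypothetical scalar loop with a multiplicative IV of the kind classified as vect_step_op_mul above (illustrative only; the function and array names are made up):

    void
    scale_by_powers (unsigned int *a, int n)
    {
      unsigned int x = 1;
      for (int i = 0; i < n; i++)
        {
          /* x evolves as 1 * 3^i rather than as an affine init + i * step,
             so vect_is_simple_iv_evolution rejects it and the routine above
             records the constant 3 as the step.  */
          a[i] = x;
          x *= 3;
        }
    }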
501 : :
502 : : /* Return true if PHI, described by STMT_INFO, is the inner PHI in
503 : : what we are assuming is a double reduction. For example, given
504 : : a structure like this:
505 : :
506 : : outer1:
507 : : x_1 = PHI <x_4(outer2), ...>;
508 : : ...
509 : :
510 : : inner:
511 : : x_2 = PHI <x_1(outer1), ...>;
512 : : ...
513 : : x_3 = ...;
514 : : ...
515 : :
516 : : outer2:
517 : : x_4 = PHI <x_3(inner)>;
518 : : ...
519 : :
520 : : outer loop analysis would treat x_1 as a double reduction phi and
521 : : this function would then return true for x_2. */
522 : :
523 : : static bool
524 : 647542 : vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
525 : : {
526 : 647542 : use_operand_p use_p;
527 : 647542 : ssa_op_iter op_iter;
528 : 1941634 : FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
529 : 1295058 : if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
530 : 646366 : if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
531 : : return true;
532 : : return false;
533 : : }
534 : :
535 : : /* Returns true if Phi is a first-order recurrence. A first-order
536 : : recurrence is a non-reduction recurrence relation in which the value of
537 : : the recurrence in the current loop iteration equals a value defined in
538 : : the previous iteration. */
539 : :
540 : : static bool
541 : 21797 : vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
542 : : gphi *phi)
543 : : {
544 : : /* A nested cycle isn't vectorizable as first order recurrence. */
545 : 21797 : if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
546 : : return false;
547 : :
548 : : /* Ensure the loop latch definition is from within the loop. */
549 : 21560 : edge latch = loop_latch_edge (loop);
550 : 21560 : tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
551 : 21560 : if (TREE_CODE (ldef) != SSA_NAME
552 : 18456 : || SSA_NAME_IS_DEFAULT_DEF (ldef)
553 : 18428 : || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
554 : 38805 : || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
555 : 4625 : return false;
556 : :
557 : 16935 : tree def = gimple_phi_result (phi);
558 : :
559 : : /* Ensure every use_stmt of the phi node is dominated by the latch
560 : : definition. */
561 : 16935 : imm_use_iterator imm_iter;
562 : 16935 : use_operand_p use_p;
563 : 18875 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
564 : 18563 : if (!is_gimple_debug (USE_STMT (use_p))
565 : 36244 : && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
566 : 10665 : || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
567 : : USE_STMT (use_p))))
568 : 16623 : return false;
569 : :
570 : : /* First-order recurrence autovectorization needs shuffle vector. */
571 : 312 : tree scalar_type = TREE_TYPE (def);
572 : 312 : tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
573 : 312 : if (!vectype)
574 : : return false;
575 : :
576 : : return true;
577 : : }
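A hypothetical loop containing a first-order recurrence of the kind detected above (illustrative only; the names are made up). The PHI for 'prev' carries a value defined in the previous iteration without accumulating into it, so it is a recurrence rather than a reduction:

    void
    shift_and_add (const int *a, int *b, int n, int init)
    {
      int prev = init;
      for (int i = 0; i < n; i++)
        {
          b[i] = prev + a[i];
          prev = a[i];
        }
    }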
578 : :
579 : : /* Function vect_analyze_scalar_cycles_1.
580 : :
581 : : Examine the cross iteration def-use cycles of scalar variables
582 : : in LOOP. LOOP_VINFO represents the loop that is now being
583 : : considered for vectorization (can be LOOP, or an outer-loop
584 : : enclosing LOOP). SLP indicates whether there will be subsequent
585 : : SLP analyses. */
586 : :
587 : : static void
588 : 315307 : vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
589 : : bool slp)
590 : : {
591 : 315307 : basic_block bb = loop->header;
592 : 315307 : tree init, step;
593 : 315307 : auto_vec<stmt_vec_info, 64> worklist;
594 : 315307 : gphi_iterator gsi;
595 : 315307 : bool double_reduc, reduc_chain;
596 : :
597 : 315307 : DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
598 : :
599 : : /* First - identify all inductions. Reduction detection assumes that all the
600 : : inductions have been identified, therefore, this order must not be
601 : : changed. */
602 : 1136948 : for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
603 : : {
604 : 821641 : gphi *phi = gsi.phi ();
605 : 821641 : tree access_fn = NULL;
606 : 821641 : tree def = PHI_RESULT (phi);
607 : 821641 : stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
608 : :
609 : 821641 : if (dump_enabled_p ())
610 : 50990 : dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
611 : : (gimple *) phi);
612 : :
613 : : /* Skip virtual phi's. The data dependences that are associated with
614 : : virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
615 : 1643282 : if (virtual_operand_p (def))
616 : 258601 : continue;
617 : :
618 : 647542 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
619 : :
620 : : /* Analyze the evolution function. */
621 : 647542 : access_fn = analyze_scalar_evolution (loop, def);
622 : 647542 : if (access_fn)
623 : : {
624 : 647542 : STRIP_NOPS (access_fn);
625 : 647542 : if (dump_enabled_p ())
626 : 37645 : dump_printf_loc (MSG_NOTE, vect_location,
627 : : "Access function of PHI: %T\n", access_fn);
628 : 647542 : STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
629 : 647542 : = initial_condition_in_loop_num (access_fn, loop->num);
630 : 647542 : STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
631 : 647542 : = evolution_part_in_loop_num (access_fn, loop->num);
632 : : }
633 : :
634 : 732044 : if ((!access_fn
635 : 647542 : || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
636 : 646576 : || !vect_is_simple_iv_evolution (loop->num, access_fn,
637 : : &init, &step)
638 : 555572 : || (LOOP_VINFO_LOOP (loop_vinfo) != loop
639 : 9831 : && TREE_CODE (step) != INTEGER_CST))
640 : : /* Only handle nonlinear iv for same loop. */
641 : 739518 : && (LOOP_VINFO_LOOP (loop_vinfo) != loop
642 : 88719 : || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
643 : : phi, &init, &step)))
644 : : {
645 : 84502 : worklist.safe_push (stmt_vinfo);
646 : 84502 : continue;
647 : : }
648 : :
649 : 563040 : gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
650 : : != NULL_TREE);
651 : 563040 : gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
652 : :
653 : 563040 : if (dump_enabled_p ())
654 : 33252 : dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
655 : 563040 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
656 : :
657 : : /* Mark if we have a non-linear IV. */
658 : 563040 : LOOP_VINFO_NON_LINEAR_IV (loop_vinfo)
659 : 563040 : = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_vinfo) != vect_step_op_add;
660 : : }
661 : :
662 : :
663 : : /* Second - identify all reductions and nested cycles. */
664 : 399809 : while (worklist.length () > 0)
665 : : {
666 : 84502 : stmt_vec_info stmt_vinfo = worklist.pop ();
667 : 84502 : gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
668 : 84502 : tree def = PHI_RESULT (phi);
669 : :
670 : 84502 : if (dump_enabled_p ())
671 : 4393 : dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
672 : : (gimple *) phi);
673 : :
674 : 169004 : gcc_assert (!virtual_operand_p (def)
675 : : && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
676 : :
677 : 84502 : stmt_vec_info reduc_stmt_info
678 : 84502 : = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
679 : 84502 : &reduc_chain, slp);
680 : 84502 : if (reduc_stmt_info)
681 : : {
682 : 62705 : STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
683 : 62705 : STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
684 : 62705 : if (double_reduc)
685 : : {
686 : 966 : if (dump_enabled_p ())
687 : 101 : dump_printf_loc (MSG_NOTE, vect_location,
688 : : "Detected double reduction.\n");
689 : :
690 : 966 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
691 : 966 : STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
692 : : /* Make it accessible for SLP vectorization. */
693 : 966 : LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt_info);
694 : : }
695 : : else
696 : : {
697 : 61739 : if (loop != LOOP_VINFO_LOOP (loop_vinfo))
698 : : {
699 : 3020 : if (dump_enabled_p ())
700 : 465 : dump_printf_loc (MSG_NOTE, vect_location,
701 : : "Detected vectorizable nested cycle.\n");
702 : :
703 : 3020 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
704 : : }
705 : : else
706 : : {
707 : 58719 : if (dump_enabled_p ())
708 : 3362 : dump_printf_loc (MSG_NOTE, vect_location,
709 : : "Detected reduction.\n");
710 : :
711 : 58719 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
712 : 58719 : STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
713 : : /* Store the reduction cycles for possible vectorization in
714 : : loop-aware SLP if it was not detected as reduction
715 : : chain. */
716 : 58719 : if (! reduc_chain)
717 : 57964 : LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
718 : 57964 : (reduc_stmt_info);
719 : : }
720 : : }
721 : : }
722 : 21797 : else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
723 : 306 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
724 : : else
725 : 21491 : if (dump_enabled_p ())
726 : 414 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
727 : : "Unknown def-use cycle pattern.\n");
728 : : }
729 : 315307 : }
730 : :
731 : :
732 : : /* Function vect_analyze_scalar_cycles.
733 : :
734 : : Examine the cross iteration def-use cycles of scalar variables, by
735 : : analyzing the loop-header PHIs of scalar variables. Classify each
736 : : cycle as one of the following: invariant, induction, reduction, unknown.
737 : : We do that for the loop represented by LOOP_VINFO, and also for its
738 : : inner-loop, if one exists.
739 : : Examples for scalar cycles:
740 : :
741 : : Example1: reduction:
742 : :
743 : : loop1:
744 : : for (i=0; i<N; i++)
745 : : sum += a[i];
746 : :
747 : : Example2: induction:
748 : :
749 : : loop2:
750 : : for (i=0; i<N; i++)
751 : : a[i] = i; */
752 : :
753 : : static void
754 : 310320 : vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp)
755 : : {
756 : 310320 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
757 : :
758 : 310320 : vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp);
759 : :
760 : : /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
761 : : Reductions in such inner-loop therefore have different properties than
762 : : the reductions in the nest that gets vectorized:
763 : : 1. When vectorized, they are executed in the same order as in the original
764 : : scalar loop, so we can't change the order of computation when
765 : : vectorizing them.
766 : : 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
767 : : current checks are too strict. */
768 : :
769 : 310320 : if (loop->inner)
770 : 4987 : vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner, slp);
771 : 310320 : }
772 : :
773 : : /* Transfer group and reduction information from STMT_INFO to its
774 : : pattern stmt. */
775 : :
776 : : static void
777 : 28 : vect_fixup_reduc_chain (stmt_vec_info stmt_info)
778 : : {
779 : 28 : stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
780 : 28 : stmt_vec_info stmtp;
781 : 28 : gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
782 : : && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
783 : 28 : REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
784 : 242 : do
785 : : {
786 : 242 : stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
787 : 242 : gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
788 : : == STMT_VINFO_DEF_TYPE (stmt_info));
789 : 242 : REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
790 : 242 : stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
791 : 242 : if (stmt_info)
792 : 214 : REDUC_GROUP_NEXT_ELEMENT (stmtp)
793 : 214 : = STMT_VINFO_RELATED_STMT (stmt_info);
794 : : }
795 : 242 : while (stmt_info);
796 : 28 : }
797 : :
798 : : /* Fixup scalar cycles that now have their stmts detected as patterns. */
799 : :
800 : : static void
801 : 310320 : vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
802 : : {
803 : 310320 : stmt_vec_info first;
804 : 310320 : unsigned i;
805 : :
806 : 311075 : FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
807 : : {
808 : 755 : stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
809 : 3338 : while (next)
810 : : {
811 : 2611 : if ((STMT_VINFO_IN_PATTERN_P (next)
812 : 2611 : != STMT_VINFO_IN_PATTERN_P (first))
813 : 5194 : || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
814 : : break;
815 : 2583 : next = REDUC_GROUP_NEXT_ELEMENT (next);
816 : : }
817 : : /* If all reduction chain members are well-formed patterns adjust
818 : : the group to group the pattern stmts instead. */
819 : 755 : if (! next
820 : 783 : && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
821 : : {
822 : 727 : if (STMT_VINFO_IN_PATTERN_P (first))
823 : : {
824 : 28 : vect_fixup_reduc_chain (first);
825 : 56 : LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
826 : 28 : = STMT_VINFO_RELATED_STMT (first);
827 : : }
828 : : }
829 : : /* If not all stmt in the chain are patterns or if we failed
830 : : to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
831 : : it as regular reduction instead. */
832 : : else
833 : : {
834 : : stmt_vec_info vinfo = first;
835 : : stmt_vec_info last = NULL;
836 : 117 : while (vinfo)
837 : : {
838 : 89 : next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
839 : 89 : REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
840 : 89 : REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
841 : 89 : last = vinfo;
842 : 89 : vinfo = next;
843 : : }
844 : 28 : STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
845 : 28 : = vect_internal_def;
846 : 31 : loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
847 : 28 : LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
848 : 28 : --i;
849 : : }
850 : : }
851 : 310320 : }
852 : :
853 : : /* Function vect_get_loop_niters.
854 : :
855 : : Determine how many iterations the loop executes and place the count
856 : : in NUMBER_OF_ITERATIONS. Place the number of latch iterations
857 : : in NUMBER_OF_ITERATIONSM1. Place the condition under which the
858 : : niter information holds in ASSUMPTIONS.
859 : :
860 : : Return the loop exit conditions. */
861 : :
862 : :
863 : : static vec<gcond *>
864 : 259528 : vect_get_loop_niters (class loop *loop, const_edge main_exit, tree *assumptions,
865 : : tree *number_of_iterations, tree *number_of_iterationsm1)
866 : : {
867 : 259528 : auto_vec<edge> exits = get_loop_exit_edges (loop);
868 : 259528 : vec<gcond *> conds;
869 : 519056 : conds.create (exits.length ());
870 : 259528 : class tree_niter_desc niter_desc;
871 : 259528 : tree niter_assumptions, niter, may_be_zero;
872 : :
873 : 259528 : *assumptions = boolean_true_node;
874 : 259528 : *number_of_iterationsm1 = chrec_dont_know;
875 : 259528 : *number_of_iterations = chrec_dont_know;
876 : :
877 : 259528 : DUMP_VECT_SCOPE ("get_loop_niters");
878 : :
879 : 259528 : if (exits.is_empty ())
880 : 0 : return conds;
881 : :
882 : 259528 : if (dump_enabled_p ())
883 : 13546 : dump_printf_loc (MSG_NOTE, vect_location, "Loop has %d exits.\n",
884 : : exits.length ());
885 : :
886 : : edge exit;
887 : : unsigned int i;
888 : 627453 : FOR_EACH_VEC_ELT (exits, i, exit)
889 : : {
890 : 367925 : gcond *cond = get_loop_exit_condition (exit);
891 : 367925 : if (cond)
892 : 357743 : conds.safe_push (cond);
893 : :
894 : 367925 : if (dump_enabled_p ())
895 : 14507 : dump_printf_loc (MSG_NOTE, vect_location, "Analyzing exit %d...\n", i);
896 : :
897 : 367925 : if (exit != main_exit)
898 : 148711 : continue;
899 : :
900 : 259528 : may_be_zero = NULL_TREE;
901 : 259528 : if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
902 : 259528 : || chrec_contains_undetermined (niter_desc.niter))
903 : 40314 : continue;
904 : :
905 : 219214 : niter_assumptions = niter_desc.assumptions;
906 : 219214 : may_be_zero = niter_desc.may_be_zero;
907 : 219214 : niter = niter_desc.niter;
908 : :
909 : 219214 : if (may_be_zero && integer_zerop (may_be_zero))
910 : : may_be_zero = NULL_TREE;
911 : :
912 : 13013 : if (may_be_zero)
913 : : {
914 : 13013 : if (COMPARISON_CLASS_P (may_be_zero))
915 : : {
916 : : /* Try to combine may_be_zero with assumptions, this can simplify
917 : : computation of niter expression. */
918 : 13013 : if (niter_assumptions && !integer_nonzerop (niter_assumptions))
919 : 1092 : niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
920 : : niter_assumptions,
921 : : fold_build1 (TRUTH_NOT_EXPR,
922 : : boolean_type_node,
923 : : may_be_zero));
924 : : else
925 : 11921 : niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
926 : : build_int_cst (TREE_TYPE (niter), 0),
927 : : rewrite_to_non_trapping_overflow (niter));
928 : :
929 : 219214 : may_be_zero = NULL_TREE;
930 : : }
931 : 0 : else if (integer_nonzerop (may_be_zero))
932 : : {
933 : 0 : *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
934 : 0 : *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
935 : 0 : continue;
936 : : }
937 : : else
938 : 0 : continue;
939 : : }
940 : :
941 : : /* Loop assumptions are based off the normal exit. */
942 : 219214 : *assumptions = niter_assumptions;
943 : 219214 : *number_of_iterationsm1 = niter;
944 : :
945 : : /* We want the number of loop header executions which is the number
946 : : of latch executions plus one.
947 : : ??? For UINT_MAX latch executions this number overflows to zero
948 : : for loops like do { n++; } while (n != 0); */
949 : 219214 : if (niter && !chrec_contains_undetermined (niter))
950 : : {
951 : 219214 : niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter),
952 : : unshare_expr (niter),
953 : : build_int_cst (TREE_TYPE (niter), 1));
954 : 219214 : if (TREE_CODE (niter) == INTEGER_CST
955 : 118593 : && TREE_CODE (*number_of_iterationsm1) != INTEGER_CST)
956 : : {
957 : : /* If we manage to fold niter + 1 into INTEGER_CST even when
958 : : niter is some complex expression, ensure back
959 : : *number_of_iterationsm1 is an INTEGER_CST as well. See
960 : : PR113210. */
961 : 4 : *number_of_iterationsm1
962 : 4 : = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), niter,
963 : : build_minus_one_cst (TREE_TYPE (niter)));
964 : : }
965 : : }
966 : 219214 : *number_of_iterations = niter;
967 : : }
968 : :
969 : 259528 : if (dump_enabled_p ())
970 : 13546 : dump_printf_loc (MSG_NOTE, vect_location, "All loop exits successfully analyzed.\n");
971 : :
972 : 259528 : return conds;
973 : 259528 : }
974 : :
975 : : /* Determine the main loop exit for the vectorizer. */
976 : :
977 : : edge
978 : 488020 : vec_init_loop_exit_info (class loop *loop)
979 : : {
980 : : /* Before we begin we must first determine which exit is the main one and
981 : : which are auxilary exits. */
982 : : which are auxiliary exits. */
983 : 488020 : if (exits.length () == 1)
984 : 311717 : return exits[0];
985 : :
986 : : /* If we have multiple exits we only support counting IV at the moment.
987 : : Analyze all exits and return the last one we can analyze. */
988 : 176303 : class tree_niter_desc niter_desc;
989 : 176303 : edge candidate = NULL;
990 : 1154798 : for (edge exit : exits)
991 : : {
992 : 635751 : if (!get_loop_exit_condition (exit))
993 : 149982 : continue;
994 : :
995 : 485769 : if (number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
996 : 485769 : && !chrec_contains_undetermined (niter_desc.niter))
997 : : {
998 : 134794 : tree may_be_zero = niter_desc.may_be_zero;
999 : 134794 : if ((integer_zerop (may_be_zero)
1000 : : /* As we are handling may_be_zero that's not false by
1001 : : rewriting niter to may_be_zero ? 0 : niter we require
1002 : : an empty latch. */
1003 : 648632 : || (single_pred_p (loop->latch)
1004 : 12520 : && exit->src == single_pred (loop->latch)
1005 : 4173 : && (integer_nonzerop (may_be_zero)
1006 : 4173 : || COMPARISON_CLASS_P (may_be_zero))))
1007 : 138967 : && (!candidate
1008 : 6477 : || dominated_by_p (CDI_DOMINATORS, exit->src,
1009 : 6477 : candidate->src)))
1010 : : candidate = exit;
1011 : : }
1012 : : }
1013 : :
1014 : 176303 : return candidate;
1015 : 176303 : }
1016 : :
1017 : : /* Function bb_in_loop_p
1018 : :
1019 : : Used as predicate for dfs order traversal of the loop bbs. */
1020 : :
1021 : : static bool
1022 : 1292729 : bb_in_loop_p (const_basic_block bb, const void *data)
1023 : : {
1024 : 1292729 : const class loop *const loop = (const class loop *)data;
1025 : 1292729 : if (flow_bb_inside_loop_p (loop, bb))
1026 : : return true;
1027 : : return false;
1028 : : }
1029 : :
1030 : :
1031 : : /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1032 : : stmt_vec_info structs for all the stmts in LOOP_IN. */
1033 : :
1034 : 407794 : _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
1035 : : : vec_info (vec_info::loop, shared),
1036 : 407794 : loop (loop_in),
1037 : 407794 : num_itersm1 (NULL_TREE),
1038 : 407794 : num_iters (NULL_TREE),
1039 : 407794 : num_iters_unchanged (NULL_TREE),
1040 : 407794 : num_iters_assumptions (NULL_TREE),
1041 : 407794 : vector_costs (nullptr),
1042 : 407794 : scalar_costs (nullptr),
1043 : 407794 : th (0),
1044 : 407794 : versioning_threshold (0),
1045 : 407794 : vectorization_factor (0),
1046 : 407794 : main_loop_edge (nullptr),
1047 : 407794 : skip_main_loop_edge (nullptr),
1048 : 407794 : skip_this_loop_edge (nullptr),
1049 : 407794 : reusable_accumulators (),
1050 : 407794 : suggested_unroll_factor (1),
1051 : 407794 : max_vectorization_factor (0),
1052 : 407794 : mask_skip_niters (NULL_TREE),
1053 : 407794 : mask_skip_niters_pfa_offset (NULL_TREE),
1054 : 407794 : rgroup_compare_type (NULL_TREE),
1055 : 407794 : simd_if_cond (NULL_TREE),
1056 : 407794 : partial_vector_style (vect_partial_vectors_none),
1057 : 407794 : unaligned_dr (NULL),
1058 : 407794 : peeling_for_alignment (0),
1059 : 407794 : ptr_mask (0),
1060 : 407794 : nonlinear_iv (false),
1061 : 407794 : ivexpr_map (NULL),
1062 : 407794 : scan_map (NULL),
1063 : 407794 : slp_unrolling_factor (1),
1064 : 407794 : inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
1065 : 407794 : vectorizable (false),
1066 : 407794 : can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
1067 : 407794 : must_use_partial_vectors_p (false),
1068 : 407794 : using_partial_vectors_p (false),
1069 : 407794 : using_decrementing_iv_p (false),
1070 : 407794 : using_select_vl_p (false),
1071 : 407794 : epil_using_partial_vectors_p (false),
1072 : 407794 : allow_mutual_alignment (false),
1073 : 407794 : partial_load_store_bias (0),
1074 : 407794 : peeling_for_gaps (false),
1075 : 407794 : peeling_for_niter (false),
1076 : 407794 : early_breaks (false),
1077 : 407794 : user_unroll (false),
1078 : 407794 : no_data_dependencies (false),
1079 : 407794 : has_mask_store (false),
1080 : 407794 : scalar_loop_scaling (profile_probability::uninitialized ()),
1081 : 407794 : scalar_loop (NULL),
1082 : 407794 : main_loop_info (NULL),
1083 : 407794 : orig_loop_info (NULL),
1084 : 407794 : epilogue_vinfo (NULL),
1085 : 407794 : drs_advanced_by (NULL_TREE),
1086 : 407794 : vec_loop_iv_exit (NULL),
1087 : 407794 : vec_epilogue_loop_iv_exit (NULL),
1088 : 815588 : scalar_loop_iv_exit (NULL)
1089 : : {
1090 : : /* CHECKME: We want to visit all BBs before their successors (except for
1091 : : latch blocks, for which this assertion wouldn't hold). In the simple
1092 : : case of the loop forms we allow, a dfs order of the BBs would be the same
1093 : : as reversed postorder traversal, so we are safe. */
1094 : :
1095 : 407794 : bbs = XCNEWVEC (basic_block, loop->num_nodes);
1096 : 815588 : nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p, bbs,
1097 : 407794 : loop->num_nodes, loop);
1098 : 407794 : gcc_assert (nbbs == loop->num_nodes);
1099 : :
1100 : 1474622 : for (unsigned int i = 0; i < nbbs; i++)
1101 : : {
1102 : 1066828 : basic_block bb = bbs[i];
1103 : 1066828 : gimple_stmt_iterator si;
1104 : :
1105 : 2174947 : for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1106 : : {
1107 : 1108119 : gimple *phi = gsi_stmt (si);
1108 : 1108119 : gimple_set_uid (phi, 0);
1109 : 1108119 : add_stmt (phi);
1110 : : }
1111 : :
1112 : 9155097 : for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1113 : : {
1114 : 7021441 : gimple *stmt = gsi_stmt (si);
1115 : 7021441 : gimple_set_uid (stmt, 0);
1116 : 7021441 : if (is_gimple_debug (stmt))
1117 : 2650483 : continue;
1118 : 4370958 : add_stmt (stmt);
1119 : : /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
1120 : : third argument is the #pragma omp simd if (x) condition: when it is 0,
1121 : : the loop shouldn't be vectorized; when it is a non-zero constant, it
1122 : : should be vectorized normally; otherwise the loop is versioned, with the
1123 : : vectorized copy executed only if the condition is non-zero at runtime. */
1124 : 4370958 : if (loop_in->simduid
1125 : 43826 : && is_gimple_call (stmt)
1126 : 4292 : && gimple_call_internal_p (stmt)
1127 : 4153 : && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
1128 : 4152 : && gimple_call_num_args (stmt) >= 3
1129 : 103 : && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
1130 : 4371061 : && (loop_in->simduid
1131 : 103 : == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
1132 : : {
1133 : 103 : tree arg = gimple_call_arg (stmt, 2);
1134 : 103 : if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
1135 : 103 : simd_if_cond = arg;
1136 : : else
1137 : 0 : gcc_assert (integer_nonzerop (arg));
1138 : : }
1139 : : }
1140 : : }
1141 : 407794 : }
1142 : :
1143 : : /* Free all levels of rgroup CONTROLS. */
1144 : :
1145 : : void
1146 : 1073312 : release_vec_loop_controls (vec<rgroup_controls> *controls)
1147 : : {
1148 : 1073312 : rgroup_controls *rgc;
1149 : 1073312 : unsigned int i;
1150 : 1073337 : FOR_EACH_VEC_ELT (*controls, i, rgc)
1151 : 25 : rgc->controls.release ();
1152 : 1073312 : controls->release ();
1153 : 1073312 : }
1154 : :
1155 : : /* Free all memory used by the _loop_vec_info, as well as all the
1156 : : stmt_vec_info structs of all the stmts in the loop. */
1157 : :
1158 : 407794 : _loop_vec_info::~_loop_vec_info ()
1159 : : {
1160 : 407794 : free (bbs);
1161 : :
1162 : 407794 : release_vec_loop_controls (&masks.rgc_vec);
1163 : 407794 : release_vec_loop_controls (&lens);
1164 : 411558 : delete ivexpr_map;
1165 : 408116 : delete scan_map;
1166 : 407794 : delete scalar_costs;
1167 : 407794 : delete vector_costs;
1168 : :
1169 : : /* When we release an epilogue vinfo that we do not intend to use,
1170 : : avoid clearing AUX of the main loop, which should continue to
1171 : : point to the main loop vinfo; otherwise we would leak it. */
1172 : 407794 : if (loop->aux == this)
1173 : 56876 : loop->aux = NULL;
1174 : 815588 : }
1175 : :
1176 : : /* Return an invariant or register for EXPR and emit necessary
1177 : : computations in the LOOP_VINFO loop preheader. */
1178 : :
1179 : : tree
1180 : 19209 : cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1181 : : {
1182 : 19209 : if (is_gimple_reg (expr)
1183 : 19209 : || is_gimple_min_invariant (expr))
1184 : 6324 : return expr;
1185 : :
1186 : 12885 : if (! loop_vinfo->ivexpr_map)
1187 : 3764 : loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1188 : 12885 : tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1189 : 12885 : if (! cached)
1190 : : {
1191 : 8388 : gimple_seq stmts = NULL;
1192 : 8388 : cached = force_gimple_operand (unshare_expr (expr),
1193 : : &stmts, true, NULL_TREE);
1194 : 8388 : if (stmts)
1195 : : {
1196 : 8246 : edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1197 : 8246 : gsi_insert_seq_on_edge_immediate (e, stmts);
1198 : : }
1199 : : }
1200 : 12885 : return cached;
1201 : : }
1202 : :
1203 : : /* Return true if we can use CMP_TYPE as the comparison type to produce
1204 : : all masks required to mask LOOP_VINFO. */
1205 : :
1206 : : static bool
1207 : 97 : can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1208 : : {
1209 : 97 : rgroup_controls *rgm;
1210 : 97 : unsigned int i;
1211 : 110 : FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1212 : 110 : if (rgm->type != NULL_TREE
1213 : 110 : && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1214 : : cmp_type, rgm->type,
1215 : : OPTIMIZE_FOR_SPEED))
1216 : : return false;
1217 : : return true;
1218 : : }
1219 : :
1220 : : /* Calculate the maximum number of scalars per iteration for every
1221 : : rgroup in LOOP_VINFO. */
1222 : :
1223 : : static unsigned int
1224 : 23 : vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1225 : : {
1226 : 23 : unsigned int res = 1;
1227 : 23 : unsigned int i;
1228 : 23 : rgroup_controls *rgm;
1229 : 67 : FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1230 : 44 : res = MAX (res, rgm->max_nscalars_per_iter);
1231 : 23 : return res;
1232 : : }
1233 : :
1234 : : /* Calculate the minimum precision necessary to represent:
1235 : :
1236 : : MAX_NITERS * FACTOR
1237 : :
1238 : : as an unsigned integer, where MAX_NITERS is the maximum number of
1239 : : loop header iterations for the original scalar form of LOOP_VINFO. */
1240 : :
1241 : : static unsigned
1242 : 23 : vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1243 : : {
1244 : 23 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1245 : :
1246 : : /* Get the maximum number of iterations that is representable
1247 : : in the counter type. */
1248 : 23 : tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1249 : 23 : widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1250 : :
1251 : : /* Get a more refined estimate for the number of iterations. */
1252 : 23 : widest_int max_back_edges;
1253 : 23 : if (max_loop_iterations (loop, &max_back_edges))
1254 : 23 : max_ni = wi::smin (max_ni, max_back_edges + 1);
1255 : :
1256 : : /* Work out how many bits we need to represent the limit. */
1257 : 23 : return wi::min_precision (max_ni * factor, UNSIGNED);
1258 : 23 : }
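Illustrative numbers only: if the niter type could represent about 2^32 iterations but max_loop_iterations bounds the loop at 1000 back edges, max_ni becomes 1001; with FACTOR 4 the product is 4004, which needs 12 bits, so 12 is returned.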
1259 : :
1260 : : /* True if the loop needs peeling or partial vectors when vectorized. */
1261 : :
1262 : : static bool
1263 : 111511 : vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1264 : : {
1265 : 111511 : unsigned HOST_WIDE_INT const_vf;
1266 : 111511 : HOST_WIDE_INT max_niter
1267 : 111511 : = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1268 : :
1269 : 111511 : unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1270 : 111511 : if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1271 : 14550 : th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1272 : : (loop_vinfo));
1273 : :
1274 : 111511 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1275 : 51283 : && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1276 : : {
1277 : : /* Work out the (constant) number of iterations that need to be
1278 : : peeled for reasons other than niters. */
1279 : 51250 : unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1280 : 51250 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1281 : 382 : peel_niter += 1;
1282 : 110467 : if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1283 : : LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1284 : : return true;
1285 : : }
1286 : 60261 : else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1287 : : /* ??? When peeling for gaps but not alignment, we could
1288 : : try to check whether the (variable) niters is known to be
1289 : : VF * N + 1. That's something of a niche case though. */
1290 : 60034 : || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1291 : 59164 : || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1292 : 119425 : || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1293 : 118328 : < (unsigned) exact_log2 (const_vf))
1294 : : /* In case of versioning, check if the maximum number of
1295 : : iterations is greater than th. If they are identical,
1296 : : the epilogue is unnecessary. */
1297 : 58146 : && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1298 : 4283 : || ((unsigned HOST_WIDE_INT) max_niter
1299 : : /* We'd like to use LOOP_VINFO_VERSIONING_THRESHOLD
1300 : : but that's only computed later based on our result.
1301 : : The following is the most conservative approximation. */
1302 : 4283 : > (std::max ((unsigned HOST_WIDE_INT) th,
1303 : 4283 : const_vf) / const_vf) * const_vf))))
1304 : 59217 : return true;
1305 : :
1306 : : return false;
1307 : : }
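For example (illustrative numbers): with a known iteration count of 10, a vectorization factor of 4 and no peeling for alignment or gaps, 10 is not a multiple of 4, so the function returns true and the 2 leftover iterations need an epilogue or partial vectors; with 12 iterations it would return false.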
1308 : :
1309 : : /* Each statement in LOOP_VINFO can be masked where necessary. Check
1310 : : whether we can actually generate the masks required. Return true if so,
1311 : : storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
1312 : :
1313 : : static bool
1314 : 23 : vect_verify_full_masking (loop_vec_info loop_vinfo)
1315 : : {
1316 : 23 : unsigned int min_ni_width;
1317 : :
1318 : : /* Use a normal loop if there are no statements that need masking.
1319 : : This only happens in rare degenerate cases: it means that the loop
1320 : : has no loads, no stores, and no live-out values. */
1321 : 23 : if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1322 : : return false;
1323 : :
1324 : : /* Produce the rgroup controls. */
1325 : 81 : for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1326 : : {
1327 : 29 : vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1328 : 29 : tree vectype = mask.first;
1329 : 29 : unsigned nvectors = mask.second;
1330 : :
1331 : 35 : if (masks->rgc_vec.length () < nvectors)
1332 : 27 : masks->rgc_vec.safe_grow_cleared (nvectors, true);
1333 : 29 : rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
1334 : : /* The number of scalars per iteration and the number of vectors are
1335 : : both compile-time constants. */
1336 : 29 : unsigned int nscalars_per_iter
1337 : 29 : = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1338 : 29 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1339 : :
1340 : 29 : if (rgm->max_nscalars_per_iter < nscalars_per_iter)
1341 : : {
1342 : 29 : rgm->max_nscalars_per_iter = nscalars_per_iter;
1343 : 29 : rgm->type = truth_type_for (vectype);
1344 : 29 : rgm->factor = 1;
1345 : : }
1346 : : }
1347 : :
1348 : 23 : unsigned int max_nscalars_per_iter
1349 : 23 : = vect_get_max_nscalars_per_iter (loop_vinfo);
1350 : :
1351 : : /* Work out how many bits we need to represent the limit. */
1352 : 23 : min_ni_width
1353 : 23 : = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1354 : :
1355 : : /* Find a scalar mode for which WHILE_ULT is supported. */
1356 : 23 : opt_scalar_int_mode cmp_mode_iter;
1357 : 23 : tree cmp_type = NULL_TREE;
1358 : 23 : tree iv_type = NULL_TREE;
1359 : 23 : widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1360 : 23 : unsigned int iv_precision = UINT_MAX;
1361 : :
1362 : 23 : if (iv_limit != -1)
1363 : 23 : iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1364 : : UNSIGNED);
1365 : :
1366 : 184 : FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1367 : : {
1368 : 161 : unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1369 : 161 : if (cmp_bits >= min_ni_width
1370 : 161 : && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1371 : : {
1372 : 97 : tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1373 : 97 : if (this_type
1374 : 97 : && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1375 : : {
1376 : : /* Although we could stop as soon as we find a valid mode,
1377 : : there are at least two reasons why that's not always the
1378 : : best choice:
1379 : :
1380 : : - An IV that's Pmode or wider is more likely to be reusable
1381 : : in address calculations than an IV that's narrower than
1382 : : Pmode.
1383 : :
1384 : : - Doing the comparison in IV_PRECISION or wider allows
1385 : : a natural 0-based IV, whereas using a narrower comparison
1386 : : type requires mitigations against wrap-around.
1387 : :
1388 : : Conversely, if the IV limit is variable, doing the comparison
1389 : : in a wider type than the original type can introduce
1390 : : unnecessary extensions, so picking the widest valid mode
1391 : : is not always a good choice either.
1392 : :
1393 : : Here we prefer the first IV type that's Pmode or wider,
1394 : : and the first comparison type that's IV_PRECISION or wider.
1395 : : (The comparison type must be no wider than the IV type,
1396 : : to avoid extensions in the vector loop.)
1397 : :
1398 : : ??? We might want to try continuing beyond Pmode for ILP32
1399 : : targets if CMP_BITS < IV_PRECISION. */
1400 : 0 : iv_type = this_type;
1401 : 0 : if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1402 : : cmp_type = this_type;
1403 : 0 : if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1404 : : break;
1405 : : }
1406 : : }
1407 : : }
1408 : :
1409 : 23 : if (!cmp_type)
1410 : : {
1411 : 23 : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
1412 : 23 : return false;
1413 : : }
1414 : :
1415 : 0 : LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1416 : 0 : LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1417 : 0 : LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
1418 : 0 : return true;
1419 : 23 : }
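     : : /* Illustrative sketch (not the generated GIMPLE) of the fully-masked loop
     : :    that the WHILE_ULT style above enables on a capable target (e.g. SVE):
     : :
     : :      for (i = 0; i < n; i += VF)
     : :        {
     : :          mask = .WHILE_ULT (i, n, ...);   // lane L active iff i + L < n
     : :          ... = .MASK_LOAD (..., mask);
     : :          .MASK_STORE (..., mask, ...);
     : :        }
     : :
     : :    so the final, partial iteration is handled by the mask rather than by
     : :    a scalar epilogue.  */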
1420 : :
1421 : : /* Each statement in LOOP_VINFO can be masked where necessary. Check
1422 : : whether we can actually generate AVX512 style masks. Return true if so,
1423 : : storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE. */
1424 : :
1425 : : static bool
1426 : 23 : vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
1427 : : {
1428 : :   /* Produce a differently organized rgc_vec and check in a different
1429 : :      way whether we can produce masks.  */
1430 : :
1431 : : /* Use a normal loop if there are no statements that need masking.
1432 : : This only happens in rare degenerate cases: it means that the loop
1433 : : has no loads, no stores, and no live-out values. */
1434 : 23 : if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1435 : : return false;
1436 : :
1437 : : /* For the decrementing IV we need to represent all values in
1438 : :      [0, niter + niter_skip], where niter_skip is the number of elements
1439 : :      we skip in the first iteration for prologue peeling.  */
1440 : 23 : tree iv_type = NULL_TREE;
1441 : 23 : widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1442 : 23 : unsigned int iv_precision = UINT_MAX;
1443 : 23 : if (iv_limit != -1)
1444 : 23 : iv_precision = wi::min_precision (iv_limit, UNSIGNED);
1445 : :
1446 : : /* First compute the type for the IV we use to track the remaining
1447 : : scalar iterations. */
1448 : 23 : opt_scalar_int_mode cmp_mode_iter;
1449 : 41 : FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1450 : : {
1451 : 41 : unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1452 : 41 : if (cmp_bits >= iv_precision
1453 : 41 : && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1454 : : {
1455 : 23 : iv_type = build_nonstandard_integer_type (cmp_bits, true);
1456 : 23 : if (iv_type)
1457 : : break;
1458 : : }
1459 : : }
1460 : 23 : if (!iv_type)
1461 : : return false;
1462 : :
1463 : : /* Produce the rgroup controls. */
1464 : 81 : for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1465 : : {
1466 : 29 : vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1467 : 29 : tree vectype = mask.first;
1468 : 29 : unsigned nvectors = mask.second;
1469 : :
1470 : : /* The number of scalars per iteration and the number of vectors are
1471 : : both compile-time constants. */
1472 : 29 : unsigned int nscalars_per_iter
1473 : 29 : = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1474 : 29 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1475 : :
1476 : :       /* We index the rgroup_controls vector by nscalars_per_iter, which we
1477 : :          keep constant, and instead let nvectors vary, remembering the
1478 : :          vector mask with the fewest nV.  */
1479 : 35 : if (masks->rgc_vec.length () < nscalars_per_iter)
1480 : 23 : masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true);
1481 : 29 : rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];
1482 : :
1483 : 29 : if (!rgm->type || rgm->factor > nvectors)
1484 : : {
1485 : 25 : rgm->type = truth_type_for (vectype);
1486 : 25 : rgm->compare_type = NULL_TREE;
1487 : 25 : rgm->max_nscalars_per_iter = nscalars_per_iter;
1488 : 25 : rgm->factor = nvectors;
1489 : 25 : rgm->bias_adjusted_ctrl = NULL_TREE;
1490 : : }
1491 : : }
1492 : :
1493 : : /* There is no fixed compare type we are going to use but we have to
1494 : : be able to get at one for each mask group. */
1495 : 23 : unsigned int min_ni_width
1496 : 23 : = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);
1497 : :
1498 : 23 : bool ok = true;
1499 : 94 : for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1500 : : {
1501 : 25 : tree mask_type = rgc.type;
1502 : 25 : if (!mask_type)
1503 : 2 : continue;
1504 : :
1505 : : /* For now vect_get_loop_mask only supports integer mode masks
1506 : : when we need to split it. */
1507 : 23 : if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_INT
1508 : 23 : || TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
1509 : : {
1510 : : ok = false;
1511 : : break;
1512 : : }
1513 : :
1514 : : /* If iv_type is usable as compare type use that - we can elide the
1515 : : saturation in that case. */
1516 : 23 : if (TYPE_PRECISION (iv_type) >= min_ni_width)
1517 : : {
1518 : 23 : tree cmp_vectype
1519 : 23 : = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
1520 : 23 : if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1521 : 1 : rgc.compare_type = cmp_vectype;
1522 : : }
1523 : 23 : if (!rgc.compare_type)
1524 : 55 : FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1525 : : {
1526 : 55 : unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1527 : 55 : if (cmp_bits >= min_ni_width
1528 : 55 : && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1529 : : {
1530 : 55 : tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
1531 : 55 : if (!cmp_type)
1532 : 0 : continue;
1533 : :
1534 : : /* Check whether we can produce the mask with cmp_type. */
1535 : 55 : tree cmp_vectype
1536 : 55 : = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
1537 : 55 : if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1538 : : {
1539 : 22 : rgc.compare_type = cmp_vectype;
1540 : 22 : break;
1541 : : }
1542 : : }
1543 : : }
1544 : 23 : if (!rgc.compare_type)
1545 : : {
1546 : : ok = false;
1547 : : break;
1548 : : }
1549 : : }
1550 : 23 : if (!ok)
1551 : : {
1552 : 0 : release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
1553 : 0 : return false;
1554 : : }
1555 : :
1556 : 23 : LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
1557 : 23 : LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1558 : 23 : LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
1559 : 23 : return true;
1560 : 23 : }
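     : : /* Note: in contrast to the WHILE_ULT scheme above, the AVX512 style
     : :    materializes each mask with an LT_EXPR vector comparison in the
     : :    per-rgroup compare_type chosen above (roughly speaking, comparing a
     : :    vector of lane indices against the number of remaining scalar
     : :    iterations), so no WHILE_ULT support is required from the target.  */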
1561 : :
1562 : : /* Check whether we can use vector accesses with length based on a precision
1563 : :    comparison.  So far, to keep it simple, we only allow the case in which the
1564 : :    precision of the target-supported length is larger than the precision
1565 : :    required by the loop niters.  */
1566 : :
1567 : : static bool
1568 : 0 : vect_verify_loop_lens (loop_vec_info loop_vinfo)
1569 : : {
1570 : 0 : if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1571 : : return false;
1572 : :
1573 : 0 : if (!VECTOR_MODE_P (loop_vinfo->vector_mode))
1574 : : return false;
1575 : :
1576 : 0 : machine_mode len_load_mode, len_store_mode;
1577 : 0 : if (!get_len_load_store_mode (loop_vinfo->vector_mode, true)
1578 : 0 : .exists (&len_load_mode))
1579 : 0 : return false;
1580 : 0 : if (!get_len_load_store_mode (loop_vinfo->vector_mode, false)
1581 : 0 : .exists (&len_store_mode))
1582 : 0 : return false;
1583 : :
1584 : 0 : signed char partial_load_bias = internal_len_load_store_bias
1585 : 0 : (IFN_LEN_LOAD, len_load_mode);
1586 : :
1587 : 0 : signed char partial_store_bias = internal_len_load_store_bias
1588 : 0 : (IFN_LEN_STORE, len_store_mode);
1589 : :
1590 : 0 : gcc_assert (partial_load_bias == partial_store_bias);
1591 : :
1592 : 0 : if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1593 : : return false;
1594 : :
1595 : : /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1596 : : len_loads with a length of zero. In order to avoid that we prohibit
1597 : : more than one loop length here. */
1598 : 0 : if (partial_load_bias == -1
1599 : 0 : && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1600 : : return false;
1601 : :
1602 : 0 : LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1603 : :
1604 : 0 : unsigned int max_nitems_per_iter = 1;
1605 : 0 : unsigned int i;
1606 : 0 : rgroup_controls *rgl;
1607 : : /* Find the maximum number of items per iteration for every rgroup. */
1608 : 0 : FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1609 : : {
1610 : 0 : unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1611 : 0 : max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1612 : : }
1613 : :
1614 : : /* Work out how many bits we need to represent the length limit. */
1615 : 0 : unsigned int min_ni_prec
1616 : 0 : = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1617 : :
1618 : :   /* Now use the maximum of the precisions below for one suitable IV type:
1619 : :      - the IV's natural precision
1620 : :      - the precision needed to hold: the maximum number of scalar
1621 : :        iterations multiplied by the scale factor (min_ni_prec above)
1622 : :      - the Pmode precision
1623 : :
1624 : :      If min_ni_prec is less than the precision of the current niters,
1625 : :      we prefer to still use the niters type.  Prefer to use Pmode and
1626 : :      a wider IV to avoid narrow conversions.  */
1627 : :
1628 : 0 : unsigned int ni_prec
1629 : 0 : = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1630 : 0 : min_ni_prec = MAX (min_ni_prec, ni_prec);
1631 : 0 : min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
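     : :   /* For example, if the rgroups only need 17 bits but the niters type is
     : :      32 bits wide and Pmode is DImode, min_ni_prec becomes 64, so the
     : :      search below settles on a 64-bit IV type (provided BITS_PER_WORD is
     : :      at least 64).  */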
1632 : :
1633 : 0 : tree iv_type = NULL_TREE;
1634 : 0 : opt_scalar_int_mode tmode_iter;
1635 : 0 : FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1636 : : {
1637 : 0 : scalar_mode tmode = tmode_iter.require ();
1638 : 0 : unsigned int tbits = GET_MODE_BITSIZE (tmode);
1639 : :
1640 : : /* ??? Do we really want to construct one IV whose precision exceeds
1641 : : BITS_PER_WORD? */
1642 : 0 : if (tbits > BITS_PER_WORD)
1643 : : break;
1644 : :
1645 : : /* Find the first available standard integral type. */
1646 : 0 : if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1647 : : {
1648 : 0 : iv_type = build_nonstandard_integer_type (tbits, true);
1649 : 0 : break;
1650 : : }
1651 : : }
1652 : :
1653 : 0 : if (!iv_type)
1654 : : {
1655 : 0 : if (dump_enabled_p ())
1656 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1657 : : "can't vectorize with length-based partial vectors"
1658 : : " because there is no suitable iv type.\n");
1659 : 0 : return false;
1660 : : }
1661 : :
1662 : 0 : LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1663 : 0 : LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1664 : 0 : LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;
1665 : :
1666 : 0 : return true;
1667 : : }
1668 : :
1669 : : /* Calculate the cost of one scalar iteration of the loop. */
1670 : : static void
1671 : 221199 : vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1672 : : {
1673 : 221199 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1674 : 221199 : basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1675 : 221199 : int nbbs = loop->num_nodes, factor;
1676 : 221199 : int innerloop_iters, i;
1677 : :
1678 : 221199 : DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1679 : :
1680 : : /* Gather costs for statements in the scalar loop. */
1681 : :
1682 : : /* FORNOW. */
1683 : 221199 : innerloop_iters = 1;
1684 : 221199 : if (loop->inner)
1685 : 1184 : innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1686 : :
1687 : 775742 : for (i = 0; i < nbbs; i++)
1688 : : {
1689 : 554543 : gimple_stmt_iterator si;
1690 : 554543 : basic_block bb = bbs[i];
1691 : :
1692 : 554543 : if (bb->loop_father == loop->inner)
1693 : : factor = innerloop_iters;
1694 : : else
1695 : 552175 : factor = 1;
1696 : :
1697 : 4367794 : for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1698 : : {
1699 : 3258708 : gimple *stmt = gsi_stmt (si);
1700 : 3258708 : stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1701 : :
1702 : 3258708 : if (!is_gimple_assign (stmt)
1703 : : && !is_gimple_call (stmt)
1704 : : && !is_a<gcond *> (stmt))
1705 : 1033991 : continue;
1706 : :
1707 : : /* Skip stmts that are not vectorized inside the loop. */
1708 : 2224717 : stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1709 : 2224717 : if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1710 : 972336 : && (!STMT_VINFO_LIVE_P (vstmt_info)
1711 : 77 : || !VECTORIZABLE_CYCLE_DEF
1712 : : (STMT_VINFO_DEF_TYPE (vstmt_info))))
1713 : 972336 : continue;
1714 : :
1715 : 1252381 : vect_cost_for_stmt kind;
1716 : 1252381 : if (STMT_VINFO_DATA_REF (stmt_info))
1717 : : {
1718 : 535963 : if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1719 : : kind = scalar_load;
1720 : : else
1721 : 201953 : kind = scalar_store;
1722 : : }
1723 : 716418 : else if (vect_nop_conversion_p (stmt_info))
1724 : 33254 : continue;
1725 : : else
1726 : : kind = scalar_stmt;
1727 : :
1728 : : /* We are using vect_prologue here to avoid scaling twice
1729 : : by the inner loop factor. */
1730 : 1219127 : record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1731 : : factor, kind, stmt_info, 0, vect_prologue);
1732 : : }
1733 : : }
1734 : :
1735 : : /* Now accumulate cost. */
1736 : 221199 : loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1737 : 221199 : add_stmt_costs (loop_vinfo->scalar_costs,
1738 : : &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1739 : 221199 : loop_vinfo->scalar_costs->finish_cost (nullptr);
1740 : 221199 : }
1741 : :
1742 : : /* Function vect_analyze_loop_form.
1743 : :
1744 : : Verify that certain CFG restrictions hold, including:
1745 : : - the loop has a pre-header
1746 : : - the loop has a single entry
1747 : : - nested loops can have only a single exit.
1748 : : - the loop exit condition is simple enough
1749 : : - the number of iterations can be analyzed, i.e, a countable loop. The
1750 : : niter could be analyzed under some assumptions. */
1751 : :
1752 : : opt_result
1753 : 456593 : vect_analyze_loop_form (class loop *loop, gimple *loop_vectorized_call,
1754 : : vect_loop_form_info *info)
1755 : : {
1756 : 456593 : DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1757 : :
1758 : 456593 : edge exit_e = vec_init_loop_exit_info (loop);
1759 : 456593 : if (!exit_e)
1760 : 56694 : return opt_result::failure_at (vect_location,
1761 : : "not vectorized:"
1762 : : " could not determine main exit from"
1763 : : " loop with multiple exits.\n");
1764 : 399899 : if (loop_vectorized_call)
1765 : : {
1766 : 25746 : tree arg = gimple_call_arg (loop_vectorized_call, 1);
1767 : 25746 : class loop *scalar_loop = get_loop (cfun, tree_to_shwi (arg));
1768 : 25746 : edge scalar_exit_e = vec_init_loop_exit_info (scalar_loop);
1769 : 25746 : if (!scalar_exit_e)
1770 : 0 : return opt_result::failure_at (vect_location,
1771 : : "not vectorized:"
1772 : : " could not determine main exit from"
1773 : : " loop with multiple exits.\n");
1774 : : }
1775 : :
1776 : 399899 : info->loop_exit = exit_e;
1777 : 399899 : if (dump_enabled_p ())
1778 : 14840 : dump_printf_loc (MSG_NOTE, vect_location,
1779 : : "using as main loop exit: %d -> %d [AUX: %p]\n",
1780 : 14840 : exit_e->src->index, exit_e->dest->index, exit_e->aux);
1781 : :
1782 : : /* Check if we have any control flow that doesn't leave the loop. */
1783 : 399899 : basic_block *bbs = get_loop_body (loop);
1784 : 1343737 : for (unsigned i = 0; i < loop->num_nodes; i++)
1785 : 1051268 : if (EDGE_COUNT (bbs[i]->succs) != 1
1786 : 1051268 : && (EDGE_COUNT (bbs[i]->succs) != 2
1787 : 621361 : || !loop_exits_from_bb_p (bbs[i]->loop_father, bbs[i])))
1788 : : {
1789 : 107430 : free (bbs);
1790 : 107430 : return opt_result::failure_at (vect_location,
1791 : : "not vectorized:"
1792 : : " unsupported control flow in loop.\n");
1793 : : }
1794 : 292469 : free (bbs);
1795 : :
1796 : : /* Different restrictions apply when we are considering an inner-most loop,
1797 : : vs. an outer (nested) loop.
1798 : : (FORNOW. May want to relax some of these restrictions in the future). */
1799 : :
1800 : 292469 : info->inner_loop_cond = NULL;
1801 : 292469 : if (!loop->inner)
1802 : : {
1803 : : /* Inner-most loop. */
1804 : :
1805 : 271326 : if (empty_block_p (loop->header))
1806 : 3 : return opt_result::failure_at (vect_location,
1807 : : "not vectorized: empty loop.\n");
1808 : : }
1809 : : else
1810 : : {
1811 : 21143 : class loop *innerloop = loop->inner;
1812 : 21143 : edge entryedge;
1813 : :
1814 : : /* Nested loop. We currently require that the loop is doubly-nested,
1815 : : contains a single inner loop with a single exit to the block
1816 : : with the single exit condition in the outer loop.
1817 : : Vectorizable outer-loops look like this:
1818 : :
1819 : : (pre-header)
1820 : : |
1821 : : header <---+
1822 : : | |
1823 : : inner-loop |
1824 : : | |
1825 : : tail ------+
1826 : : |
1827 : : (exit-bb)
1828 : :
1829 : : The inner-loop also has the properties expected of inner-most loops
1830 : : as described above. */
1831 : :
1832 : 21143 : if ((loop->inner)->inner || (loop->inner)->next)
1833 : 2971 : return opt_result::failure_at (vect_location,
1834 : : "not vectorized:"
1835 : : " multiple nested loops.\n");
1836 : :
1837 : 18172 : entryedge = loop_preheader_edge (innerloop);
1838 : 18172 : if (entryedge->src != loop->header
1839 : 17827 : || !single_exit (innerloop)
1840 : 29261 : || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1841 : 7365 : return opt_result::failure_at (vect_location,
1842 : : "not vectorized:"
1843 : : " unsupported outerloop form.\n");
1844 : :
1845 : : /* Analyze the inner-loop. */
1846 : 10807 : vect_loop_form_info inner;
1847 : 10807 : opt_result res = vect_analyze_loop_form (loop->inner, NULL, &inner);
1848 : 10807 : if (!res)
1849 : : {
1850 : 1189 : if (dump_enabled_p ())
1851 : 5 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1852 : : "not vectorized: Bad inner loop.\n");
1853 : 1189 : return res;
1854 : : }
1855 : :
1856 : : /* Don't support analyzing niter under assumptions for inner
1857 : : loop. */
1858 : 9618 : if (!integer_onep (inner.assumptions))
1859 : 287 : return opt_result::failure_at (vect_location,
1860 : : "not vectorized: Bad inner loop.\n");
1861 : :
1862 : 9331 : if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1863 : 1083 : return opt_result::failure_at (vect_location,
1864 : : "not vectorized: inner-loop count not"
1865 : : " invariant.\n");
1866 : :
1867 : 8248 : if (dump_enabled_p ())
1868 : 924 : dump_printf_loc (MSG_NOTE, vect_location,
1869 : : "Considering outer-loop vectorization.\n");
1870 : 8248 : info->inner_loop_cond = inner.conds[0];
1871 : 10807 : }
1872 : :
1873 : 279571 : if (EDGE_COUNT (loop->header->preds) != 2)
1874 : 0 : return opt_result::failure_at (vect_location,
1875 : : "not vectorized:"
1876 : : " too many incoming edges.\n");
1877 : :
1878 : : /* We assume that the latch is empty. */
1879 : 279571 : basic_block latch = loop->latch;
1880 : 279571 : do
1881 : : {
1882 : 279571 : if (!empty_block_p (latch)
1883 : 279571 : || !gimple_seq_empty_p (phi_nodes (latch)))
1884 : 19999 : return opt_result::failure_at (vect_location,
1885 : : "not vectorized: latch block not "
1886 : : "empty.\n");
1887 : 259572 : latch = single_pred (latch);
1888 : : }
1889 : 519144 : while (single_succ_p (latch));
1890 : :
1891 : : /* Make sure there is no abnormal exit. */
1892 : 259572 : auto_vec<edge> exits = get_loop_exit_edges (loop);
1893 : 1146652 : for (edge e : exits)
1894 : : {
1895 : 367980 : if (e->flags & EDGE_ABNORMAL)
1896 : 44 : return opt_result::failure_at (vect_location,
1897 : : "not vectorized:"
1898 : : " abnormal loop exit edge.\n");
1899 : : }
1900 : :
1901 : 259528 : info->conds
1902 : 259528 : = vect_get_loop_niters (loop, exit_e, &info->assumptions,
1903 : : &info->number_of_iterations,
1904 : 259528 : &info->number_of_iterationsm1);
1905 : 259528 : if (info->conds.is_empty ())
1906 : 32 : return opt_result::failure_at
1907 : 32 : (vect_location,
1908 : : "not vectorized: complicated exit condition.\n");
1909 : :
1910 : : /* Determine what the primary and alternate exit conds are. */
1911 : 617239 : for (unsigned i = 0; i < info->conds.length (); i++)
1912 : : {
1913 : 357743 : gcond *cond = info->conds[i];
1914 : 357743 : if (exit_e->src == gimple_bb (cond))
1915 : 259496 : std::swap (info->conds[0], info->conds[i]);
1916 : : }
1917 : :
1918 : 259496 : if (integer_zerop (info->assumptions)
1919 : 259496 : || !info->number_of_iterations
1920 : 518992 : || chrec_contains_undetermined (info->number_of_iterations))
1921 : 40282 : return opt_result::failure_at
1922 : 40282 : (info->conds[0],
1923 : : "not vectorized: number of iterations cannot be computed.\n");
1924 : :
1925 : 219214 : if (integer_zerop (info->number_of_iterations))
1926 : 14 : return opt_result::failure_at
1927 : 14 : (info->conds[0],
1928 : : "not vectorized: number of iterations = 0.\n");
1929 : :
1930 : 219200 : if (!(tree_fits_shwi_p (info->number_of_iterations)
1931 : 118572 : && tree_to_shwi (info->number_of_iterations) > 0))
1932 : : {
1933 : 100628 : if (dump_enabled_p ())
1934 : : {
1935 : 2271 : dump_printf_loc (MSG_NOTE, vect_location,
1936 : : "Symbolic number of iterations is ");
1937 : 2271 : dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1938 : 2271 : dump_printf (MSG_NOTE, "\n");
1939 : : }
1940 : : }
1941 : :
1942 : 219200 : return opt_result::success ();
1943 : 259572 : }
1944 : :
1945 : : /* Create a loop_vec_info for LOOP with SHARED and the
1946 : : vect_analyze_loop_form result. */
1947 : :
1948 : : loop_vec_info
1949 : 407794 : vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1950 : : const vect_loop_form_info *info,
1951 : : loop_vec_info orig_loop_info)
1952 : : {
1953 : 407794 : loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1954 : 407794 : LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1955 : 407794 : LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1956 : 407794 : LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1957 : 407794 : LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_info;
1958 : 407794 : if (orig_loop_info && LOOP_VINFO_EPILOGUE_P (orig_loop_info))
1959 : 145 : LOOP_VINFO_MAIN_LOOP_INFO (loop_vinfo)
1960 : 145 : = LOOP_VINFO_MAIN_LOOP_INFO (orig_loop_info);
1961 : : else
1962 : 407649 : LOOP_VINFO_MAIN_LOOP_INFO (loop_vinfo) = orig_loop_info;
1963 : : /* Also record the assumptions for versioning. */
1964 : 407794 : if (!integer_onep (info->assumptions) && !orig_loop_info)
1965 : 21400 : LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1966 : :
1967 : 1846799 : for (gcond *cond : info->conds)
1968 : : {
1969 : 623417 : stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (cond);
1970 : 623417 : STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1971 : : /* Mark the statement as a condition. */
1972 : 623417 : STMT_VINFO_DEF_TYPE (loop_cond_info) = vect_condition_def;
1973 : : }
1974 : :
1975 : 623417 : for (unsigned i = 1; i < info->conds.length (); i ++)
1976 : 215623 : LOOP_VINFO_LOOP_CONDS (loop_vinfo).safe_push (info->conds[i]);
1977 : 407794 : LOOP_VINFO_LOOP_IV_COND (loop_vinfo) = info->conds[0];
1978 : :
1979 : 407794 : LOOP_VINFO_IV_EXIT (loop_vinfo) = info->loop_exit;
1980 : :
1981 : : /* Check to see if we're vectorizing multiple exits. */
1982 : 407794 : LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
1983 : 407794 : = !LOOP_VINFO_LOOP_CONDS (loop_vinfo).is_empty ();
1984 : :
1985 : 407794 : if (info->inner_loop_cond)
1986 : : {
1987 : 8437 : stmt_vec_info inner_loop_cond_info
1988 : 8437 : = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1989 : 8437 : STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1990 : : /* If we have an estimate on the number of iterations of the inner
1991 : : loop use that to limit the scale for costing, otherwise use
1992 : : --param vect-inner-loop-cost-factor literally. */
1993 : 8437 : widest_int nit;
1994 : 8437 : if (estimated_stmt_executions (loop->inner, &nit))
1995 : 7218 : LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1996 : 7218 : = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1997 : 8437 : }
1998 : :
1999 : 407794 : return loop_vinfo;
2000 : : }
2001 : :
2002 : :
2003 : :
2004 : : /* Scan the loop stmts and, depending on whether there are any non-SLP
2005 : :    statements, update the vectorization factor.  */
2006 : :
2007 : : static void
2008 : 344892 : vect_update_vf_for_slp (loop_vec_info loop_vinfo)
2009 : : {
2010 : 344892 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2011 : 344892 : basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2012 : 344892 : int nbbs = loop->num_nodes;
2013 : 344892 : poly_uint64 vectorization_factor;
2014 : 344892 : int i;
2015 : :
2016 : 344892 : DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
2017 : :
2018 : 344892 : vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2019 : 344892 : gcc_assert (known_ne (vectorization_factor, 0U));
2020 : :
2021 : :   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
2022 : :      the vectorization factor of the loop is the unrolling factor required
2023 : :      by the SLP instances.  If that unrolling factor is 1, we say that we
2024 : :      perform pure SLP on the loop - cross-iteration parallelism is not
2025 : :      exploited.  */
2026 : : bool only_slp_in_loop = true;
2027 : 1251798 : for (i = 0; i < nbbs; i++)
2028 : : {
2029 : 906906 : basic_block bb = bbs[i];
2030 : 1773254 : for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
2031 : 866348 : gsi_next (&si))
2032 : : {
2033 : 866348 : stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
2034 : 866348 : if (!stmt_info)
2035 : 0 : continue;
2036 : 866348 : if ((STMT_VINFO_RELEVANT_P (stmt_info)
2037 : 433552 : || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2038 : 432808 : && !PURE_SLP_STMT (stmt_info))
2039 : : /* STMT needs both SLP and loop-based vectorization. */
2040 : 866348 : only_slp_in_loop = false;
2041 : : }
2042 : 6851355 : for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2043 : 5037543 : gsi_next (&si))
2044 : : {
2045 : 5037543 : if (is_gimple_debug (gsi_stmt (si)))
2046 : 1678850 : continue;
2047 : 3358693 : stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2048 : 3358693 : stmt_info = vect_stmt_to_vectorize (stmt_info);
2049 : 3358693 : if ((STMT_VINFO_RELEVANT_P (stmt_info)
2050 : 1397516 : || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2051 : 1961181 : && !PURE_SLP_STMT (stmt_info))
2052 : : /* STMT needs both SLP and loop-based vectorization. */
2053 : 5037543 : only_slp_in_loop = false;
2054 : : }
2055 : : }
2056 : :
2057 : 344892 : if (only_slp_in_loop)
2058 : : {
2059 : 339676 : if (dump_enabled_p ())
2060 : 17738 : dump_printf_loc (MSG_NOTE, vect_location,
2061 : : "Loop contains only SLP stmts\n");
2062 : 339676 : vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
2063 : : }
2064 : : else
2065 : : {
2066 : 5216 : if (dump_enabled_p ())
2067 : 252 : dump_printf_loc (MSG_NOTE, vect_location,
2068 : : "Loop contains SLP and non-SLP stmts\n");
2069 : : /* Both the vectorization factor and unroll factor have the form
2070 : : GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
2071 : : so they must have a common multiple. */
2072 : 5216 : vectorization_factor
2073 : 5216 : = force_common_multiple (vectorization_factor,
2074 : 5216 : LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
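     : :       /* For example, a current vectorization factor of 2 combined with
     : :          an SLP unrolling factor of 4 yields an updated factor of 4,
     : :          their common multiple.  */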
2075 : : }
2076 : :
2077 : 344892 : LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
2078 : 344892 : if (dump_enabled_p ())
2079 : : {
2080 : 17990 : dump_printf_loc (MSG_NOTE, vect_location,
2081 : : "Updating vectorization factor to ");
2082 : 17990 : dump_dec (MSG_NOTE, vectorization_factor);
2083 : 17990 : dump_printf (MSG_NOTE, ".\n");
2084 : : }
2085 : 344892 : }
2086 : :
2087 : : /* Return true if STMT_INFO describes a double reduction phi and if
2088 : : the other phi in the reduction is also relevant for vectorization.
2089 : : This rejects cases such as:
2090 : :
2091 : : outer1:
2092 : : x_1 = PHI <x_3(outer2), ...>;
2093 : : ...
2094 : :
2095 : : inner:
2096 : : x_2 = ...;
2097 : : ...
2098 : :
2099 : : outer2:
2100 : : x_3 = PHI <x_2(inner)>;
2101 : :
2102 : : if nothing in x_2 or elsewhere makes x_1 relevant. */
2103 : :
2104 : : static bool
2105 : 157 : vect_active_double_reduction_p (stmt_vec_info stmt_info)
2106 : : {
2107 : 157 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2108 : : return false;
2109 : :
2110 : 0 : return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
2111 : : }
2112 : :
2113 : : /* Function vect_analyze_loop_operations.
2114 : :
2115 : : Scan the loop stmts and make sure they are all vectorizable. */
2116 : :
2117 : : static opt_result
2118 : 106390 : vect_analyze_loop_operations (loop_vec_info loop_vinfo)
2119 : : {
2120 : 106390 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2121 : 106390 : basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2122 : 106390 : int nbbs = loop->num_nodes;
2123 : 106390 : int i;
2124 : 106390 : stmt_vec_info stmt_info;
2125 : :
2126 : 106390 : DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
2127 : :
2128 : 320775 : for (i = 0; i < nbbs; i++)
2129 : : {
2130 : 215781 : basic_block bb = bbs[i];
2131 : :
2132 : 518677 : for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
2133 : 302896 : gsi_next (&si))
2134 : : {
2135 : 304206 : gphi *phi = si.phi ();
2136 : :
2137 : 304206 : stmt_info = loop_vinfo->lookup_stmt (phi);
2138 : 304206 : if (dump_enabled_p ())
2139 : 34591 : dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
2140 : : (gimple *) phi);
2141 : 608412 : if (virtual_operand_p (gimple_phi_result (phi)))
2142 : 83539 : continue;
2143 : :
2144 : :       /* ??? All of the below unconditional FAILs should be
2145 : :          done earlier, after analyzing cycles, possibly when
2146 : :          determining stmt relevancy?  */
2147 : :
2148 : : /* Inner-loop loop-closed exit phi in outer-loop vectorization
2149 : : (i.e., a phi in the tail of the outer-loop). */
2150 : 220667 : if (! is_loop_header_bb_p (bb))
2151 : : {
2152 : :         /* FORNOW: we currently don't support the case that these phis
2153 : :            are not used in the outer loop (unless it is a double reduction,
2154 : :            i.e., this phi is vect_reduction_def), because this case
2155 : :            would require actually doing something here.  */
2156 : 769 : if (STMT_VINFO_LIVE_P (stmt_info)
2157 : 872 : && !vect_active_double_reduction_p (stmt_info))
2158 : 54 : return opt_result::failure_at (phi,
2159 : : "Unsupported loop-closed phi"
2160 : : " in outer-loop.\n");
2161 : :
2162 : : /* If PHI is used in the outer loop, we check that its operand
2163 : : is defined in the inner loop. */
2164 : 715 : if (STMT_VINFO_RELEVANT_P (stmt_info))
2165 : : {
2166 : 711 : tree phi_op;
2167 : :
2168 : 711 : if (gimple_phi_num_args (phi) != 1)
2169 : 0 : return opt_result::failure_at (phi, "unsupported phi");
2170 : :
2171 : 711 : phi_op = PHI_ARG_DEF (phi, 0);
2172 : 711 : stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
2173 : 711 : if (!op_def_info)
2174 : 0 : return opt_result::failure_at (phi, "unsupported phi\n");
2175 : :
2176 : 711 : if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
2177 : 711 : && (STMT_VINFO_RELEVANT (op_def_info)
2178 : : != vect_used_in_outer_by_reduction))
2179 : 240 : return opt_result::failure_at (phi, "unsupported phi\n");
2180 : :
2181 : 471 : if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2182 : 103 : || (STMT_VINFO_DEF_TYPE (stmt_info)
2183 : : == vect_double_reduction_def))
2184 : 471 : && ! PURE_SLP_STMT (stmt_info))
2185 : 0 : return opt_result::failure_at (phi, "unsupported phi\n");
2186 : : }
2187 : :
2188 : 475 : continue;
2189 : 475 : }
2190 : :
2191 : 219898 : gcc_assert (stmt_info);
2192 : :
2193 : 219898 : if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
2194 : 199290 : || STMT_VINFO_LIVE_P (stmt_info))
2195 : 23811 : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
2196 : 184 : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
2197 : : /* A scalar-dependence cycle that we don't support. */
2198 : 0 : return opt_result::failure_at (phi,
2199 : : "not vectorized:"
2200 : : " scalar dependence cycle.\n");
2201 : :
2202 : 219898 : if (STMT_VINFO_RELEVANT_P (stmt_info)
2203 : 66856 : && ! PURE_SLP_STMT (stmt_info))
2204 : 1016 : return opt_result::failure_at (phi,
2205 : : "not vectorized: relevant phi not "
2206 : : "supported: %G",
2207 : : static_cast <gimple *> (phi));
2208 : : }
2209 : :
2210 : 1833950 : for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2211 : 1405008 : gsi_next (&si))
2212 : : {
2213 : 1405094 : gimple *stmt = gsi_stmt (si);
2214 : 1405094 : if (!gimple_clobber_p (stmt)
2215 : 1405094 : && !is_gimple_debug (stmt))
2216 : : {
2217 : 1108505 : bool need_to_vectorize = false;
2218 : 1108505 : opt_result res
2219 : 1108505 : = vect_analyze_stmt (loop_vinfo,
2220 : : loop_vinfo->lookup_stmt (stmt),
2221 : : &need_to_vectorize,
2222 : : NULL, NULL, NULL);
2223 : 1108505 : if (!res)
2224 : 86 : return res;
2225 : : }
2226 : : }
2227 : : } /* bbs */
2228 : :
2229 : 104994 : return opt_result::success ();
2230 : : }
2231 : :
2232 : : /* Return true if we know that the iteration count is smaller than the
2233 : : vectorization factor. Return false if it isn't, or if we can't be sure
2234 : : either way. */
2235 : :
2236 : : static bool
2237 : 104412 : vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
2238 : : {
2239 : 104412 : unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2240 : :
2241 : 104412 : HOST_WIDE_INT max_niter;
2242 : 104412 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2243 : 49952 : max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
2244 : : else
2245 : 54460 : max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2246 : :
2247 : 104412 : if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
2248 : 8033 : return true;
2249 : :
2250 : : return false;
2251 : : }
2252 : :
2253 : : /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
2254 : : is worthwhile to vectorize. Return 1 if definitely yes, 0 if
2255 : : definitely no, or -1 if it's worth retrying. */
2256 : :
2257 : : static int
2258 : 104418 : vect_analyze_loop_costing (loop_vec_info loop_vinfo,
2259 : : unsigned *suggested_unroll_factor)
2260 : : {
2261 : 104418 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2262 : 104418 : unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2263 : :
2264 : : /* Only loops that can handle partially-populated vectors can have iteration
2265 : : counts less than the vectorization factor. */
2266 : 104418 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2267 : 104418 : && vect_known_niters_smaller_than_vf (loop_vinfo))
2268 : : {
2269 : 8025 : if (dump_enabled_p ())
2270 : 222 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2271 : : "not vectorized: iteration count smaller than "
2272 : : "vectorization factor.\n");
2273 : 8025 : return 0;
2274 : : }
2275 : :
2276 : :   /* If we know the number of iterations we can do better: for the
2277 : :      epilogue we can also decide whether the main loop leaves us
2278 : :      with enough iterations, preferring a smaller vector epilogue that is
2279 : :      then also possibly used for the case where we skip the vector loop.  */
2280 : 96393 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2281 : : {
2282 : 42224 : widest_int scalar_niters
2283 : 42224 : = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
2284 : 42224 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2285 : : {
2286 : 2575 : loop_vec_info orig_loop_vinfo
2287 : : = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2288 : 2575 : loop_vec_info main_loop_vinfo
2289 : : = LOOP_VINFO_MAIN_LOOP_INFO (loop_vinfo);
2290 : 2575 : unsigned lowest_vf
2291 : 2575 : = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
2292 : 2575 : int prolog_peeling = 0;
2293 : 2575 : if (!vect_use_loop_mask_for_alignment_p (main_loop_vinfo))
2294 : 2575 : prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (main_loop_vinfo);
2295 : 2575 : if (prolog_peeling >= 0
2296 : 2575 : && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
2297 : : lowest_vf))
2298 : : {
2299 : 5140 : unsigned gap
2300 : 2570 : = LOOP_VINFO_PEELING_FOR_GAPS (main_loop_vinfo) ? 1 : 0;
2301 : 5140 : scalar_niters = ((scalar_niters - gap - prolog_peeling)
2302 : 5140 : % lowest_vf + gap);
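     : :             /* For example, 103 scalar iterations with no prologue
     : :                peeling, no gap and a main-loop VF of 16 leave
     : :                103 % 16 == 7 iterations for this epilogue.  */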
2303 : : }
2304 : : }
2305 : : /* Reject vectorizing for a single scalar iteration, even if
2306 : : we could in principle implement that using partial vectors. */
2307 : 42224 : unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2308 : 42224 : if (scalar_niters <= peeling_gap + 1)
2309 : : {
2310 : 744 : if (dump_enabled_p ())
2311 : 162 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2312 : : "not vectorized: loop only has a single "
2313 : : "scalar iteration.\n");
2314 : 744 : return 0;
2315 : : }
2316 : :
2317 : 41480 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2318 : : {
2319 : : /* Check that the loop processes at least one full vector. */
2320 : 41471 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2321 : 41471 : if (known_lt (scalar_niters, vf))
2322 : : {
2323 : 354 : if (dump_enabled_p ())
2324 : 289 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2325 : : "loop does not have enough iterations "
2326 : : "to support vectorization.\n");
2327 : 394 : return 0;
2328 : : }
2329 : :
2330 : : /* If we need to peel an extra epilogue iteration to handle data
2331 : : accesses with gaps, check that there are enough scalar iterations
2332 : : available.
2333 : :
2334 : : The check above is redundant with this one when peeling for gaps,
2335 : : but the distinction is useful for diagnostics. */
2336 : 41117 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2337 : 41399 : && known_le (scalar_niters, vf))
2338 : : {
2339 : 40 : if (dump_enabled_p ())
2340 : 9 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2341 : : "loop does not have enough iterations "
2342 : : "to support peeling for gaps.\n");
2343 : 40 : return 0;
2344 : : }
2345 : : }
2346 : 42224 : }
2347 : :
2348 : :   /* If using the "very cheap" model, reject cases in which we'd keep
2349 : : a copy of the scalar code (even if we might be able to vectorize it). */
2350 : 95255 : if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2351 : 95255 : && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2352 : 47078 : || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)))
2353 : : {
2354 : 708 : if (dump_enabled_p ())
2355 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2356 : : "some scalar iterations would need to be peeled\n");
2357 : 708 : return 0;
2358 : : }
2359 : :
2360 : 94547 : int min_profitable_iters, min_profitable_estimate;
2361 : 94547 : vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2362 : : &min_profitable_estimate,
2363 : : suggested_unroll_factor);
2364 : :
2365 : 94547 : if (min_profitable_iters < 0)
2366 : : {
2367 : 24979 : if (dump_enabled_p ())
2368 : 18 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2369 : : "not vectorized: vectorization not profitable.\n");
2370 : 24979 : if (dump_enabled_p ())
2371 : 18 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2372 : : "not vectorized: vector version will never be "
2373 : : "profitable.\n");
2374 : 24979 : return -1;
2375 : : }
2376 : :
2377 : 69568 : int min_scalar_loop_bound = (param_min_vect_loop_bound
2378 : 69568 : * assumed_vf);
2379 : :
2380 : : /* Use the cost model only if it is more conservative than user specified
2381 : : threshold. */
2382 : 69568 : unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2383 : : min_profitable_iters);
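     : :   /* For example, with --param min-vect-loop-bound=2, an assumed VF of 4
     : :      and min_profitable_iters of 11, TH becomes MAX (2 * 4, 11) == 11, so
     : :      a loop known to run only 8 iterations is rejected below.  */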
2384 : :
2385 : 69568 : LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2386 : :
2387 : 35198 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2388 : 104766 : && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2389 : : {
2390 : 398 : if (dump_enabled_p ())
2391 : 1 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2392 : : "not vectorized: vectorization not profitable.\n");
2393 : 398 : if (dump_enabled_p ())
2394 : 1 : dump_printf_loc (MSG_NOTE, vect_location,
2395 : : "not vectorized: iteration count smaller than user "
2396 : : "specified loop bound parameter or minimum profitable "
2397 : : "iterations (whichever is more conservative).\n");
2398 : 398 : return 0;
2399 : : }
2400 : :
2401 : :   /* The static profitability threshold min_profitable_estimate includes
2402 : : the cost of having to check at runtime whether the scalar loop
2403 : : should be used instead. If it turns out that we don't need or want
2404 : : such a check, the threshold we should use for the static estimate
2405 : : is simply the point at which the vector loop becomes more profitable
2406 : : than the scalar loop. */
2407 : 69170 : if (min_profitable_estimate > min_profitable_iters
2408 : 14909 : && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
2409 : 14444 : && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2410 : 258 : && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2411 : 69428 : && !vect_apply_runtime_profitability_check_p (loop_vinfo))
2412 : : {
2413 : 6 : if (dump_enabled_p ())
2414 : 2 : dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
2415 : : " choice between the scalar and vector loops\n");
2416 : 6 : min_profitable_estimate = min_profitable_iters;
2417 : : }
2418 : :
2419 : : /* If the vector loop needs multiple iterations to be beneficial then
2420 : : things are probably too close to call, and the conservative thing
2421 : : would be to stick with the scalar code. */
2422 : 69170 : if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2423 : 69170 : && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
2424 : : {
2425 : 8052 : if (dump_enabled_p ())
2426 : 177 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2427 : : "one iteration of the vector loop would be"
2428 : : " more expensive than the equivalent number of"
2429 : : " iterations of the scalar loop\n");
2430 : 8052 : return 0;
2431 : : }
2432 : :
2433 : 61118 : HOST_WIDE_INT estimated_niter;
2434 : :
2435 : : /* If we are vectorizing an epilogue then we know the maximum number of
2436 : : scalar iterations it will cover is at least one lower than the
2437 : : vectorization factor of the main loop. */
2438 : 61118 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2439 : 10270 : estimated_niter
2440 : 10270 : = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
2441 : : else
2442 : : {
2443 : 50848 : estimated_niter = estimated_stmt_executions_int (loop);
2444 : 50848 : if (estimated_niter == -1)
2445 : 19374 : estimated_niter = likely_max_stmt_executions_int (loop);
2446 : : }
2447 : 29644 : if (estimated_niter != -1
2448 : 59501 : && ((unsigned HOST_WIDE_INT) estimated_niter
2449 : 59501 : < MAX (th, (unsigned) min_profitable_estimate)))
2450 : : {
2451 : 4213 : if (dump_enabled_p ())
2452 : 28 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2453 : : "not vectorized: estimated iteration count too "
2454 : : "small.\n");
2455 : 4213 : if (dump_enabled_p ())
2456 : 28 : dump_printf_loc (MSG_NOTE, vect_location,
2457 : : "not vectorized: estimated iteration count smaller "
2458 : : "than specified loop bound parameter or minimum "
2459 : : "profitable iterations (whichever is more "
2460 : : "conservative).\n");
2461 : 4213 : return -1;
2462 : : }
2463 : :
2464 : : return 1;
2465 : : }
2466 : :
2467 : : static opt_result
2468 : 217203 : vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
2469 : : vec<data_reference_p> *datarefs)
2470 : : {
2471 : 658206 : for (unsigned i = 0; i < loop->num_nodes; i++)
2472 : 973568 : for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2473 : 3650841 : !gsi_end_p (gsi); gsi_next (&gsi))
2474 : : {
2475 : 3209838 : gimple *stmt = gsi_stmt (gsi);
2476 : 3209838 : if (is_gimple_debug (stmt))
2477 : 1179333 : continue;
2478 : 2030639 : opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
2479 : : NULL, 0);
2480 : 2030639 : if (!res)
2481 : : {
2482 : 45915 : if (is_gimple_call (stmt) && loop->safelen)
2483 : : {
2484 : 400 : tree fndecl = gimple_call_fndecl (stmt), op;
2485 : 400 : if (fndecl == NULL_TREE
2486 : 400 : && gimple_call_internal_p (stmt, IFN_MASK_CALL))
2487 : : {
2488 : 0 : fndecl = gimple_call_arg (stmt, 0);
2489 : 0 : gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
2490 : 0 : fndecl = TREE_OPERAND (fndecl, 0);
2491 : 0 : gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
2492 : : }
2493 : 400 : if (fndecl != NULL_TREE)
2494 : : {
2495 : 366 : cgraph_node *node = cgraph_node::get (fndecl);
2496 : 366 : if (node != NULL && node->simd_clones != NULL)
2497 : : {
2498 : 135 : unsigned int j, n = gimple_call_num_args (stmt);
2499 : 557 : for (j = 0; j < n; j++)
2500 : : {
2501 : 288 : op = gimple_call_arg (stmt, j);
2502 : 288 : if (DECL_P (op)
2503 : 288 : || (REFERENCE_CLASS_P (op)
2504 : 0 : && get_base_address (op)))
2505 : : break;
2506 : : }
2507 : 135 : op = gimple_call_lhs (stmt);
2508 : : /* Ignore #pragma omp declare simd functions
2509 : : if they don't have data references in the
2510 : : call stmt itself. */
2511 : 269 : if (j == n
2512 : 135 : && !(op
2513 : 124 : && (DECL_P (op)
2514 : 124 : || (REFERENCE_CLASS_P (op)
2515 : 0 : && get_base_address (op)))))
2516 : 134 : continue;
2517 : : }
2518 : : }
2519 : : }
2520 : 45781 : return res;
2521 : : }
2522 : : /* If dependence analysis will give up due to the limit on the
2523 : :          number of datarefs, stop here and fail fatally.  */
2524 : 3475545 : if (datarefs->length ()
2525 : 1490821 : > (unsigned)param_loop_max_datarefs_for_datadeps)
2526 : 0 : return opt_result::failure_at (stmt, "exceeded param "
2527 : : "loop-max-datarefs-for-datadeps\n");
2528 : : }
2529 : 171422 : return opt_result::success ();
2530 : : }
2531 : :
2532 : : /* Look for SLP-only access groups and turn each individual access into its own
2533 : : group. */
2534 : : static void
2535 : 106390 : vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2536 : : {
2537 : 106390 : unsigned int i;
2538 : 106390 : struct data_reference *dr;
2539 : :
2540 : 106390 : DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2541 : :
2542 : 106390 : vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2543 : 476810 : FOR_EACH_VEC_ELT (datarefs, i, dr)
2544 : : {
2545 : 268449 : gcc_assert (DR_REF (dr));
2546 : 268449 : stmt_vec_info stmt_info
2547 : 268449 : = vect_stmt_to_vectorize (loop_vinfo->lookup_stmt (DR_STMT (dr)));
2548 : :
2549 : : /* Check if the load is a part of an interleaving chain. */
2550 : 268449 : if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2551 : : {
2552 : 81626 : stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2553 : 81626 : dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2554 : 81626 : unsigned int group_size = DR_GROUP_SIZE (first_element);
2555 : :
2556 : : /* Check if SLP-only groups. */
2557 : 81626 : if (!STMT_SLP_TYPE (stmt_info)
2558 : 162 : && STMT_VINFO_SLP_VECT_ONLY (first_element))
2559 : : {
2560 : : /* Dissolve the group. */
2561 : 12 : STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2562 : :
2563 : 12 : stmt_vec_info vinfo = first_element;
2564 : 30 : while (vinfo)
2565 : : {
2566 : 18 : stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2567 : 18 : DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2568 : 18 : DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2569 : 18 : DR_GROUP_SIZE (vinfo) = 1;
2570 : 18 : if (STMT_VINFO_STRIDED_P (first_element)
2571 : : /* We cannot handle stores with gaps. */
2572 : 12 : || DR_IS_WRITE (dr_info->dr))
2573 : : {
2574 : 6 : STMT_VINFO_STRIDED_P (vinfo) = true;
2575 : 6 : DR_GROUP_GAP (vinfo) = 0;
2576 : : }
2577 : : else
2578 : 12 : DR_GROUP_GAP (vinfo) = group_size - 1;
2579 : :               /* Duplicate and adjust the alignment info; it needs to
2580 : :                  be present on each group leader, see dr_misalignment.  */
2581 : 18 : if (vinfo != first_element)
2582 : : {
2583 : 6 : dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2584 : 6 : dr_info2->target_alignment = dr_info->target_alignment;
2585 : 6 : int misalignment = dr_info->misalignment;
2586 : 6 : if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2587 : : {
2588 : 0 : HOST_WIDE_INT diff
2589 : 0 : = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2590 : 0 : - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2591 : 0 : unsigned HOST_WIDE_INT align_c
2592 : 0 : = dr_info->target_alignment.to_constant ();
2593 : 0 : misalignment = (misalignment + diff) % align_c;
2594 : : }
2595 : 6 : dr_info2->misalignment = misalignment;
2596 : : }
2597 : : vinfo = next;
2598 : : }
2599 : : }
2600 : : }
2601 : : }
2602 : 106390 : }
2603 : :
2604 : : /* Determine if operating on full vectors for LOOP_VINFO might leave
2605 : : some scalar iterations still to do. If so, decide how we should
2606 : : handle those scalar iterations. The possibilities are:
2607 : :
2608 : : (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2609 : : In this case:
2610 : :
2611 : : LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2612 : : LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2613 : : LOOP_VINFO_PEELING_FOR_NITER == false
2614 : :
2615 : : (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2616 : : to handle the remaining scalar iterations. In this case:
2617 : :
2618 : : LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2619 : : LOOP_VINFO_PEELING_FOR_NITER == true
2620 : :
2621 : : There are two choices:
2622 : :
2623 : : (2a) Consider vectorizing the epilogue loop at the same VF as the
2624 : : main loop, but using partial vectors instead of full vectors.
2625 : : In this case:
2626 : :
2627 : : LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2628 : :
2629 : : (2b) Consider vectorizing the epilogue loop at lower VFs only.
2630 : : In this case:
2631 : :
2632 : : LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2633 : : */
2634 : :
2635 : : opt_result
2636 : 111511 : vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo)
2637 : : {
2638 : : /* Determine whether there would be any scalar iterations left over. */
2639 : 111511 : bool need_peeling_or_partial_vectors_p
2640 : 111511 : = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
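     : :   /* For example, 100 scalar iterations with a VF of 16 leave
     : :      100 % 16 == 4 iterations over, so either partial vectors (case (1)
     : :      above) or an epilogue loop (case (2)) is needed.  */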
2641 : :
2642 : : /* Decide whether to vectorize the loop with partial vectors. */
2643 : 111511 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2644 : 111511 : LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2645 : 111511 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2646 : 25 : && LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo))
2647 : 0 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2648 : 111511 : else if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2649 : 25 : && need_peeling_or_partial_vectors_p)
2650 : : {
2651 : : /* For partial-vector-usage=1, try to push the handling of partial
2652 : : vectors to the epilogue, with the main loop continuing to operate
2653 : : on full vectors.
2654 : :
2655 : :          If we are unrolling, we also do not want to use partial vectors.
2656 : :          This is to avoid the overhead of generating multiple masks and also
2657 : :          to avoid having to execute entire iterations of FALSE-masked
2658 : :          instructions when dealing with one or fewer full iterations.
2659 : :
2660 : : ??? We could then end up failing to use partial vectors if we
2661 : : decide to peel iterations into a prologue, and if the main loop
2662 : : then ends up processing fewer than VF iterations. */
2663 : 20 : if ((param_vect_partial_vector_usage == 1
2664 : 6 : || loop_vinfo->suggested_unroll_factor > 1)
2665 : 14 : && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2666 : 30 : && !vect_known_niters_smaller_than_vf (loop_vinfo))
2667 : 2 : LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2668 : : else
2669 : 18 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2670 : : }
2671 : :
2672 : 111511 : if (LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo)
2673 : 0 : && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2674 : 0 : return opt_result::failure_at (vect_location,
2675 : : "not vectorized: loop needs but cannot "
2676 : : "use partial vectors\n");
2677 : :
2678 : 111511 : if (dump_enabled_p ())
2679 : 12340 : dump_printf_loc (MSG_NOTE, vect_location,
2680 : : "operating on %s vectors%s.\n",
2681 : 12340 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2682 : : ? "partial" : "full",
2683 : 12340 : LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2684 : : ? " for epilogue loop" : "");
2685 : :
2686 : 111511 : LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2687 : 223022 : = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2688 : 111511 : && need_peeling_or_partial_vectors_p);
2689 : :
2690 : :   /* We set LOOP_VINFO_USING_SELECT_VL_P to true before the loop
2691 : :      vectorization analysis, when we do not yet know whether the loop will be
2692 : :      vectorized with partial vectors (for details see tree-vect-loop-manip.cc).
2693 : :
2694 : :      However, the SELECT_VL vectorization style should only be applied to
2695 : :      partial vectorization, since SELECT_VL is the GIMPLE IR that calculates
2696 : :      the number of elements to be processed in each iteration.
2697 : :
2698 : :      After the loop vectorization analysis, clear LOOP_VINFO_USING_SELECT_VL_P
2699 : :      if the loop is not vectorized with partial vectors.  */
2700 : 111511 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2701 : 111493 : LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
2702 : :
2703 : 111511 : return opt_result::success ();
2704 : : }
2705 : :
2706 : : /* Function vect_analyze_loop_2.
2707 : :
2708 : : Apply a set of analyses on LOOP specified by LOOP_VINFO, the different
2709 : : analyses will record information in some members of LOOP_VINFO. FATAL
2710 : : indicates if some analysis meets fatal error. If one non-NULL pointer
2711 : : SUGGESTED_UNROLL_FACTOR is provided, it's intent to be filled with one
2712 : : worked out suggested unroll factor, while one NULL pointer shows it's
2713 : : going to apply the suggested unroll factor. SLP_DONE_FOR_SUGGESTED_UF
2714 : : is to hold the slp decision when the suggested unroll factor is worked
2715 : : out. */
2716 : : static opt_result
2717 : 407076 : vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2718 : : unsigned *suggested_unroll_factor,
2719 : : unsigned& slp_done_for_suggested_uf)
2720 : : {
2721 : 407076 : opt_result ok = opt_result::success ();
2722 : 407076 : int res;
2723 : 407076 : unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2724 : 407076 : poly_uint64 min_vf = 2;
2725 : 407076 : loop_vec_info orig_loop_vinfo = NULL;
2726 : :
2727 : : /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2728 : : loop_vec_info of the first vectorized loop. */
2729 : 407076 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2730 : 16917 : orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2731 : : else
2732 : : orig_loop_vinfo = loop_vinfo;
2733 : 16917 : gcc_assert (orig_loop_vinfo);
2734 : :
2735 : : /* The first group of checks is independent of the vector size. */
2736 : 407076 : fatal = true;
2737 : :
2738 : 407076 : if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2739 : 407076 : && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2740 : 5 : return opt_result::failure_at (vect_location,
2741 : : "not vectorized: simd if(0)\n");
2742 : :
2743 : : /* Find all data references in the loop (which correspond to vdefs/vuses)
2744 : : and analyze their evolution in the loop. */
2745 : :
2746 : 407071 : loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2747 : :
2748 : : /* Gather the data references and count stmts in the loop. */
2749 : 407071 : if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2750 : : {
2751 : 217203 : opt_result res
2752 : 217203 : = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2753 : : &LOOP_VINFO_DATAREFS (loop_vinfo));
2754 : 217203 : if (!res)
2755 : : {
2756 : 45781 : if (dump_enabled_p ())
2757 : 1465 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2758 : : "not vectorized: loop contains function "
2759 : : "calls or data references that cannot "
2760 : : "be analyzed\n");
2761 : 45781 : return res;
2762 : : }
2763 : 171422 : loop_vinfo->shared->save_datarefs ();
2764 : : }
2765 : : else
2766 : 189868 : loop_vinfo->shared->check_datarefs ();
2767 : :
2768 : : /* Analyze the data references and also adjust the minimal
2769 : : vectorization factor according to the loads and stores. */
2770 : :
2771 : 361290 : ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2772 : 361290 : if (!ok)
2773 : : {
2774 : 50970 : if (dump_enabled_p ())
2775 : 970 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2776 : : "bad data references.\n");
2777 : 50970 : return ok;
2778 : : }
2779 : :
2780 : : /* Check if we are applying unroll factor now. */
2781 : 310320 : bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2782 : 310320 : gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2783 : :
2784 : :   /* If the SLP decision was false when the suggested unroll factor was
2785 : :      worked out, and we are now applying that unroll factor, we can simply
2786 : :      skip all SLP-related analyses this time.  */
2787 : 310320 : unsigned slp = !applying_suggested_uf ? 2 : slp_done_for_suggested_uf;
2788 : :
2789 : : /* Classify all cross-iteration scalar data-flow cycles.
2790 : : Cross-iteration cycles caused by virtual phis are analyzed separately. */
2791 : 310320 : vect_analyze_scalar_cycles (loop_vinfo, slp == 2);
2792 : :
2793 : 310320 : vect_pattern_recog (loop_vinfo);
2794 : :
2795 : 310320 : vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2796 : :
2797 : : /* Analyze the access patterns of the data-refs in the loop (consecutive,
2798 : : complex, etc.). FORNOW: Only handle consecutive access pattern. */
2799 : :
2800 : 310320 : ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2801 : 310320 : if (!ok)
2802 : : {
2803 : 6176 : if (dump_enabled_p ())
2804 : 260 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2805 : : "bad data access.\n");
2806 : 6176 : return ok;
2807 : : }
2808 : :
2809 : : /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2810 : :
2811 : 304144 : ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2812 : 304144 : if (!ok)
2813 : : {
2814 : 12721 : if (dump_enabled_p ())
2815 : 327 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2816 : : "unexpected pattern.\n");
2817 : 12721 : return ok;
2818 : : }
2819 : :
2820 : : /* In contrast, the rest of the analysis below depends on the vector size in some way, so failures are no longer fatal. */
2821 : 291423 : fatal = false;
2822 : :
2823 : : /* Analyze data dependences between the data-refs in the loop
2824 : : and adjust the maximum vectorization factor according to
2825 : : the dependences.
2826 : : FORNOW: fail at the first data dependence that we encounter. */
2827 : :
2828 : 291423 : ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2829 : 291423 : if (!ok)
2830 : : {
2831 : 14037 : if (dump_enabled_p ())
2832 : 368 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2833 : : "bad data dependence.\n");
2834 : 14037 : return ok;
2835 : : }
2836 : 277386 : if (max_vf != MAX_VECTORIZATION_FACTOR
2837 : 277386 : && maybe_lt (max_vf, min_vf))
2838 : 46 : return opt_result::failure_at (vect_location, "bad data dependence.\n");
2839 : 277340 : LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2840 : :
2841 : 277340 : ok = vect_determine_vectorization_factor (loop_vinfo);
2842 : 277340 : if (!ok)
2843 : : {
2844 : 56141 : if (dump_enabled_p ())
2845 : 798 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2846 : : "can't determine vectorization factor.\n");
2847 : 56141 : return ok;
2848 : : }
2849 : :
2850 : : /* Compute the scalar iteration cost. */
2851 : 221199 : vect_compute_single_scalar_iteration_cost (loop_vinfo);
2852 : :
2853 : 221199 : poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2854 : 221199 : bool saved_can_use_partial_vectors_p
2855 : : = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2856 : :
2857 : : /* This is the point where we can re-start analysis with single-lane
2858 : : SLP forced. */
2859 : 350061 : start_over:
2860 : :
2861 : : /* Check the SLP opportunities in the loop, analyze and build
2862 : : SLP trees. */
2863 : 700122 : ok = vect_analyze_slp (loop_vinfo, loop_vinfo->stmt_vec_infos.length (),
2864 : : slp == 1);
2865 : 350061 : if (!ok)
2866 : 0 : return ok;
2867 : :
2868 : : /* If there are any SLP instances mark them as pure_slp. */
2869 : 350061 : if (vect_make_slp_decision (loop_vinfo))
2870 : : {
2871 : : /* Find stmts that need to be both vectorized and SLPed. */
2872 : 344892 : vect_detect_hybrid_slp (loop_vinfo);
2873 : :
2874 : : /* Update the vectorization factor based on the SLP decision. */
2875 : 344892 : vect_update_vf_for_slp (loop_vinfo);
2876 : :
2877 : : /* Optimize the SLP graph with the vectorization factor fixed. */
2878 : 344892 : vect_optimize_slp (loop_vinfo);
2879 : :
2880 : : /* Gather the loads reachable from the SLP graph entries. */
2881 : 344892 : vect_gather_slp_loads (loop_vinfo);
2882 : : }
2883 : :
2884 : : /* We don't expect to have to roll back to anything other than an empty
2885 : : set of rgroups. */
2886 : 350061 : gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2887 : :
2888 : : /* Apply the suggested unrolling factor; this was determined by the backend
2889 : : during finish_cost the first time we ran the analysis for this
2890 : : vector mode. */
2891 : 350061 : if (applying_suggested_uf)
2892 : 28 : LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2893 : :
2894 : : /* Now the vectorization factor is final. */
2895 : 350061 : poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2896 : 350061 : gcc_assert (known_ne (vectorization_factor, 0U));
2897 : :
2898 : 350061 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2899 : : {
2900 : 13915 : dump_printf_loc (MSG_NOTE, vect_location,
2901 : : "vectorization_factor = ");
2902 : 13915 : dump_dec (MSG_NOTE, vectorization_factor);
2903 : 13915 : dump_printf (MSG_NOTE, ", niters = %wd\n",
2904 : 13915 : LOOP_VINFO_INT_NITERS (loop_vinfo));
2905 : : }
2906 : :
2907 : 350061 : if (max_vf != MAX_VECTORIZATION_FACTOR
2908 : 350061 : && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2909 : 1 : return opt_result::failure_at (vect_location, "bad data dependence.\n");
2910 : :
2911 : 350060 : loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2912 : :
2913 : : /* Analyze the alignment of the data-refs in the loop.
2914 : : Fail if a data reference is found that cannot be vectorized. */
2915 : :
2916 : 350060 : ok = vect_analyze_data_refs_alignment (loop_vinfo);
2917 : 350060 : if (!ok)
2918 : : {
2919 : 0 : if (dump_enabled_p ())
2920 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2921 : : "bad data alignment.\n");
2922 : 0 : return ok;
2923 : : }
2924 : :
2925 : : /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2926 : : It is important to call pruning after vect_analyze_data_ref_accesses,
2927 : : since we use grouping information gathered by interleaving analysis. */
2928 : 350060 : ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2929 : 350060 : if (!ok)
2930 : 16088 : return ok;
2931 : :
2932 : : /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2933 : : vectorization, since we do not want to add extra peeling or
2934 : : add versioning for alignment. */
2935 : 333972 : if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2936 : : /* This pass will decide on using loop versioning and/or loop peeling in
2937 : : order to enhance the alignment of data references in the loop. */
2938 : 319388 : ok = vect_enhance_data_refs_alignment (loop_vinfo);
2939 : 333972 : if (!ok)
2940 : 0 : return ok;
2941 : :
2942 : : /* Analyze operations in the SLP instances. We can't simply
2943 : : remove unsupported SLP instances as this makes the above
2944 : : SLP kind detection invalid and might also affect the VF. */
2945 : 333972 : if (! vect_slp_analyze_operations (loop_vinfo))
2946 : : {
2947 : 227582 : ok = opt_result::failure_at (vect_location,
2948 : : "unsupported SLP instances\n");
2949 : 227582 : goto again;
2950 : : }
2951 : :
2952 : : /* Dissolve SLP-only groups. */
2953 : 106390 : vect_dissolve_slp_only_groups (loop_vinfo);
2954 : :
2955 : : /* Scan all the remaining operations in the loop that we did not catch
2956 : : during SLP build and make sure we fail. */
2957 : 106390 : ok = vect_analyze_loop_operations (loop_vinfo);
2958 : 106390 : if (!ok)
2959 : : {
2960 : 1396 : ok = opt_result::failure_at (vect_location,
2961 : : "bad operation or unsupported loop bound\n");
2962 : 1396 : goto again;
2963 : : }
2964 : :
2965 : : /* For now, we don't expect to mix both masking and length approaches for one
2966 : : loop; disable it if both are recorded. */
2967 : 104994 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2968 : 23 : && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2969 : 105017 : && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2970 : : {
2971 : 0 : if (dump_enabled_p ())
2972 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2973 : : "can't vectorize a loop with partial vectors"
2974 : : " because we don't expect to mix different"
2975 : : " approaches with partial vectors for the"
2976 : : " same loop.\n");
2977 : 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2978 : : }
2979 : :
2980 : : /* If we still have the option of using partial vectors,
2981 : : check whether we can generate the necessary loop controls. */
2982 : 104994 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
2983 : : {
2984 : 23 : if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
2985 : : {
2986 : 23 : if (!vect_verify_full_masking (loop_vinfo)
2987 : 23 : && !vect_verify_full_masking_avx512 (loop_vinfo))
2988 : 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2989 : : }
2990 : : else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
2991 : 0 : if (!vect_verify_loop_lens (loop_vinfo))
2992 : 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2993 : : }
2994 : :
2995 : : /* If we're vectorizing a loop that uses length "controls" and
2996 : : can iterate more than once, we apply decrementing IV approach
2997 : : in loop control. */
2998 : 104994 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2999 : 23 : && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
3000 : 0 : && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
3001 : 104994 : && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3002 : 0 : && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
3003 : : LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
3004 : 0 : LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
3005 : :
3006 : : /* If a loop uses length controls and has a decrementing loop control IV,
3007 : : we will normally pass that IV through a MIN_EXPR to calculate the
3008 : : basis for the length controls. E.g. in a loop that processes one
3009 : : element per scalar iteration, the number of elements would be
3010 : : MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
3011 : :
3012 : : This MIN_EXPR approach allows us to use pointer IVs with an invariant
3013 : : step, since only the final iteration of the vector loop can have
3014 : : inactive lanes.
3015 : :
3016 : : However, some targets have a dedicated instruction for calculating the
3017 : : preferred length, given the total number of elements that still need to
3018 : : be processed. This is encapsulated in the SELECT_VL internal function.
3019 : :
3020 : : If the target supports SELECT_VL, we can use it instead of MIN_EXPR
3021 : : to determine the basis for the length controls. However, unlike the
3022 : : MIN_EXPR calculation, the SELECT_VL calculation can decide to make
3023 : : lanes inactive in any iteration of the vector loop, not just the last
3024 : : iteration. This SELECT_VL approach therefore requires us to use pointer
3025 : : IVs with variable steps.
3026 : :
3027 : : Once we've decided how many elements should be processed by one
3028 : : iteration of the vector loop, we need to populate the rgroup controls.
3029 : : If a loop has multiple rgroups, we need to make sure that those rgroups
3030 : : "line up" (that is, they must be consistent about which elements are
3031 : : active and which aren't). This is done by vect_adjust_loop_lens_control.
3032 : :
3033 : : In principle, it would be possible to use vect_adjust_loop_lens_control
3034 : : on either the result of a MIN_EXPR or the result of a SELECT_VL.
3035 : : However:
3036 : :
3037 : : (1) In practice, it only makes sense to use SELECT_VL when a vector
3038 : : operation will be controlled directly by the result. It is not
3039 : : worth using SELECT_VL if it would only be the input to other
3040 : : calculations.
3041 : :
3042 : : (2) If we use SELECT_VL for an rgroup that has N controls, each associated
3043 : : pointer IV will need N updates by a variable amount (N-1 updates
3044 : : within the iteration and 1 update to move to the next iteration).
3045 : :
3046 : : Because of this, we prefer to use the MIN_EXPR approach whenever there
3047 : : is more than one length control.
3048 : :
3049 : : In addition, SELECT_VL always operates to a granularity of 1 unit.
3050 : : If we wanted to use it to control an SLP operation on N consecutive
3051 : : elements, we would need to make the SELECT_VL inputs measure scalar
3052 : : iterations (rather than elements) and then multiply the SELECT_VL
3053 : : result by N. But using SELECT_VL this way is inefficient because
3054 : : of (1) above.
3055 : :
3056 : : Finally, we don't apply SELECT_VL on a single rgroup when both (1)
3057 : : and (2) below are satisfied:
3058 : :
3059 : : (1) LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
3060 : : (2) LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
3061 : :
3062 : : Since SELECT_VL (variable step) would make SCEV analysis fail and we
3063 : : would then lose the benefits of subsequent unroll optimizations, we
3064 : : prefer using the MIN_EXPR approach in this situation. */
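 : :
 : : /* A minimal scalar sketch of the two length-control styles described
 : : above (illustration only; n, a, VF, process and select_vl are
 : : assumptions made for the sketch, not names used by the vectorizer):
 : :
 : : // MIN_EXPR style: the step equals VF in every iteration except
 : : // possibly the last, so only the last iteration has inactive lanes.
 : : for (size_t i = 0; i < n; )
 : : {
 : : size_t len = (n - i) < VF ? (n - i) : VF; // MIN_EXPR <remaining, VF>
 : : process (a + i, len);
 : : i += len;
 : : }
 : :
 : : // SELECT_VL style: the target chooses the preferred length (<= VF)
 : : // each iteration, so pointer IVs need variable steps.
 : : for (size_t i = 0; i < n; )
 : : {
 : : size_t len = select_vl (n - i, VF); // models IFN_SELECT_VL
 : : process (a + i, len);
 : : i += len;
 : : }
 : : */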
3065 : 104994 : if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
3066 : : {
3067 : 0 : tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
3068 : 0 : if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
3069 : : OPTIMIZE_FOR_SPEED)
3070 : 0 : && LOOP_VINFO_LENS (loop_vinfo).length () == 1
3071 : 0 : && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1
3072 : 0 : && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3073 : : || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
3074 : 0 : LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
3075 : :
3076 : : /* If any of the SLP instances covers more than a single lane
3077 : : we cannot use .SELECT_VL at the moment, even if the number
3078 : : of lanes is uniform throughout the SLP graph. */
3079 : 0 : if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
3080 : 0 : for (slp_instance inst : LOOP_VINFO_SLP_INSTANCES (loop_vinfo))
3081 : 0 : if (SLP_TREE_LANES (SLP_INSTANCE_TREE (inst)) != 1
3082 : 0 : && !(SLP_INSTANCE_KIND (inst) == slp_inst_kind_store
3083 : 0 : && SLP_INSTANCE_TREE (inst)->ldst_lanes))
3084 : : {
3085 : 0 : LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
3086 : 0 : break;
3087 : : }
3088 : : }
3089 : :
3090 : : /* Decide whether this loop_vinfo should use partial vectors or peeling,
3091 : : assuming that the loop will be used as a main loop. We will redo
3092 : : this analysis later if we instead decide to use the loop as an
3093 : : epilogue loop. */
3094 : 104994 : ok = vect_determine_partial_vectors_and_peeling (loop_vinfo);
3095 : 104994 : if (!ok)
3096 : 0 : return ok;
3097 : :
3098 : : /* If we're vectorizing an epilogue loop, the vectorized loop either needs
3099 : : to be able to handle fewer than VF scalars, or needs to have a lower VF
3100 : : than the main loop. */
3101 : 104994 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3102 : 11778 : && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3103 : : {
3104 : 11776 : poly_uint64 unscaled_vf
3105 : 11776 : = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
3106 : : orig_loop_vinfo->suggested_unroll_factor);
3107 : 11776 : if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
3108 : 105 : return opt_result::failure_at (vect_location,
3109 : : "Vectorization factor too high for"
3110 : : " epilogue loop.\n");
3111 : : }
3112 : :
3113 : : /* If the epilogue needs peeling for gaps but the main loop doesn't,
3114 : : give up on the epilogue. */
3115 : 104889 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3116 : 11673 : && LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3117 : 58 : && (LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo)
3118 : : != LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)))
3119 : 4 : return opt_result::failure_at (vect_location,
3120 : : "Epilogue loop requires peeling for gaps "
3121 : : "but main loop does not.\n");
3122 : :
3123 : : /* If an epilogue loop is required, make sure we can create one. */
3124 : 104885 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3125 : 103687 : || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
3126 : 31346 : || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3127 : : {
3128 : 74570 : if (dump_enabled_p ())
3129 : 4843 : dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
3130 : 74570 : if (!vect_can_advance_ivs_p (loop_vinfo)
3131 : 148673 : || !slpeel_can_duplicate_loop_p (loop,
3132 : : LOOP_VINFO_IV_EXIT (loop_vinfo),
3133 : 74103 : LOOP_VINFO_IV_EXIT (loop_vinfo)))
3134 : : {
3135 : 467 : ok = opt_result::failure_at (vect_location,
3136 : : "not vectorized: can't create required "
3137 : : "epilog loop\n");
3138 : 467 : goto again;
3139 : : }
3140 : : }
3141 : :
3142 : : /* Check that the costings of the loop make vectorizing worthwhile. */
3143 : 104418 : res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
3144 : 104418 : if (res < 0)
3145 : : {
3146 : 29192 : ok = opt_result::failure_at (vect_location,
3147 : : "Loop costings may not be worthwhile.\n");
3148 : 29192 : goto again;
3149 : : }
3150 : 75226 : if (!res)
3151 : 18321 : return opt_result::failure_at (vect_location,
3152 : : "Loop costings not worthwhile.\n");
3153 : :
3154 : : /* During peeling, we need to check whether the number of loop iterations
3155 : : is enough for both the peeled prolog loop and the vector loop. This
3156 : : check can be merged with the threshold check of loop versioning, so
3157 : : increase the threshold for this case if necessary.
3158 : :
3159 : : If we are analyzing an epilogue we still want to check what its
3160 : : versioning threshold would be. If we decide to vectorize the epilogues we
3161 : : will want to use the lowest versioning threshold of all epilogues and main
3162 : : loop. This will enable us to enter a vectorized epilogue even when
3163 : : versioning the loop. We can't simply check whether the epilogue requires
3164 : : versioning though since we may have skipped some versioning checks when
3165 : : analyzing the epilogue. For instance, checks for alias versioning will be
3166 : : skipped when dealing with epilogues as we assume we already checked them
3167 : : for the main loop. So instead we always check the 'orig_loop_vinfo'. */
3168 : 56905 : if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
3169 : : {
3170 : 5481 : poly_uint64 niters_th = 0;
3171 : 5481 : unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3172 : :
3173 : 5481 : if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
3174 : : {
3175 : : /* Niters for peeled prolog loop. */
3176 : 5481 : if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3177 : : {
3178 : 119 : dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
3179 : 119 : tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
3180 : 119 : niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
3181 : : }
3182 : : else
3183 : 5362 : niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3184 : : }
3185 : :
3186 : : /* Niters for at least one iteration of vectorized loop. */
3187 : 5481 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3188 : 5481 : niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3189 : : /* One additional iteration because of peeling for gap. */
3190 : 5481 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3191 : 55 : niters_th += 1;
3192 : :
3193 : : /* Use the same condition as vect_transform_loop to decide when to use
3194 : : the cost to determine a versioning threshold. */
3195 : 5481 : if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3196 : 5481 : && ordered_p (th, niters_th))
3197 : 3722 : niters_th = ordered_max (poly_uint64 (th), niters_th);
3198 : :
3199 : 5481 : LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
3200 : : }
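 : :
 : : /* A worked example of the threshold computed above, with hypothetical
 : : numbers chosen only for illustration: with a peeling-for-alignment
 : : count of 3, a vectorization factor of 8 and peeling for gaps required,
 : :
 : : niters_th = 3 (prolog) + 8 (one vector iteration) + 1 (gap) = 12
 : :
 : : and if the runtime profitability threshold TH is larger, say 16, the
 : : recorded versioning threshold becomes max (16, 12) = 16. */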
3201 : :
3202 : 56905 : gcc_assert (known_eq (vectorization_factor,
3203 : : LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
3204 : :
3205 : 56905 : slp_done_for_suggested_uf = slp;
3206 : :
3207 : : /* Ok to vectorize! */
3208 : 56905 : LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
3209 : 56905 : return opt_result::success ();
3210 : :
3211 : 258637 : again:
3212 : : /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
3213 : 258637 : gcc_assert (!ok);
3214 : :
3215 : : /* Try again with single-lane SLP. */
3216 : 258637 : if (slp == 1)
3217 : 128381 : return ok;
3218 : :
3219 : : /* If we are applying suggested unroll factor, we don't need to
3220 : : re-try any more as we want to keep the SLP mode fixed. */
3221 : 130256 : if (applying_suggested_uf)
3222 : 4 : return ok;
3223 : :
3224 : : /* If there are reduction chains re-trying will fail anyway. */
3225 : 130252 : if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
3226 : 205 : return ok;
3227 : :
3228 : : /* Likewise if the grouped loads or stores in the SLP cannot be handled
3229 : : via interleaving or lane instructions. */
3230 : : slp_instance instance;
3231 : : slp_tree node;
3232 : : unsigned i, j;
3233 : 490740 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
3234 : : {
3235 : 361878 : if (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance)) != vect_internal_def)
3236 : 0 : continue;
3237 : :
3238 : 361878 : stmt_vec_info vinfo;
3239 : 361878 : vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
3240 : 361878 : if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
3241 : 359093 : continue;
3242 : 2785 : vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3243 : 2785 : unsigned int size = DR_GROUP_SIZE (vinfo);
3244 : 2785 : tree vectype = STMT_VINFO_VECTYPE (vinfo);
3245 : 2785 : if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST
3246 : 4749 : && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
3247 : 5435 : && ! vect_grouped_store_supported (vectype, size))
3248 : 686 : return opt_result::failure_at (vinfo->stmt,
3249 : : "unsupported grouped store\n");
3250 : 364254 : FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
3251 : : {
3252 : 2303 : vinfo = SLP_TREE_REPRESENTATIVE (node);
3253 : 2303 : if (STMT_VINFO_GROUPED_ACCESS (vinfo))
3254 : : {
3255 : 1972 : vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3256 : 1972 : bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
3257 : 1972 : size = DR_GROUP_SIZE (vinfo);
3258 : 1972 : vectype = STMT_VINFO_VECTYPE (vinfo);
3259 : 1972 : if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
3260 : 1972 : && ! vect_grouped_load_supported (vectype, single_element_p,
3261 : : size))
3262 : 499 : return opt_result::failure_at (vinfo->stmt,
3263 : : "unsupported grouped load\n");
3264 : : }
3265 : : }
3266 : : }
3267 : :
3268 : : /* Roll back state appropriately. Force single-lane SLP this time. */
3269 : 128862 : slp = 1;
3270 : 128862 : if (dump_enabled_p ())
3271 : 3682 : dump_printf_loc (MSG_NOTE, vect_location,
3272 : : "re-trying with single-lane SLP\n");
3273 : :
3274 : : /* Restore the vectorization factor as it was without SLP. */
3275 : 128862 : LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
3276 : : /* Free the SLP instances. */
3277 : 489548 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
3278 : 360686 : vect_free_slp_instance (instance);
3279 : 128862 : LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
3280 : : /* Reset SLP type to loop_vect on all stmts. */
3281 : 491803 : for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
3282 : : {
3283 : 362941 : basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3284 : 362941 : for (gimple_stmt_iterator si = gsi_start_phis (bb);
3285 : 664174 : !gsi_end_p (si); gsi_next (&si))
3286 : : {
3287 : 301233 : stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3288 : 301233 : STMT_SLP_TYPE (stmt_info) = loop_vect;
3289 : 301233 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
3290 : 301233 : || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3291 : : {
3292 : : /* vectorizable_reduction adjusts reduction stmt def-types;
3293 : : restore them to that of the PHI. */
3294 : 18098 : STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
3295 : 18098 : = STMT_VINFO_DEF_TYPE (stmt_info);
3296 : 18098 : STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
3297 : : (STMT_VINFO_REDUC_DEF (stmt_info)))
3298 : 18098 : = STMT_VINFO_DEF_TYPE (stmt_info);
3299 : : }
3300 : : }
3301 : 725882 : for (gimple_stmt_iterator si = gsi_start_bb (bb);
3302 : 2206041 : !gsi_end_p (si); gsi_next (&si))
3303 : : {
3304 : 1843100 : if (is_gimple_debug (gsi_stmt (si)))
3305 : 651994 : continue;
3306 : 1191106 : stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3307 : 1191106 : STMT_SLP_TYPE (stmt_info) = loop_vect;
3308 : 1191106 : if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3309 : : {
3310 : 216291 : stmt_vec_info pattern_stmt_info
3311 : : = STMT_VINFO_RELATED_STMT (stmt_info);
3312 : 216291 : if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
3313 : 0 : STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
3314 : :
3315 : 216291 : gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
3316 : 216291 : STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
3317 : 216291 : for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
3318 : 440356 : !gsi_end_p (pi); gsi_next (&pi))
3319 : 224065 : STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
3320 : 224065 : = loop_vect;
3321 : : }
3322 : : }
3323 : : }
3324 : : /* Free optimized alias test DDRS. */
3325 : 128862 : LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
3326 : 128862 : LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
3327 : 128862 : LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
3328 : : /* Reset target cost data. */
3329 : 128862 : delete loop_vinfo->vector_costs;
3330 : 128862 : loop_vinfo->vector_costs = nullptr;
3331 : : /* Reset accumulated rgroup information. */
3332 : 128862 : LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
3333 : 128862 : release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
3334 : 128862 : release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
3335 : : /* Reset assorted flags. */
3336 : 128862 : LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
3337 : 128862 : LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
3338 : 128862 : LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
3339 : 128862 : LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
3340 : 128862 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3341 : 128862 : = saved_can_use_partial_vectors_p;
3342 : 128862 : LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3343 : 128862 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
3344 : 128862 : if (loop_vinfo->scan_map)
3345 : 122 : loop_vinfo->scan_map->empty ();
3346 : :
3347 : 128862 : goto start_over;
3348 : : }
3349 : :
3350 : : /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
3351 : : to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
3352 : : OLD_LOOP_VINFO is better unless something specifically indicates
3353 : : otherwise.
3354 : :
3355 : : Note that this deliberately isn't a partial order. */
3356 : :
3357 : : static bool
3358 : 0 : vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
3359 : : loop_vec_info old_loop_vinfo)
3360 : : {
3361 : 0 : struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
3362 : 0 : gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
3363 : :
3364 : 0 : poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
3365 : 0 : poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
3366 : :
3367 : : /* Always prefer a VF of loop->simdlen over any other VF. */
3368 : 0 : if (loop->simdlen)
3369 : : {
3370 : 0 : bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
3371 : 0 : bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
3372 : 0 : if (new_simdlen_p != old_simdlen_p)
3373 : : return new_simdlen_p;
3374 : : }
3375 : :
3376 : 0 : const auto *old_costs = old_loop_vinfo->vector_costs;
3377 : 0 : const auto *new_costs = new_loop_vinfo->vector_costs;
3378 : 0 : if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
3379 : 0 : return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
3380 : :
3381 : 0 : return new_costs->better_main_loop_than_p (old_costs);
3382 : : }
3383 : :
3384 : : /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
3385 : : true if we should. */
3386 : :
3387 : : static bool
3388 : 0 : vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
3389 : : loop_vec_info old_loop_vinfo)
3390 : : {
3391 : 0 : if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
3392 : : return false;
3393 : :
3394 : 0 : if (dump_enabled_p ())
3395 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
3396 : : "***** Preferring vector mode %s to vector mode %s\n",
3397 : 0 : GET_MODE_NAME (new_loop_vinfo->vector_mode),
3398 : 0 : GET_MODE_NAME (old_loop_vinfo->vector_mode));
3399 : : return true;
3400 : : }
3401 : :
3402 : : /* Analyze LOOP with VECTOR_MODES[MODE_I], and as an epilogue if ORIG_LOOP_VINFO
3403 : : is not NULL. Set AUTODETECTED_VECTOR_MODE if it is still VOIDmode, and
3404 : : advance MODE_I to the next mode useful to analyze.
3405 : : Return the loop_vinfo on success and a wrapped null on failure. */
3406 : :
3407 : : static opt_loop_vec_info
3408 : 407048 : vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
3409 : : const vect_loop_form_info *loop_form_info,
3410 : : loop_vec_info orig_loop_vinfo,
3411 : : const vector_modes &vector_modes, unsigned &mode_i,
3412 : : machine_mode &autodetected_vector_mode,
3413 : : bool &fatal)
3414 : : {
3415 : 407048 : loop_vec_info loop_vinfo
3416 : 407048 : = vect_create_loop_vinfo (loop, shared, loop_form_info, orig_loop_vinfo);
3417 : :
3418 : 407048 : machine_mode vector_mode = vector_modes[mode_i];
3419 : 407048 : loop_vinfo->vector_mode = vector_mode;
3420 : 407048 : unsigned int suggested_unroll_factor = 1;
3421 : 407048 : unsigned slp_done_for_suggested_uf = 0;
3422 : :
3423 : : /* Run the main analysis. */
3424 : 407048 : opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
3425 : : &suggested_unroll_factor,
3426 : : slp_done_for_suggested_uf);
3427 : 407048 : if (dump_enabled_p ())
3428 : 18802 : dump_printf_loc (MSG_NOTE, vect_location,
3429 : : "***** Analysis %s with vector mode %s\n",
3430 : 18802 : res ? "succeeded" : "failed",
3431 : 18802 : GET_MODE_NAME (loop_vinfo->vector_mode));
3432 : :
3433 : 407048 : auto user_unroll = LOOP_VINFO_LOOP (loop_vinfo)->unroll;
3434 : 407048 : if (res && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3435 : : /* Check to see if the user wants to unroll or if the target wants to. */
3436 : 457414 : && (suggested_unroll_factor > 1 || user_unroll > 1))
3437 : : {
3438 : 40 : if (suggested_unroll_factor == 1)
3439 : : {
3440 : 40 : int assumed_vf = vect_vf_for_cost (loop_vinfo);
3441 : 40 : suggested_unroll_factor = user_unroll / assumed_vf;
3442 : 40 : if (suggested_unroll_factor > 1)
3443 : : {
3444 : 28 : if (dump_enabled_p ())
3445 : 20 : dump_printf_loc (MSG_NOTE, vect_location,
3446 : : "setting unroll factor to %d based on user requested "
3447 : : "unroll factor %d and suggested vectorization "
3448 : : "factor: %d\n",
3449 : : suggested_unroll_factor, user_unroll, assumed_vf);
3450 : : }
3451 : : }
3452 : :
3453 : 40 : if (suggested_unroll_factor > 1)
3454 : : {
3455 : 28 : if (dump_enabled_p ())
3456 : 20 : dump_printf_loc (MSG_NOTE, vect_location,
3457 : : "***** Re-trying analysis for unrolling"
3458 : : " with unroll factor %d and slp %s.\n",
3459 : : suggested_unroll_factor,
3460 : 20 : slp_done_for_suggested_uf ? "on" : "off");
3461 : 28 : loop_vec_info unroll_vinfo
3462 : 28 : = vect_create_loop_vinfo (loop, shared, loop_form_info, NULL);
3463 : 28 : unroll_vinfo->vector_mode = vector_mode;
3464 : 28 : unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
3465 : 28 : opt_result new_res
3466 : 28 : = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
3467 : : slp_done_for_suggested_uf);
3468 : 28 : if (new_res)
3469 : : {
3470 : 22 : delete loop_vinfo;
3471 : 22 : loop_vinfo = unroll_vinfo;
3472 : 22 : LOOP_VINFO_USER_UNROLL (loop_vinfo) = user_unroll > 1;
3473 : : }
3474 : : else
3475 : 6 : delete unroll_vinfo;
3476 : : }
3477 : : }
3478 : :
3479 : : /* Remember the autodetected vector mode. */
3480 : 407048 : if (vector_mode == VOIDmode)
3481 : 208864 : autodetected_vector_mode = loop_vinfo->vector_mode;
3482 : :
3483 : : /* Advance mode_i, first skipping modes that would result in the
3484 : : same analysis result. */
3485 : 1848784 : while (mode_i + 1 < vector_modes.length ()
3486 : 1294455 : && vect_chooses_same_modes_p (loop_vinfo,
3487 : 573587 : vector_modes[mode_i + 1]))
3488 : : {
3489 : 313820 : if (dump_enabled_p ())
3490 : 15489 : dump_printf_loc (MSG_NOTE, vect_location,
3491 : : "***** The result for vector mode %s would"
3492 : : " be the same\n",
3493 : 15489 : GET_MODE_NAME (vector_modes[mode_i + 1]));
3494 : 313820 : mode_i += 1;
3495 : : }
3496 : 407048 : if (mode_i + 1 < vector_modes.length ()
3497 : 666815 : && vect_chooses_same_modes_p (autodetected_vector_mode,
3498 : 259767 : vector_modes[mode_i + 1]))
3499 : : {
3500 : 341 : if (dump_enabled_p ())
3501 : 6 : dump_printf_loc (MSG_NOTE, vect_location,
3502 : : "***** Skipping vector mode %s, which would"
3503 : : " repeat the analysis for %s\n",
3504 : 6 : GET_MODE_NAME (vector_modes[mode_i + 1]),
3505 : 6 : GET_MODE_NAME (autodetected_vector_mode));
3506 : 341 : mode_i += 1;
3507 : : }
3508 : 407048 : mode_i++;
3509 : :
3510 : 407048 : if (!res)
3511 : : {
3512 : 350165 : delete loop_vinfo;
3513 : 350165 : if (fatal)
3514 : 64447 : gcc_checking_assert (orig_loop_vinfo == NULL);
3515 : 350165 : return opt_loop_vec_info::propagate_failure (res);
3516 : : }
3517 : :
3518 : 56883 : return opt_loop_vec_info::success (loop_vinfo);
3519 : : }
3520 : :
3521 : : /* Function vect_analyze_loop.
3522 : :
3523 : : Apply a set of analyses on LOOP, and create a loop_vec_info struct
3524 : : for it. The different analyses will record information in the
3525 : : loop_vec_info struct. */
3526 : : opt_loop_vec_info
3527 : 466832 : vect_analyze_loop (class loop *loop, gimple *loop_vectorized_call,
3528 : : vec_info_shared *shared)
3529 : : {
3530 : 466832 : DUMP_VECT_SCOPE ("analyze_loop_nest");
3531 : :
3532 : 466832 : if (loop_outer (loop)
3533 : 466832 : && loop_vec_info_for_loop (loop_outer (loop))
3534 : 467245 : && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
3535 : 413 : return opt_loop_vec_info::failure_at (vect_location,
3536 : : "outer-loop already vectorized.\n");
3537 : :
3538 : 466419 : if (!find_loop_nest (loop, &shared->loop_nest))
3539 : 21936 : return opt_loop_vec_info::failure_at
3540 : 21936 : (vect_location,
3541 : : "not vectorized: loop nest containing two or more consecutive inner"
3542 : : " loops cannot be vectorized\n");
3543 : :
3544 : : /* Analyze the loop form. */
3545 : 444483 : vect_loop_form_info loop_form_info;
3546 : 444483 : opt_result res = vect_analyze_loop_form (loop, loop_vectorized_call,
3547 : : &loop_form_info);
3548 : 444483 : if (!res)
3549 : : {
3550 : 235619 : if (dump_enabled_p ())
3551 : 1622 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3552 : : "bad loop form.\n");
3553 : 235619 : return opt_loop_vec_info::propagate_failure (res);
3554 : : }
3555 : 208864 : if (!integer_onep (loop_form_info.assumptions))
3556 : : {
3557 : : /* We consider vectorizing this loop by versioning it under
3558 : : some assumptions. In order to do this, we need to clear
3559 : : existing information computed by scev and niter analyzer. */
3560 : 10114 : scev_reset_htab ();
3561 : 10114 : free_numbers_of_iterations_estimates (loop);
3562 : : /* Also set flag for this loop so that following scev and niter
3563 : : analysis are done under the assumptions. */
3564 : 10114 : loop_constraint_set (loop, LOOP_C_FINITE);
3565 : : }
3566 : : else
3567 : : /* Clear the existing niter information to make sure the nonwrapping flag
3568 : : will be calculated and set appropriately. */
3569 : 198750 : free_numbers_of_iterations_estimates (loop);
3570 : :
3571 : 208864 : auto_vector_modes vector_modes;
3572 : : /* Autodetect first vector size we try. */
3573 : 208864 : vector_modes.safe_push (VOIDmode);
3574 : 208864 : unsigned int autovec_flags
3575 : 417728 : = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3576 : 208864 : loop->simdlen != 0);
3577 : 208864 : bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3578 : 208864 : && !unlimited_cost_model (loop));
3579 : 208864 : machine_mode autodetected_vector_mode = VOIDmode;
3580 : 208864 : opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3581 : 208864 : unsigned int mode_i = 0;
3582 : 208864 : unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3583 : :
3584 : : /* Keep track of the VF for each mode. Initialize all to 0 which indicates
3585 : : a mode has not been analyzed. */
3586 : 208864 : auto_vec<poly_uint64, 8> cached_vf_per_mode;
3587 : 2104794 : for (unsigned i = 0; i < vector_modes.length (); ++i)
3588 : 843533 : cached_vf_per_mode.safe_push (0);
3589 : :
3590 : : /* First determine the main loop vectorization mode, either the first
3591 : : one that works, starting with auto-detecting the vector mode and then
3592 : : following the target's order of preference, or the one with the
3593 : : lowest cost if pick_lowest_cost_p. */
3594 : 571398 : while (1)
3595 : : {
3596 : 390131 : bool fatal;
3597 : 390131 : unsigned int last_mode_i = mode_i;
3598 : : /* Set cached VF to -1 prior to analysis, which indicates a mode has
3599 : : failed. */
3600 : 390131 : cached_vf_per_mode[last_mode_i] = -1;
3601 : 390131 : opt_loop_vec_info loop_vinfo
3602 : 390131 : = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3603 : : NULL, vector_modes, mode_i,
3604 : : autodetected_vector_mode, fatal);
3605 : 390131 : if (fatal)
3606 : : break;
3607 : :
3608 : 325684 : if (loop_vinfo)
3609 : : {
3610 : : /* Analysis has been successful, so update the VF value. The
3611 : : VF should always be a multiple of unroll_factor and we want to
3612 : : capture the original VF here. */
3613 : 50366 : cached_vf_per_mode[last_mode_i]
3614 : 50366 : = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3615 : 50366 : loop_vinfo->suggested_unroll_factor);
3616 : : /* Once we hit the desired simdlen for the first time,
3617 : : discard any previous attempts. */
3618 : 50366 : if (simdlen
3619 : 50366 : && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3620 : : {
3621 : 47 : delete first_loop_vinfo;
3622 : : first_loop_vinfo = opt_loop_vec_info::success (NULL);
3623 : : simdlen = 0;
3624 : : }
3625 : 50319 : else if (pick_lowest_cost_p
3626 : 0 : && first_loop_vinfo
3627 : 50319 : && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3628 : : {
3629 : : /* Pick loop_vinfo over first_loop_vinfo. */
3630 : 0 : delete first_loop_vinfo;
3631 : 0 : first_loop_vinfo = opt_loop_vec_info::success (NULL);
3632 : : }
3633 : 50366 : if (first_loop_vinfo == NULL)
3634 : : first_loop_vinfo = loop_vinfo;
3635 : : else
3636 : : {
3637 : 2 : delete loop_vinfo;
3638 : 2 : loop_vinfo = opt_loop_vec_info::success (NULL);
3639 : : }
3640 : :
3641 : : /* Commit to first_loop_vinfo if we have no reason to try
3642 : : alternatives. */
3643 : 50366 : if (!simdlen && !pick_lowest_cost_p)
3644 : : break;
3645 : : }
3646 : 275327 : if (mode_i == vector_modes.length ()
3647 : 275327 : || autodetected_vector_mode == VOIDmode)
3648 : : break;
3649 : :
3650 : : /* Try the next biggest vector size. */
3651 : 181267 : if (dump_enabled_p ())
3652 : 3757 : dump_printf_loc (MSG_NOTE, vect_location,
3653 : : "***** Re-trying analysis with vector mode %s\n",
3654 : 3757 : GET_MODE_NAME (vector_modes[mode_i]));
3655 : 181267 : }
3656 : 208864 : if (!first_loop_vinfo)
3657 : 158505 : return opt_loop_vec_info::propagate_failure (res);
3658 : :
3659 : 50359 : if (dump_enabled_p ())
3660 : 8753 : dump_printf_loc (MSG_NOTE, vect_location,
3661 : : "***** Choosing vector mode %s\n",
3662 : 8753 : GET_MODE_NAME (first_loop_vinfo->vector_mode));
3663 : :
3664 : : /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3665 : : enabled, SIMDUID is not set, it is the innermost loop and we have
3666 : : either already found the loop's SIMDLEN or there was no SIMDLEN to
3667 : : begin with.
3668 : : TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3669 : 50359 : bool vect_epilogues = (!simdlen
3670 : 50357 : && loop->inner == NULL
3671 : 49930 : && param_vect_epilogues_nomask
3672 : 48895 : && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3673 : : /* No code motion support for multiple epilogues so for now
3674 : : not supported when multiple exits. */
3675 : 24300 : && !LOOP_VINFO_EARLY_BREAKS (first_loop_vinfo)
3676 : 23894 : && !loop->simduid
3677 : 72843 : && loop_cost_model (loop) > VECT_COST_MODEL_VERY_CHEAP);
3678 : 50359 : if (!vect_epilogues)
3679 : 38227 : return first_loop_vinfo;
3680 : :
3681 : : /* Now analyze first_loop_vinfo for epilogue vectorization. */
3682 : :
3683 : : /* For epilogues start the analysis from the first mode. The motivation
3684 : : behind starting from the beginning comes from cases where the VECTOR_MODES
3685 : : array may contain length-agnostic and length-specific modes. Their
3686 : : ordering is not guaranteed, so we could end up picking a mode for the main
3687 : : loop that is after the epilogue's optimal mode. */
3688 : 12132 : if (!unlimited_cost_model (loop)
3689 : 12132 : && first_loop_vinfo->vector_costs->suggested_epilogue_mode () != VOIDmode)
3690 : : {
3691 : 0 : vector_modes[0]
3692 : 0 : = first_loop_vinfo->vector_costs->suggested_epilogue_mode ();
3693 : 0 : cached_vf_per_mode[0] = 0;
3694 : : }
3695 : : else
3696 : 12132 : vector_modes[0] = autodetected_vector_mode;
3697 : 12132 : mode_i = 0;
3698 : :
3699 : 12132 : bool supports_partial_vectors = param_vect_partial_vector_usage != 0;
3700 : 12132 : machine_mode mask_mode;
3701 : 12132 : if (supports_partial_vectors
3702 : 14 : && !partial_vectors_supported_p ()
3703 : 12146 : && !(VECTOR_MODE_P (first_loop_vinfo->vector_mode)
3704 : 14 : && targetm.vectorize.get_mask_mode
3705 : 12136 : (first_loop_vinfo->vector_mode).exists (&mask_mode)
3706 : 14 : && SCALAR_INT_MODE_P (mask_mode)))
3707 : 10 : supports_partial_vectors = false;
3708 : 12132 : poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3709 : :
3710 : 12132 : loop_vec_info orig_loop_vinfo = first_loop_vinfo;
3711 : 42119 : do
3712 : : {
3713 : 42040 : while (1)
3714 : : {
3715 : : /* If the target does not support partial vectors we can shorten the
3716 : : number of modes to analyze for the epilogue as we know we can't
3717 : : pick a mode that would lead to a VF at least as big as the
3718 : : FIRST_VINFO_VF. */
3719 : 55099 : if (!supports_partial_vectors
3720 : 42040 : && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3721 : : {
3722 : 13081 : mode_i++;
3723 : 26162 : if (mode_i == vector_modes.length ())
3724 : : break;
3725 : 25101 : continue;
3726 : : }
3727 : : /* We would need an exhaustive search to find all modes we
3728 : : skipped, but that would lead to the same result as the
3729 : : analysis they were skipped for, which we could check
3730 : : cached_vf_per_mode against.
3731 : : Check for the autodetected mode, which is the common
3732 : : situation on x86 which does not perform cost comparison. */
3733 : 41001 : if (!supports_partial_vectors
3734 : 28953 : && maybe_ge (cached_vf_per_mode[0], first_vinfo_vf)
3735 : 57703 : && vect_chooses_same_modes_p (autodetected_vector_mode,
3736 : 28744 : vector_modes[mode_i]))
3737 : : {
3738 : 12042 : mode_i++;
3739 : 24084 : if (mode_i == vector_modes.length ())
3740 : : break;
3741 : 12042 : continue;
3742 : : }
3743 : :
3744 : 16917 : if (dump_enabled_p ())
3745 : 2981 : dump_printf_loc (MSG_NOTE, vect_location,
3746 : : "***** Re-trying epilogue analysis with vector "
3747 : 2981 : "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3748 : :
3749 : 16917 : bool fatal;
3750 : 16917 : opt_loop_vec_info loop_vinfo
3751 : 16917 : = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3752 : : orig_loop_vinfo,
3753 : : vector_modes, mode_i,
3754 : : autodetected_vector_mode, fatal);
3755 : 16917 : if (fatal)
3756 : : break;
3757 : :
3758 : 16917 : if (loop_vinfo)
3759 : : {
3760 : 6517 : if (pick_lowest_cost_p
3761 : 0 : && orig_loop_vinfo->epilogue_vinfo
3762 : 6517 : && vect_joust_loop_vinfos (loop_vinfo,
3763 : 0 : orig_loop_vinfo->epilogue_vinfo))
3764 : : {
3765 : 0 : gcc_assert (vect_epilogues);
3766 : 0 : delete orig_loop_vinfo->epilogue_vinfo;
3767 : 0 : orig_loop_vinfo->epilogue_vinfo = nullptr;
3768 : : }
3769 : 6517 : if (!orig_loop_vinfo->epilogue_vinfo)
3770 : 6517 : orig_loop_vinfo->epilogue_vinfo = loop_vinfo;
3771 : : else
3772 : : {
3773 : 0 : delete loop_vinfo;
3774 : 0 : loop_vinfo = opt_loop_vec_info::success (NULL);
3775 : : }
3776 : :
3777 : : /* For now only allow one epilogue loop, but allow
3778 : : pick_lowest_cost_p to replace it, so commit to the
3779 : : first epilogue if we have no reason to try alternatives. */
3780 : 6517 : if (!pick_lowest_cost_p)
3781 : : break;
3782 : : }
3783 : :
3784 : 20800 : if (mode_i == vector_modes.length ())
3785 : : break;
3786 : : }
3787 : :
3788 : 12211 : orig_loop_vinfo = orig_loop_vinfo->epilogue_vinfo;
3789 : 12211 : if (!orig_loop_vinfo)
3790 : : break;
3791 : :
3792 : : /* When we selected a first vectorized epilogue, see if the target
3793 : : suggests having another one. */
3794 : 6517 : if (!unlimited_cost_model (loop)
3795 : 3745 : && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (orig_loop_vinfo)
3796 : 10260 : && (orig_loop_vinfo->vector_costs->suggested_epilogue_mode ()
3797 : : != VOIDmode))
3798 : : {
3799 : 158 : vector_modes[0]
3800 : 79 : = orig_loop_vinfo->vector_costs->suggested_epilogue_mode ();
3801 : 79 : cached_vf_per_mode[0] = 0;
3802 : 79 : mode_i = 0;
3803 : : }
3804 : : else
3805 : : break;
3806 : 79 : }
3807 : : while (1);
3808 : :
3809 : 12132 : if (first_loop_vinfo->epilogue_vinfo)
3810 : : {
3811 : 6440 : poly_uint64 lowest_th
3812 : 6440 : = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3813 : 6440 : loop_vec_info epilog_vinfo = first_loop_vinfo->epilogue_vinfo;
3814 : 6517 : do
3815 : : {
3816 : 6517 : poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (epilog_vinfo);
3817 : 6517 : gcc_assert (!LOOP_REQUIRES_VERSIONING (epilog_vinfo)
3818 : : || maybe_ne (lowest_th, 0U));
3819 : : /* Keep track of the known smallest versioning threshold. */
3820 : 6517 : if (ordered_p (lowest_th, th))
3821 : 6517 : lowest_th = ordered_min (lowest_th, th);
3822 : 6517 : epilog_vinfo = epilog_vinfo->epilogue_vinfo;
3823 : : }
3824 : 6517 : while (epilog_vinfo);
3825 : 6440 : LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3826 : 6440 : if (dump_enabled_p ())
3827 : 1275 : dump_printf_loc (MSG_NOTE, vect_location,
3828 : : "***** Choosing epilogue vector mode %s\n",
3829 : 1275 : GET_MODE_NAME
3830 : : (first_loop_vinfo->epilogue_vinfo->vector_mode));
3831 : : }
3832 : :
3833 : 12132 : return first_loop_vinfo;
3834 : 653347 : }
3835 : :
3836 : : /* Return true if there is an in-order reduction function for CODE, storing
3837 : : it in *REDUC_FN if so. */
3838 : :
3839 : : static bool
3840 : 4915 : fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3841 : : {
3842 : : /* We support MINUS_EXPR by negating the operand. This also preserves an
3843 : : initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
3844 : : (-0.0) = -0.0. */
3845 : 4915 : if (code == PLUS_EXPR || code == MINUS_EXPR)
3846 : : {
3847 : 4239 : *reduc_fn = IFN_FOLD_LEFT_PLUS;
3848 : 0 : return true;
3849 : : }
3850 : : return false;
3851 : : }
3852 : :
3853 : : /* Function reduction_fn_for_scalar_code
3854 : :
3855 : : Input:
3856 : : CODE - tree_code of a reduction operation.
3857 : :
3858 : : Output:
3859 : : REDUC_FN - the corresponding internal function to be used to reduce the
3860 : : vector of partial results into a single scalar result, or IFN_LAST
3861 : : if the operation is a supported reduction operation, but does not have
3862 : : such an internal function.
3863 : :
3864 : : Return FALSE if CODE currently cannot be vectorized as a reduction. */
3865 : :
3866 : : bool
3867 : 1966794 : reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3868 : : {
3869 : 1966794 : if (code.is_tree_code ())
3870 : 1966740 : switch (tree_code (code))
3871 : : {
3872 : 14346 : case MAX_EXPR:
3873 : 14346 : *reduc_fn = IFN_REDUC_MAX;
3874 : 14346 : return true;
3875 : :
3876 : 50959 : case MIN_EXPR:
3877 : 50959 : *reduc_fn = IFN_REDUC_MIN;
3878 : 50959 : return true;
3879 : :
3880 : 1048444 : case PLUS_EXPR:
3881 : 1048444 : *reduc_fn = IFN_REDUC_PLUS;
3882 : 1048444 : return true;
3883 : :
3884 : 251069 : case BIT_AND_EXPR:
3885 : 251069 : *reduc_fn = IFN_REDUC_AND;
3886 : 251069 : return true;
3887 : :
3888 : 287753 : case BIT_IOR_EXPR:
3889 : 287753 : *reduc_fn = IFN_REDUC_IOR;
3890 : 287753 : return true;
3891 : :
3892 : 42307 : case BIT_XOR_EXPR:
3893 : 42307 : *reduc_fn = IFN_REDUC_XOR;
3894 : 42307 : return true;
3895 : :
3896 : 271862 : case MULT_EXPR:
3897 : 271862 : case MINUS_EXPR:
3898 : 271862 : *reduc_fn = IFN_LAST;
3899 : 271862 : return true;
3900 : :
3901 : : default:
3902 : : return false;
3903 : : }
3904 : : else
3905 : 54 : switch (combined_fn (code))
3906 : : {
3907 : 30 : CASE_CFN_FMAX:
3908 : 30 : *reduc_fn = IFN_REDUC_FMAX;
3909 : 30 : return true;
3910 : :
3911 : 24 : CASE_CFN_FMIN:
3912 : 24 : *reduc_fn = IFN_REDUC_FMIN;
3913 : 24 : return true;
3914 : :
3915 : : default:
3916 : : return false;
3917 : : }
3918 : : }
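 : :
 : : /* A minimal scalar model (illustration only) of what the epilogue
 : : reduction named by IFN_REDUC_PLUS computes: collapsing a vector of
 : : per-lane partial results into one scalar. The names reduc_plus_model,
 : : partial and vf are assumptions for the sketch, not part of this file.
 : :
 : : static int
 : : reduc_plus_model (const int *partial, unsigned vf)
 : : {
 : : int sum = 0;
 : : for (unsigned i = 0; i < vf; ++i)
 : : sum += partial[i]; // add up the per-lane partial sums
 : : return sum;
 : : }
 : :
 : : IFN_REDUC_MIN/MAX/AND/IOR/XOR follow the same pattern with the
 : : corresponding operation in place of '+'. */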
3919 : :
3920 : : /* If there is a neutral value X such that a reduction would not be affected
3921 : : by the introduction of additional X elements, return that X, otherwise
3922 : : return null. CODE is the code of the reduction and SCALAR_TYPE is the type
3923 : : of the scalar elements. If the reduction has just a single initial value
3924 : : then INITIAL_VALUE is that value, otherwise it is null.
3925 : : If AS_INITIAL is TRUE the value is supposed to be used as initial value.
3926 : : In that case no signed zero is returned. */
3927 : :
3928 : : tree
3929 : 73436 : neutral_op_for_reduction (tree scalar_type, code_helper code,
3930 : : tree initial_value, bool as_initial)
3931 : : {
3932 : 73436 : if (code.is_tree_code ())
3933 : 73382 : switch (tree_code (code))
3934 : : {
3935 : 10142 : case DOT_PROD_EXPR:
3936 : 10142 : case SAD_EXPR:
3937 : 10142 : case MINUS_EXPR:
3938 : 10142 : case BIT_IOR_EXPR:
3939 : 10142 : case BIT_XOR_EXPR:
3940 : 10142 : return build_zero_cst (scalar_type);
3941 : 57625 : case WIDEN_SUM_EXPR:
3942 : 57625 : case PLUS_EXPR:
3943 : 57625 : if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
3944 : 15 : return build_real (scalar_type, dconstm0);
3945 : : else
3946 : 57610 : return build_zero_cst (scalar_type);
3947 : :
3948 : 1912 : case MULT_EXPR:
3949 : 1912 : return build_one_cst (scalar_type);
3950 : :
3951 : 1321 : case BIT_AND_EXPR:
3952 : 1321 : return build_all_ones_cst (scalar_type);
3953 : :
3954 : : case MAX_EXPR:
3955 : : case MIN_EXPR:
3956 : : return initial_value;
3957 : :
3958 : 356 : default:
3959 : 356 : return NULL_TREE;
3960 : : }
3961 : : else
3962 : 54 : switch (combined_fn (code))
3963 : : {
3964 : : CASE_CFN_FMIN:
3965 : : CASE_CFN_FMAX:
3966 : : return initial_value;
3967 : :
3968 : 0 : default:
3969 : 0 : return NULL_TREE;
3970 : : }
3971 : : }
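 : :
 : : /* Worked examples (illustration only, padding a hypothetical four-element
 : : reduction out to eight lanes) of why the values above are neutral:
 : :
 : : sum: a0+a1+a2+a3 + 0+0+0+0 == a0+a1+a2+a3
 : : product: a0*a1*a2*a3 * 1*1*1*1 == a0*a1*a2*a3
 : : bitwise AND: a0&a1&a2&a3 & ~0&~0&~0&~0 == a0&a1&a2&a3
 : :
 : : For a floating-point sum where signed zeros are honored and the value
 : : is not used as the initial value, the padding must be -0.0, since
 : : x + -0.0 == x even when x is -0.0, whereas -0.0 + +0.0 == +0.0 would
 : : lose the sign. */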
3972 : :
3973 : : /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3974 : : STMT is printed with a message MSG. */
3975 : :
3976 : : static void
3977 : 471 : report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3978 : : {
3979 : 471 : dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3980 : 471 : }
3981 : :
3982 : : /* Return true if we need an in-order reduction for operation CODE
3983 : : on type TYPE. */
3985 : :
3986 : : bool
3987 : 6250602 : needs_fold_left_reduction_p (tree type, code_helper code)
3988 : : {
3989 : : /* CHECKME: check for !flag_finite_math_only too? */
3990 : 6250602 : if (SCALAR_FLOAT_TYPE_P (type))
3991 : : {
3992 : 518635 : if (code.is_tree_code ())
3993 : 518585 : switch (tree_code (code))
3994 : : {
3995 : : case MIN_EXPR:
3996 : : case MAX_EXPR:
3997 : : return false;
3998 : :
3999 : 517125 : default:
4000 : 517125 : return !flag_associative_math;
4001 : : }
4002 : : else
4003 : 50 : switch (combined_fn (code))
4004 : : {
4005 : : CASE_CFN_FMIN:
4006 : : CASE_CFN_FMAX:
4007 : : return false;
4008 : :
4009 : 2 : default:
4010 : 2 : return !flag_associative_math;
4011 : : }
4012 : : }
4013 : :
4014 : 5731967 : if (INTEGRAL_TYPE_P (type))
4015 : 5731159 : return (!code.is_tree_code ()
4016 : 5731159 : || !operation_no_trapping_overflow (type, tree_code (code)));
4017 : :
4018 : 808 : if (SAT_FIXED_POINT_TYPE_P (type))
4019 : : return true;
4020 : :
4021 : : return false;
4022 : : }
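 : :
 : : /* A worked example (illustration only) of why floating-point additions
 : : cannot be reassociated without -fassociative-math: in IEEE double,
 : :
 : : (0.1 + 0.2) + 0.3 == 0.6000000000000001
 : : 0.1 + (0.2 + 0.3) == 0.6
 : :
 : : so a reduction such as s += a[i] must be evaluated in the original
 : : (fold-left) order to reproduce the scalar result exactly. */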
4023 : :
4024 : : /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
4025 : : has a handled computation expression. Store the main reduction
4026 : : operation in *CODE. */
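 : :
 : : /* For illustration (assumed GIMPLE, not taken from this file): given
 : :
 : : sum_1 = PHI <sum_0 (preheader), sum_3 (latch)>
 : : ...
 : : sum_2 = _5 + sum_1;
 : : sum_3 = _7 + sum_2;
 : :
 : : the path from the latch argument sum_3 back to the PHI uses PLUS_EXPR
 : : throughout, so *CODE becomes PLUS_EXPR; a path mixing different codes
 : : (say PLUS_EXPR and MULT_EXPR) is rejected. */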
4027 : :
4028 : : static bool
4029 : 62720 : check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
4030 : : tree loop_arg, code_helper *code,
4031 : : vec<std::pair<ssa_op_iter, use_operand_p> > &path,
4032 : : bool inner_loop_of_double_reduc)
4033 : : {
4034 : 62720 : auto_bitmap visited;
4035 : 62720 : tree lookfor = PHI_RESULT (phi);
4036 : 62720 : ssa_op_iter curri;
4037 : 62720 : use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
4038 : 131942 : while (USE_FROM_PTR (curr) != loop_arg)
4039 : 6502 : curr = op_iter_next_use (&curri);
4040 : 62720 : curri.i = curri.numops;
4041 : 600129 : do
4042 : : {
4043 : 600129 : path.safe_push (std::make_pair (curri, curr));
4044 : 600129 : tree use = USE_FROM_PTR (curr);
4045 : 600129 : if (use == lookfor)
4046 : : break;
4047 : 537562 : gimple *def = SSA_NAME_DEF_STMT (use);
4048 : 537562 : if (gimple_nop_p (def)
4049 : 537562 : || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
4050 : : {
4051 : 456221 : pop:
4052 : 456221 : do
4053 : : {
4054 : 456221 : std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
4055 : 456221 : curri = x.first;
4056 : 456221 : curr = x.second;
4057 : 499990 : do
4058 : 499990 : curr = op_iter_next_use (&curri);
4059 : : /* Skip already visited or non-SSA operands (from iterating
4060 : : over PHI args). */
4061 : : while (curr != NULL_USE_OPERAND_P
4062 : 999980 : && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
4063 : 172632 : || ! bitmap_set_bit (visited,
4064 : 172632 : SSA_NAME_VERSION
4065 : : (USE_FROM_PTR (curr)))));
4066 : : }
4067 : 912442 : while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
4068 : 154205 : if (curr == NULL_USE_OPERAND_P)
4069 : : break;
4070 : : }
4071 : : else
4072 : : {
4073 : 451197 : if (gimple_code (def) == GIMPLE_PHI)
4074 : 47552 : curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
4075 : : else
4076 : 403645 : curr = op_iter_init_use (&curri, def, SSA_OP_USE);
4077 : : while (curr != NULL_USE_OPERAND_P
4078 : 544276 : && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
4079 : 472649 : || ! bitmap_set_bit (visited,
4080 : 472649 : SSA_NAME_VERSION
4081 : : (USE_FROM_PTR (curr)))))
4082 : 93079 : curr = op_iter_next_use (&curri);
4083 : 451197 : if (curr == NULL_USE_OPERAND_P)
4084 : 67840 : goto pop;
4085 : : }
4086 : : }
4087 : : while (1);
4088 : 62720 : if (dump_file && (dump_flags & TDF_DETAILS))
4089 : : {
4090 : 3534 : dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
4091 : 3534 : unsigned i;
4092 : 3534 : std::pair<ssa_op_iter, use_operand_p> *x;
4093 : 12135 : FOR_EACH_VEC_ELT (path, i, x)
4094 : 8601 : dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
4095 : 3534 : dump_printf (MSG_NOTE, "\n");
4096 : : }
4097 : :
4098 : : /* Check whether the reduction path detected is valid. */
4099 : 62720 : bool fail = path.length () == 0;
4100 : 62720 : bool neg = false;
4101 : 62720 : int sign = -1;
4102 : 62720 : *code = ERROR_MARK;
4103 : 135197 : for (unsigned i = 1; i < path.length (); ++i)
4104 : : {
4105 : 75414 : gimple *use_stmt = USE_STMT (path[i].second);
4106 : 75414 : gimple_match_op op;
4107 : 75414 : if (!gimple_extract_op (use_stmt, &op))
4108 : : {
4109 : : fail = true;
4110 : 2937 : break;
4111 : : }
4112 : 74862 : unsigned int opi = op.num_ops;
4113 : 74862 : if (gassign *assign = dyn_cast<gassign *> (use_stmt))
4114 : : {
4115 : : /* The following make sure we can compute the operand index
4116 : : easily plus it mostly disallows chaining via COND_EXPR condition
4117 : : operands. */
4118 : 120221 : for (opi = 0; opi < op.num_ops; ++opi)
4119 : 119282 : if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
4120 : : break;
4121 : : }
4122 : 3224 : else if (gcall *call = dyn_cast<gcall *> (use_stmt))
4123 : : {
4124 : 6464 : for (opi = 0; opi < op.num_ops; ++opi)
4125 : 6464 : if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
4126 : : break;
4127 : : }
4128 : 74862 : if (opi == op.num_ops)
4129 : : {
4130 : : fail = true;
4131 : : break;
4132 : : }
4133 : 73923 : op.code = canonicalize_code (op.code, op.type);
4134 : 73923 : if (op.code == MINUS_EXPR)
4135 : : {
4136 : 3701 : op.code = PLUS_EXPR;
4137 : : /* Track whether we negate the reduction value each iteration. */
4138 : 3701 : if (op.ops[1] == op.ops[opi])
4139 : 32 : neg = ! neg;
4140 : : }
4141 : 70222 : else if (op.code == IFN_COND_SUB)
4142 : : {
4143 : 2 : op.code = IFN_COND_ADD;
4144 : : /* Track whether we negate the reduction value each iteration. */
4145 : 2 : if (op.ops[2] == op.ops[opi])
4146 : 0 : neg = ! neg;
4147 : : }
4148 : : /* For an FMA the reduction code is PLUS_EXPR if the addition chain
4149 : : is the reduction. */
4150 : 70220 : else if (op.code == IFN_FMA && opi == 2)
4151 : 24 : op.code = PLUS_EXPR;
4152 : 73923 : if (CONVERT_EXPR_CODE_P (op.code)
4153 : 73923 : && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
4154 : : ;
4155 : 70450 : else if (*code == ERROR_MARK)
4156 : : {
4157 : 61186 : *code = op.code;
4158 : 61186 : sign = TYPE_SIGN (op.type);
4159 : : }
4160 : 9264 : else if (op.code != *code)
4161 : : {
4162 : : fail = true;
4163 : : break;
4164 : : }
4165 : 8028 : else if ((op.code == MIN_EXPR
4166 : 7884 : || op.code == MAX_EXPR)
4167 : 8034 : && sign != TYPE_SIGN (op.type))
4168 : : {
4169 : : fail = true;
4170 : : break;
4171 : : }
4172 : : /* Check that the op is used on only a single stmt. For the
4173 : : non-value-changing tail and the last stmt allow out-of-loop uses,
4174 : : but not when this is the inner loop of a double reduction.
4175 : : ??? We could relax this and handle arbitrary live stmts by
4176 : : forcing a scalar epilogue, for example. */
4177 : 72684 : imm_use_iterator imm_iter;
4178 : 72684 : use_operand_p use_p;
4179 : 72684 : gimple *op_use_stmt;
4180 : 72684 : unsigned cnt = 0;
4181 : 75882 : bool cond_fn_p = op.code.is_internal_fn ()
4182 : 3198 : && (conditional_internal_fn_code (internal_fn (op.code))
4183 : 72684 : != ERROR_MARK);
4184 : :
4185 : 169983 : FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
4186 : : {
4187 : : /* In case of a COND_OP (mask, op1, op2, op1) reduction we should
4188 : : have op1 twice (once as definition, once as else) in the same
4189 : : operation. Enforce this. */
4190 : 97299 : if (cond_fn_p && op_use_stmt == use_stmt)
4191 : : {
4192 : 3142 : gcall *call = as_a<gcall *> (use_stmt);
4193 : 3142 : unsigned else_pos
4194 : 3142 : = internal_fn_else_index (internal_fn (op.code));
4195 : 3142 : if (gimple_call_arg (call, else_pos) != op.ops[opi])
4196 : : {
4197 : : fail = true;
4198 : : break;
4199 : : }
4200 : 15710 : for (unsigned int j = 0; j < gimple_call_num_args (call); ++j)
4201 : : {
4202 : 12568 : if (j == else_pos)
4203 : 3142 : continue;
4204 : 9426 : if (gimple_call_arg (call, j) == op.ops[opi])
4205 : 3142 : cnt++;
4206 : : }
4207 : : }
4208 : 94157 : else if (!is_gimple_debug (op_use_stmt)
4209 : 94157 : && ((*code != ERROR_MARK || inner_loop_of_double_reduc)
4210 : 1776 : || flow_bb_inside_loop_p (loop,
4211 : 1776 : gimple_bb (op_use_stmt))))
4212 : 139641 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4213 : 69825 : cnt++;
4214 : 72684 : }
4215 : :
4216 : 72684 : if (cnt != 1)
4217 : : {
4218 : : fail = true;
4219 : : break;
4220 : : }
4221 : : }
4222 : 65819 : return ! fail && ! neg && *code != ERROR_MARK;
4223 : 62720 : }
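/* [Editorial sketch, added for illustration; not part of the measured
   source.]  For a chained reduction such as

     for (i = 0; i < N; i++)
       {
         s = s + a[i];
         s = s + b[i];
       }

   the loop header contains GIMPLE roughly of the form

     s_1 = PHI <s_0 (preheader), s_3 (latch)>
     s_2 = s_1 + _a;
     s_3 = s_2 + _b;

   The walk above starts from the PHI use of the latch value s_3 and
   follows use operands through the defining statements until it reaches
   the PHI result s_1, recording one (ssa_op_iter, use_operand_p) pair
   per step.  The validation loop then requires every statement on the
   path to use the same canonicalized code (PLUS_EXPR here) and each
   intermediate value to have a single use, modulo the documented
   exceptions for conditional internal functions and out-of-loop uses.  */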
4224 : :
4225 : : bool
4226 : 19 : check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
4227 : : tree loop_arg, enum tree_code code)
4228 : : {
4229 : 19 : auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4230 : 19 : code_helper code_;
4231 : 19 : return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path, false)
4232 : 19 : && code_ == code);
4233 : 19 : }
4234 : :
4235 : :
4236 : :
4237 : : /* Function vect_is_simple_reduction
4238 : :
4239 : : (1) Detect a cross-iteration def-use cycle that represents a simple
4240 : : reduction computation. We look for the following pattern:
4241 : :
4242 : : loop_header:
4243 : : a1 = phi < a0, a2 >
4244 : : a3 = ...
4245 : : a2 = operation (a3, a1)
4246 : :
4247 : : or
4248 : :
4249 : : a3 = ...
4250 : : loop_header:
4251 : : a1 = phi < a0, a2 >
4252 : : a2 = operation (a3, a1)
4253 : :
4254 : : such that:
4255 : : 1. operation is commutative and associative and it is safe to
4256 : : change the order of the computation
4257 : : 2. no uses for a2 in the loop (a2 is used out of the loop)
4258 : : 3. no uses of a1 in the loop besides the reduction operation
4259 : : 4. no uses of a1 outside the loop.
4260 : :
4261 : : Conditions 1,4 are tested here.
4262 : : Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
4263 : :
4264 : : (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
4265 : : nested cycles.
4266 : :
4267 : : (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
4268 : : reductions:
4269 : :
4270 : : a1 = phi < a0, a2 >
4271 : : inner loop (def of a3)
4272 : : a2 = phi < a3 >
4273 : :
4274 : : (4) Detect condition expressions, i.e.:
4275 : : for (int i = 0; i < N; i++)
4276 : : if (a[i] < val)
4277 : : ret_val = a[i];
4278 : :
4279 : : */
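/* [Editorial sketch, added for illustration; not part of the measured
   source.]  Pattern (1) corresponds to a plain sum reduction such as

     int sum = 0;
     for (i = 0; i < N; i++)
       sum += a[i];

   whose loop header in GIMPLE SSA form looks roughly like

     sum_1 = PHI <sum_0 (preheader), sum_2 (latch)>
     _3 = a[i_5];
     sum_2 = sum_1 + _3;

   The PHI result sum_1 feeds the PLUS_EXPR whose result sum_2 is carried
   back over the latch edge, which is exactly the cross-iteration def-use
   cycle this function detects.  */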
4280 : :
4281 : : static stmt_vec_info
4282 : 84502 : vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
4283 : : bool *double_reduc, bool *reduc_chain_p, bool slp)
4284 : : {
4285 : 84502 : gphi *phi = as_a <gphi *> (phi_info->stmt);
4286 : 84502 : gimple *phi_use_stmt = NULL;
4287 : 84502 : imm_use_iterator imm_iter;
4288 : 84502 : use_operand_p use_p;
4289 : :
4290 : 84502 : *double_reduc = false;
4291 : 84502 : *reduc_chain_p = false;
4292 : 84502 : STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
4293 : :
4294 : 84502 : tree phi_name = PHI_RESULT (phi);
4295 : : /* ??? If there are no uses of the PHI result the inner loop reduction
4296 : : won't be detected as possibly double-reduction by vectorizable_reduction
4297 : : because that tries to walk the PHI arg from the preheader edge which
4298 : : can be constant. See PR60382. */
4299 : 84502 : if (has_zero_uses (phi_name))
4300 : : return NULL;
4301 : 84399 : class loop *loop = (gimple_bb (phi))->loop_father;
4302 : 84399 : unsigned nphi_def_loop_uses = 0;
4303 : 208482 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
4304 : : {
4305 : 128628 : gimple *use_stmt = USE_STMT (use_p);
4306 : 128628 : if (is_gimple_debug (use_stmt))
4307 : 31040 : continue;
4308 : :
4309 : 97588 : if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4310 : : {
4311 : 4545 : if (dump_enabled_p ())
4312 : 53 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4313 : : "intermediate value used outside loop.\n");
4314 : :
4315 : 4545 : return NULL;
4316 : : }
4317 : :
4318 : : /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4319 : : op1 twice (once as definition, once as else) in the same operation.
4320 : : Only count it as one. */
4321 : 93043 : if (use_stmt != phi_use_stmt)
4322 : : {
4323 : 89540 : nphi_def_loop_uses++;
4324 : 89540 : phi_use_stmt = use_stmt;
4325 : : }
4326 : : }
4327 : :
4328 : 79854 : tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
4329 : 79854 : if (TREE_CODE (latch_def) != SSA_NAME)
4330 : : {
4331 : 1210 : if (dump_enabled_p ())
4332 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4333 : : "reduction: not ssa_name: %T\n", latch_def);
4334 : 1210 : return NULL;
4335 : : }
4336 : :
4337 : 78644 : stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
4338 : 78644 : if (!def_stmt_info
4339 : 78644 : || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
4340 : 135 : return NULL;
4341 : :
4342 : 78509 : bool nested_in_vect_loop
4343 : 78509 : = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
4344 : 78509 : unsigned nlatch_def_loop_uses = 0;
4345 : 78509 : auto_vec<gphi *, 3> lcphis;
4346 : 78509 : bool inner_loop_of_double_reduc = false;
4347 : 294853 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
4348 : : {
4349 : 216344 : gimple *use_stmt = USE_STMT (use_p);
4350 : 216344 : if (is_gimple_debug (use_stmt))
4351 : 59477 : continue;
4352 : 156867 : if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4353 : 86949 : nlatch_def_loop_uses++;
4354 : : else
4355 : : {
4356 : : /* We can have more than one loop-closed PHI. */
4357 : 69918 : lcphis.safe_push (as_a <gphi *> (use_stmt));
4358 : 69918 : if (nested_in_vect_loop
4359 : 69918 : && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
4360 : : == vect_double_reduction_def))
4361 : : inner_loop_of_double_reduc = true;
4362 : : }
4363 : : }
4364 : :
4365 : : /* If we are vectorizing an inner reduction we execute it in the
4366 : : original order only if we are not dealing with a double
4367 : : reduction. */
4368 : 78509 : if (nested_in_vect_loop && !inner_loop_of_double_reduc)
4369 : : {
4370 : 2137 : if (dump_enabled_p ())
4371 : 370 : report_vect_op (MSG_NOTE, def_stmt_info->stmt,
4372 : : "detected nested cycle: ");
4373 : 2137 : return def_stmt_info;
4374 : : }
4375 : :
4376 : : /* When the inner loop of a double reduction ends up with more than
4377 : : one loop-closed PHI we have failed to classify alternate such
4378 : : PHIs as double reduction, leading to wrong code. See PR103237. */
4379 : 77326 : if (inner_loop_of_double_reduc && lcphis.length () != 1)
4380 : : {
4381 : 1 : if (dump_enabled_p ())
4382 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4383 : : "unhandled double reduction\n");
4384 : 1 : return NULL;
4385 : : }
4386 : :
4387 : : /* If this isn't a nested cycle or if the nested cycle reduction value
4388 : : is used outside of the inner loop we cannot handle uses of the reduction
4389 : : value. */
4390 : 76371 : if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
4391 : : {
4392 : 12538 : if (dump_enabled_p ())
4393 : 311 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4394 : : "reduction used in loop.\n");
4395 : 12538 : return NULL;
4396 : : }
4397 : :
4398 : : /* If DEF_STMT is a phi node itself, we expect it to have a single argument
4399 : : defined in the inner loop. */
4400 : 63833 : if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
4401 : : {
4402 : 1132 : tree op1 = PHI_ARG_DEF (def_stmt, 0);
4403 : 1132 : if (gimple_phi_num_args (def_stmt) != 1
4404 : 1132 : || TREE_CODE (op1) != SSA_NAME)
4405 : : {
4406 : 52 : if (dump_enabled_p ())
4407 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4408 : : "unsupported phi node definition.\n");
4409 : :
4410 : 52 : return NULL;
4411 : : }
4412 : :
4413 : : /* Verify there is an inner cycle composed of the PHI phi_use_stmt
4414 : : and the latch definition op1. */
4415 : 1080 : gimple *def1 = SSA_NAME_DEF_STMT (op1);
4416 : 1080 : if (gimple_bb (def1)
4417 : 1080 : && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4418 : 1080 : && loop->inner
4419 : 1072 : && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
4420 : 1072 : && (is_gimple_assign (def1) || is_gimple_call (def1))
4421 : 1063 : && is_a <gphi *> (phi_use_stmt)
4422 : 1052 : && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
4423 : 1052 : && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
4424 : : loop_latch_edge (loop->inner)))
4425 : 2130 : && lcphis.length () == 1)
4426 : : {
4427 : 966 : if (dump_enabled_p ())
4428 : 101 : report_vect_op (MSG_NOTE, def_stmt,
4429 : : "detected double reduction: ");
4430 : :
4431 : 966 : *double_reduc = true;
4432 : 966 : return def_stmt_info;
4433 : : }
4434 : :
4435 : 114 : return NULL;
4436 : : }
4437 : :
4438 : : /* Look for the expression computing latch_def from the loop PHI result. */
4439 : 62701 : auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4440 : 62701 : code_helper code;
4441 : 62701 : if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
4442 : : path, inner_loop_of_double_reduc))
4443 : : {
4444 : 59602 : STMT_VINFO_REDUC_CODE (phi_info) = code;
4445 : 59602 : if (code == COND_EXPR && !nested_in_vect_loop)
4446 : 4101 : STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
4447 : :
4448 : : /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
4449 : : reduction chain for which the additional restriction is that
4450 : : all operations in the chain are the same. */
4451 : 59602 : auto_vec<stmt_vec_info, 8> reduc_chain;
4452 : 59602 : unsigned i;
4453 : 59602 : bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
4454 : 190109 : for (i = path.length () - 1; i >= 1; --i)
4455 : : {
4456 : 70905 : gimple *stmt = USE_STMT (path[i].second);
4457 : 70905 : stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
4458 : 70905 : gimple_match_op op;
4459 : 70905 : if (!gimple_extract_op (stmt, &op))
4460 : 0 : gcc_unreachable ();
4461 : 70905 : if (gassign *assign = dyn_cast<gassign *> (stmt))
4462 : 67701 : STMT_VINFO_REDUC_IDX (stmt_info)
4463 : 67701 : = path[i].second->use - gimple_assign_rhs1_ptr (assign);
4464 : : else
4465 : : {
4466 : 3204 : gcall *call = as_a<gcall *> (stmt);
4467 : 3204 : STMT_VINFO_REDUC_IDX (stmt_info)
4468 : 3204 : = path[i].second->use - gimple_call_arg_ptr (call, 0);
4469 : : }
4470 : 70905 : bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
4471 : 70905 : && (i == 1 || i == path.length () - 1));
4472 : 7015 : if ((op.code != code && !leading_conversion)
4473 : : /* We can only handle the final value in epilogue
4474 : : generation for reduction chains. */
4475 : 74280 : || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
4476 : : is_slp_reduc = false;
4477 : : /* For reduction chains we support trailing/leading
4478 : : conversions. We do not store those in the actual chain. */
4479 : 70905 : if (leading_conversion)
4480 : 3375 : continue;
4481 : 67530 : reduc_chain.safe_push (stmt_info);
4482 : : }
4483 : 110581 : if (slp && is_slp_reduc && reduc_chain.length () > 1)
4484 : : {
4485 : 3377 : for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
4486 : : {
4487 : 2622 : REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
4488 : 2622 : REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
4489 : : }
4490 : 755 : REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
4491 : 755 : REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
4492 : :
4493 : : /* Save the chain for further analysis in SLP detection. */
4494 : 755 : LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
4495 : 1510 : REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
4496 : :
4497 : 755 : *reduc_chain_p = true;
4498 : 755 : if (dump_enabled_p ())
4499 : 266 : dump_printf_loc (MSG_NOTE, vect_location,
4500 : : "reduction: detected reduction chain\n");
4501 : : }
4502 : 58847 : else if (dump_enabled_p ())
4503 : 3191 : dump_printf_loc (MSG_NOTE, vect_location,
4504 : : "reduction: detected reduction\n");
4505 : :
4506 : 59602 : return def_stmt_info;
4507 : 59602 : }
4508 : :
4509 : 3099 : if (dump_enabled_p ())
4510 : 91 : dump_printf_loc (MSG_NOTE, vect_location,
4511 : : "reduction: unknown pattern\n");
4512 : :
4513 : : return NULL;
4514 : 141210 : }
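/* [Editorial sketch, added for illustration; not part of the measured
   source.]  The double-reduction case (3) above arises for source like

     int s = 0;
     for (j = 0; j < M; j++)
       for (i = 0; i < N; i++)
         s += a[j][i];

   When the outer loop is vectorized, the outer-loop header PHI for s has
   its latch value defined by the single-argument loop-closed PHI of the
   inner loop, which is what the gphi branch above recognizes before
   setting *double_reduc.  */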
4515 : :
4516 : : /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
4517 : : PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
4518 : : or -1 if not known. */
4519 : :
4520 : : static int
4521 : 360907 : vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
4522 : : {
4523 : 360907 : int assumed_vf = vect_vf_for_cost (loop_vinfo);
4524 : 360907 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
4525 : : {
4526 : 140803 : if (dump_enabled_p ())
4527 : 2743 : dump_printf_loc (MSG_NOTE, vect_location,
4528 : : "cost model: epilogue peel iters set to vf/2 "
4529 : : "because loop iterations are unknown.\n");
4530 : 140803 : return assumed_vf / 2;
4531 : : }
4532 : : else
4533 : : {
4534 : 220104 : int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
4535 : 220104 : peel_iters_prologue = MIN (niters, peel_iters_prologue);
4536 : 220104 : int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
4537 : : /* If we need to peel for gaps, but no peeling is required, we have to
4538 : : peel VF iterations. */
4539 : 220104 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
4540 : 220104 : peel_iters_epilogue = assumed_vf;
4541 : 220104 : return peel_iters_epilogue;
4542 : : }
4543 : : }
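/* [Editorial worked example, added for illustration; not part of the
   measured source.]  With a compile-time-known trip count niters = 100,
   assumed_vf = 8 and peel_iters_prologue = 3 the epilogue peels
   (100 - 3) % 8 = 1 iteration.  With peel_iters_prologue = 4 the
   remainder is (100 - 4) % 8 = 0, which PEELING_FOR_GAPS bumps to a full
   assumed_vf = 8 iterations.  When the trip count (or the prologue peel
   count) is unknown the function simply assumes assumed_vf / 2 = 4
   epilogue iterations.  */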
4544 : :
4545 : : /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
4546 : : int
4547 : 282401 : vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
4548 : : int *peel_iters_epilogue,
4549 : : stmt_vector_for_cost *scalar_cost_vec,
4550 : : stmt_vector_for_cost *prologue_cost_vec,
4551 : : stmt_vector_for_cost *epilogue_cost_vec)
4552 : : {
4553 : 282401 : int retval = 0;
4554 : :
4555 : 282401 : *peel_iters_epilogue
4556 : 282401 : = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
4557 : :
4558 : 282401 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
4559 : : {
4560 : : /* If peeled iterations are known but the number of scalar loop
4561 : : iterations is unknown, count a taken branch per peeled loop. */
4562 : 94152 : if (peel_iters_prologue > 0)
4563 : 54654 : retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
4564 : : vect_prologue);
4565 : 94152 : if (*peel_iters_epilogue > 0)
4566 : 94080 : retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
4567 : : vect_epilogue);
4568 : : }
4569 : :
4570 : 282401 : stmt_info_for_cost *si;
4571 : 282401 : int j;
4572 : 282401 : if (peel_iters_prologue)
4573 : 686953 : FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4574 : 570917 : retval += record_stmt_cost (prologue_cost_vec,
4575 : 570917 : si->count * peel_iters_prologue,
4576 : : si->kind, si->stmt_info, si->misalign,
4577 : : vect_prologue);
4578 : 282401 : if (*peel_iters_epilogue)
4579 : 1114850 : FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4580 : 925262 : retval += record_stmt_cost (epilogue_cost_vec,
4581 : 925262 : si->count * *peel_iters_epilogue,
4582 : : si->kind, si->stmt_info, si->misalign,
4583 : : vect_epilogue);
4584 : :
4585 : 282401 : return retval;
4586 : : }
4587 : :
4588 : : /* Function vect_estimate_min_profitable_iters
4589 : :
4590 : : Return the number of iterations required for the vector version of the
4591 : : loop to be profitable relative to the cost of the scalar version of the
4592 : : loop.
4593 : :
4594 : : *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4595 : : of iterations for vectorization. -1 value means loop vectorization
4596 : : is not profitable. This returned value may be used for dynamic
4597 : : profitability check.
4598 : :
4599 : : *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4600 : : for static check against estimated number of iterations. */
4601 : :
4602 : : static void
4603 : 94547 : vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
4604 : : int *ret_min_profitable_niters,
4605 : : int *ret_min_profitable_estimate,
4606 : : unsigned *suggested_unroll_factor)
4607 : : {
4608 : 94547 : int min_profitable_iters;
4609 : 94547 : int min_profitable_estimate;
4610 : 94547 : int peel_iters_prologue;
4611 : 94547 : int peel_iters_epilogue;
4612 : 94547 : unsigned vec_inside_cost = 0;
4613 : 94547 : int vec_outside_cost = 0;
4614 : 94547 : unsigned vec_prologue_cost = 0;
4615 : 94547 : unsigned vec_epilogue_cost = 0;
4616 : 94547 : int scalar_single_iter_cost = 0;
4617 : 94547 : int scalar_outside_cost = 0;
4618 : 94547 : int assumed_vf = vect_vf_for_cost (loop_vinfo);
4619 : 94547 : int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
4620 : 94547 : vector_costs *target_cost_data = loop_vinfo->vector_costs;
4621 : :
4622 : : /* Cost model disabled. */
4623 : 94547 : if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
4624 : : {
4625 : 15860 : if (dump_enabled_p ())
4626 : 9672 : dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4627 : 15860 : *ret_min_profitable_niters = 0;
4628 : 15860 : *ret_min_profitable_estimate = 0;
4629 : 15860 : return;
4630 : : }
4631 : :
4632 : : /* Requires loop versioning tests to handle misalignment. */
4633 : 78687 : if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4634 : : {
4635 : : /* FIXME: Make cost depend on complexity of individual check. */
4636 : 24 : unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4637 : 24 : (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4638 : 24 : if (dump_enabled_p ())
4639 : 1 : dump_printf (MSG_NOTE,
4640 : : "cost model: Adding cost of checks for loop "
4641 : : "versioning to treat misalignment.\n");
4642 : : }
4643 : :
4644 : : /* Requires loop versioning with alias checks. */
4645 : 78687 : if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4646 : : {
4647 : : /* FIXME: Make cost depend on complexity of individual check. */
4648 : 3978 : unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4649 : 3978 : (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4650 : 3978 : len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4651 : 0 : if (len)
4652 : : /* Count LEN - 1 ANDs and LEN comparisons. */
4653 : 0 : (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4654 : : scalar_stmt, vect_prologue);
4655 : 3978 : len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4656 : 1102 : if (len)
4657 : : {
4658 : : /* Count LEN - 1 ANDs and LEN comparisons. */
4659 : 1102 : unsigned int nstmts = len * 2 - 1;
4660 : : /* +1 for each bias that needs adding. */
4661 : 2204 : for (unsigned int i = 0; i < len; ++i)
4662 : 1102 : if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4663 : 119 : nstmts += 1;
4664 : 1102 : (void) add_stmt_cost (target_cost_data, nstmts,
4665 : : scalar_stmt, vect_prologue);
4666 : : }
4667 : 3978 : if (dump_enabled_p ())
4668 : 14 : dump_printf (MSG_NOTE,
4669 : : "cost model: Adding cost of checks for loop "
4670 : : "versioning aliasing.\n");
4671 : : }
4672 : :
4673 : : /* Requires loop versioning with niter checks. */
4674 : 78687 : if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4675 : : {
4676 : : /* FIXME: Make cost depend on complexity of individual check. */
4677 : 695 : (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4678 : : NULL, NULL, NULL_TREE, 0, vect_prologue);
4679 : 695 : if (dump_enabled_p ())
4680 : 1 : dump_printf (MSG_NOTE,
4681 : : "cost model: Adding cost of checks for loop "
4682 : : "versioning niters.\n");
4683 : : }
4684 : :
4685 : 78687 : if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4686 : 4681 : (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4687 : : vect_prologue);
4688 : :
4689 : : /* Count statements in scalar loop. Using this as scalar cost for a single
4690 : : iteration for now.
4691 : :
4692 : : TODO: Add outer loop support.
4693 : :
4694 : : TODO: Consider assigning different costs to different scalar
4695 : : statements. */
4696 : :
4697 : 78687 : scalar_single_iter_cost = (loop_vinfo->scalar_costs->total_cost ()
4698 : 78687 : * param_vect_scalar_cost_multiplier) / 100;
4699 : :
4700 : : /* Add additional cost for the peeled instructions in prologue and epilogue
4701 : : loop. (For fully-masked loops there will be no peeling.)
4702 : :
4703 : : FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4704 : : at compile time, we assume it's vf/2 (the worst case would be vf-1).
4705 : :
4706 : : TODO: Build an expression that represents peel_iters for prologue and
4707 : : epilogue to be used in a run-time test. */
4708 : :
4709 : 78687 : bool prologue_need_br_taken_cost = false;
4710 : 78687 : bool prologue_need_br_not_taken_cost = false;
4711 : :
4712 : : /* Calculate peel_iters_prologue. */
4713 : 78687 : if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4714 : : peel_iters_prologue = 0;
4715 : 78687 : else if (npeel < 0)
4716 : : {
4717 : 166 : peel_iters_prologue = assumed_vf / 2;
4718 : 166 : if (dump_enabled_p ())
4719 : 4 : dump_printf (MSG_NOTE, "cost model: "
4720 : : "prologue peel iters set to vf/2.\n");
4721 : :
4722 : : /* If peeled iterations are unknown, count a taken branch and a not taken
4723 : : branch per peeled loop. Even if scalar loop iterations are known,
4724 : : vector iterations are not known since peeled prologue iterations are
4725 : : not known. Hence guards remain the same. */
4726 : : prologue_need_br_taken_cost = true;
4727 : : prologue_need_br_not_taken_cost = true;
4728 : : }
4729 : : else
4730 : : {
4731 : 78521 : peel_iters_prologue = npeel;
4732 : 78521 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4733 : : /* If peeled iterations are known but the number of scalar loop
4734 : : iterations is unknown, count a taken branch per peeled loop. */
4735 : 78687 : prologue_need_br_taken_cost = true;
4736 : : }
4737 : :
4738 : 78687 : bool epilogue_need_br_taken_cost = false;
4739 : 78687 : bool epilogue_need_br_not_taken_cost = false;
4740 : :
4741 : : /* Calculate peel_iters_epilogue. */
4742 : 78687 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4743 : : /* We need to peel exactly one iteration for gaps. */
4744 : 15 : peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4745 : 78672 : else if (npeel < 0)
4746 : : {
4747 : : /* If peeling for alignment is unknown, the loop bound of the main loop
4748 : : becomes unknown. */
4749 : 166 : peel_iters_epilogue = assumed_vf / 2;
4750 : 166 : if (dump_enabled_p ())
4751 : 4 : dump_printf (MSG_NOTE, "cost model: "
4752 : : "epilogue peel iters set to vf/2 because "
4753 : : "peeling for alignment is unknown.\n");
4754 : :
4755 : : /* See the same reason above in peel_iters_prologue calculation. */
4756 : : epilogue_need_br_taken_cost = true;
4757 : : epilogue_need_br_not_taken_cost = true;
4758 : : }
4759 : : else
4760 : : {
4761 : 78506 : peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4762 : 78506 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4763 : : /* If peeled iterations are known but the number of scalar loop
4764 : : iterations is unknown, count a taken branch per peeled loop. */
4765 : 78687 : epilogue_need_br_taken_cost = true;
4766 : : }
4767 : :
4768 : 78687 : stmt_info_for_cost *si;
4769 : 78687 : int j;
4770 : : /* Add costs associated with peel_iters_prologue. */
4771 : 78687 : if (peel_iters_prologue)
4772 : 791 : FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4773 : : {
4774 : 616 : (void) add_stmt_cost (target_cost_data,
4775 : 616 : si->count * peel_iters_prologue, si->kind,
4776 : : si->stmt_info, si->node, si->vectype,
4777 : : si->misalign, vect_prologue);
4778 : : }
4779 : :
4780 : : /* Add costs associated with peel_iters_epilogue. */
4781 : 78687 : if (peel_iters_epilogue)
4782 : 267913 : FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4783 : : {
4784 : 212670 : (void) add_stmt_cost (target_cost_data,
4785 : 212670 : si->count * peel_iters_epilogue, si->kind,
4786 : : si->stmt_info, si->node, si->vectype,
4787 : : si->misalign, vect_epilogue);
4788 : : }
4789 : :
4790 : : /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4791 : :
4792 : 78687 : if (prologue_need_br_taken_cost)
4793 : 167 : (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4794 : : vect_prologue);
4795 : :
4796 : 78687 : if (prologue_need_br_not_taken_cost)
4797 : 166 : (void) add_stmt_cost (target_cost_data, 1,
4798 : : cond_branch_not_taken, vect_prologue);
4799 : :
4800 : 78687 : if (epilogue_need_br_taken_cost)
4801 : 46279 : (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4802 : : vect_epilogue);
4803 : :
4804 : 78687 : if (epilogue_need_br_not_taken_cost)
4805 : 166 : (void) add_stmt_cost (target_cost_data, 1,
4806 : : cond_branch_not_taken, vect_epilogue);
4807 : :
4808 : : /* Take care of special costs for rgroup controls of partial vectors. */
4809 : 15 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4810 : 78702 : && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4811 : : == vect_partial_vectors_avx512))
4812 : : {
4813 : : /* Calculate how many masks we need to generate. */
4814 : 15 : unsigned int num_masks = 0;
4815 : 15 : bool need_saturation = false;
4816 : 62 : for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
4817 : 17 : if (rgm.type)
4818 : : {
4819 : 15 : unsigned nvectors = rgm.factor;
4820 : 15 : num_masks += nvectors;
4821 : 15 : if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
4822 : 15 : < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
4823 : 4 : need_saturation = true;
4824 : : }
4825 : :
4826 : : /* ??? The target isn't able to identify the costs below as
4827 : : producing masks so it cannot penalize cases where we'd run
4828 : : out of mask registers for example. */
4829 : :
4830 : : /* ??? We are also failing to account for smaller vector masks
4831 : : we generate by splitting larger masks in vect_get_loop_mask. */
4832 : :
4833 : : /* In the worst case, we need to generate each mask in the prologue
4834 : : and in the loop body. We need one splat per group and one
4835 : : compare per mask.
4836 : :
4837 : : Sometimes the prologue mask will fold to a constant,
4838 : : so the actual prologue cost might be smaller. However, it's
4839 : : simpler and safer to use the worst-case cost; if this ends up
4840 : : being the tie-breaker between vectorizing or not, then it's
4841 : : probably better not to vectorize. */
4842 : 15 : (void) add_stmt_cost (target_cost_data,
4843 : : num_masks
4844 : 15 : + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4845 : : vector_stmt, NULL, NULL, NULL_TREE, 0,
4846 : : vect_prologue);
4847 : 30 : (void) add_stmt_cost (target_cost_data,
4848 : : num_masks
4849 : 30 : + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4850 : : vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);
4851 : :
4852 : : /* When we need saturation we need it both in the prologue and
4853 : : the epilogue. */
4854 : 15 : if (need_saturation)
4855 : : {
4856 : 4 : (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4857 : : NULL, NULL, NULL_TREE, 0, vect_prologue);
4858 : 4 : (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4859 : : NULL, NULL, NULL_TREE, 0, vect_body);
4860 : : }
4861 : : }
4862 : 0 : else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4863 : 78672 : && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4864 : : == vect_partial_vectors_while_ult))
4865 : : {
4866 : : /* Calculate how many masks we need to generate. */
4867 : : unsigned int num_masks = 0;
4868 : : rgroup_controls *rgm;
4869 : : unsigned int num_vectors_m1;
4870 : 0 : FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
4871 : : num_vectors_m1, rgm)
4872 : 0 : if (rgm->type)
4873 : 0 : num_masks += num_vectors_m1 + 1;
4874 : 0 : gcc_assert (num_masks > 0);
4875 : :
4876 : : /* In the worst case, we need to generate each mask in the prologue
4877 : : and in the loop body. One of the loop body mask instructions
4878 : : replaces the comparison in the scalar loop, and since we don't
4879 : : count the scalar comparison against the scalar body, we shouldn't
4880 : : count that vector instruction against the vector body either.
4881 : :
4882 : : Sometimes we can use unpacks instead of generating prologue
4883 : : masks and sometimes the prologue mask will fold to a constant,
4884 : : so the actual prologue cost might be smaller. However, it's
4885 : : simpler and safer to use the worst-case cost; if this ends up
4886 : : being the tie-breaker between vectorizing or not, then it's
4887 : : probably better not to vectorize. */
4888 : 0 : (void) add_stmt_cost (target_cost_data, num_masks,
4889 : : vector_stmt, NULL, NULL, NULL_TREE, 0,
4890 : : vect_prologue);
4891 : 0 : (void) add_stmt_cost (target_cost_data, num_masks - 1,
4892 : : vector_stmt, NULL, NULL, NULL_TREE, 0,
4893 : : vect_body);
4894 : : }
4895 : 78672 : else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4896 : : {
4897 : : /* Referring to the functions vect_set_loop_condition_partial_vectors
4898 : : and vect_set_loop_controls_directly, we need to generate each
4899 : : length in the prologue and in the loop body if required. Although
4900 : : there are some possible optimizations, we consider the worst case
4901 : : here. */
4902 : :
4903 : 0 : bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4904 : 0 : signed char partial_load_store_bias
4905 : : = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4906 : 0 : bool need_iterate_p
4907 : 0 : = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4908 : 0 : && !vect_known_niters_smaller_than_vf (loop_vinfo));
4909 : :
4910 : : /* Calculate how many statements to be added. */
4911 : 0 : unsigned int prologue_stmts = 0;
4912 : 0 : unsigned int body_stmts = 0;
4913 : :
4914 : 0 : rgroup_controls *rgc;
4915 : 0 : unsigned int num_vectors_m1;
4916 : 0 : FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4917 : 0 : if (rgc->type)
4918 : : {
4919 : : /* May need one SHIFT for nitems_total computation. */
4920 : 0 : unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4921 : 0 : if (nitems != 1 && !niters_known_p)
4922 : 0 : prologue_stmts += 1;
4923 : :
4924 : : /* May need one MAX and one MINUS for wrap around. */
4925 : 0 : if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4926 : 0 : prologue_stmts += 2;
4927 : :
4928 : : /* Need one MAX and one MINUS for each batch limit except for
4929 : : the first one. */
4930 : 0 : prologue_stmts += num_vectors_m1 * 2;
4931 : :
4932 : 0 : unsigned int num_vectors = num_vectors_m1 + 1;
4933 : :
4934 : : /* Need to set up lengths in prologue, only one MIN required
4935 : : for each since start index is zero. */
4936 : 0 : prologue_stmts += num_vectors;
4937 : :
4938 : : /* If we have a non-zero partial load bias, we need one PLUS
4939 : : to adjust the load length. */
4940 : 0 : if (partial_load_store_bias != 0)
4941 : 0 : body_stmts += 1;
4942 : :
4943 : 0 : unsigned int length_update_cost = 0;
4944 : 0 : if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
4945 : : /* For the decrement IV style, each needs only a single SELECT_VL
4946 : : or MIN at the start of an iteration to calculate the number of
4947 : : elements to be processed in the current iteration. */
4948 : : length_update_cost = 1;
4949 : : else
4950 : : /* For the increment IV style, each may need two MINs and one MINUS
4951 : : to update the lengths in the body for the next iteration. */
4952 : 0 : length_update_cost = 3;
4953 : :
4954 : 0 : if (need_iterate_p)
4955 : 0 : body_stmts += length_update_cost * num_vectors;
4956 : : }
4957 : :
4958 : 0 : (void) add_stmt_cost (target_cost_data, prologue_stmts,
4959 : : scalar_stmt, vect_prologue);
4960 : 0 : (void) add_stmt_cost (target_cost_data, body_stmts,
4961 : : scalar_stmt, vect_body);
4962 : : }
4963 : :
4964 : : /* FORNOW: The scalar outside cost is incremented in one of the
4965 : : following ways:
4966 : :
4967 : : 1. The vectorizer checks for alignment and aliasing and generates
4968 : : a condition that allows dynamic vectorization. A cost model
4969 : : check is ANDED with the versioning condition. Hence scalar code
4970 : : path now has the added cost of the versioning check.
4971 : :
4972 : : if (cost > th & versioning_check)
4973 : : jmp to vector code
4974 : :
4975 : : Hence run-time scalar is incremented by not-taken branch cost.
4976 : : Hence the run-time scalar cost is incremented by a not-taken branch cost.
4977 : : 2. The vectorizer then checks if a prologue is required. If the
4978 : : cost model check was not done before during versioning, it has to
4979 : : be done before the prologue check.
4980 : :
4981 : : if (cost <= th)
4982 : : prologue = scalar_iters
4983 : : if (prologue == 0)
4984 : : jmp to vector code
4985 : : else
4986 : : execute prologue
4987 : : if (prologue == num_iters)
4988 : : go to exit
4989 : :
4990 : : Hence the run-time scalar cost is incremented by a taken branch,
4991 : : plus a not-taken branch, plus a taken branch cost.
4992 : :
4993 : : 3. The vectorizer then checks if an epilogue is required. If the
4994 : : cost model check was not done before during prologue check, it
4995 : : has to be done with the epilogue check.
4996 : :
4997 : : if (prologue == 0)
4998 : : jmp to vector code
4999 : : else
5000 : : execute prologue
5001 : : if (prologue == num_iters)
5002 : : go to exit
5003 : : vector code:
5004 : : if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
5005 : : jmp to epilogue
5006 : :
5007 : : Hence the run-time scalar cost should be incremented by 2 taken
5008 : : branches.
5009 : :
5010 : : TODO: The back end may reorder the BBs differently and reverse
5011 : : conditions/branch directions. Change the estimates below to
5012 : : something more reasonable. */
5013 : :
5014 : : /* If the number of iterations is known and we do not do versioning, we can
5015 : : decide whether to vectorize at compile time. Hence the scalar version
5016 : : does not carry cost model guard costs. */
5017 : 31892 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5018 : 110579 : || LOOP_REQUIRES_VERSIONING (loop_vinfo))
5019 : : {
5020 : : /* Cost model check occurs at versioning. */
5021 : 47388 : if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
5022 : 4681 : scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
5023 : : else
5024 : : {
5025 : : /* Cost model check occurs at prologue generation. */
5026 : 42707 : if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
5027 : 26 : scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
5028 : 26 : + vect_get_stmt_cost (cond_branch_not_taken);
5029 : : /* Cost model check occurs at epilogue generation. */
5030 : : else
5031 : 42681 : scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
5032 : : }
5033 : : }
5034 : :
5035 : : /* Complete the target-specific cost calculations. */
5036 : 78687 : loop_vinfo->vector_costs->finish_cost (loop_vinfo->scalar_costs);
5037 : 78687 : vec_prologue_cost = loop_vinfo->vector_costs->prologue_cost ();
5038 : 78687 : vec_inside_cost = loop_vinfo->vector_costs->body_cost ();
5039 : 78687 : vec_epilogue_cost = loop_vinfo->vector_costs->epilogue_cost ();
5040 : 78687 : if (suggested_unroll_factor)
5041 : 78681 : *suggested_unroll_factor
5042 : 78681 : = loop_vinfo->vector_costs->suggested_unroll_factor ();
5043 : :
5044 : 78681 : if (suggested_unroll_factor && *suggested_unroll_factor > 1
5045 : 0 : && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
5046 : 0 : && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
5047 : : *suggested_unroll_factor,
5048 : : LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
5049 : : {
5050 : 0 : if (dump_enabled_p ())
5051 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5052 : : "can't unroll as unrolled vectorization factor larger"
5053 : : " than maximum vectorization factor: "
5054 : : HOST_WIDE_INT_PRINT_UNSIGNED "\n",
5055 : : LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
5056 : 0 : *suggested_unroll_factor = 1;
5057 : : }
5058 : :
5059 : 78687 : vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
5060 : :
5061 : 78687 : if (dump_enabled_p ())
5062 : : {
5063 : 604 : dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
5064 : 604 : dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
5065 : : vec_inside_cost);
5066 : 604 : dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
5067 : : vec_prologue_cost);
5068 : 604 : dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
5069 : : vec_epilogue_cost);
5070 : 604 : dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
5071 : : scalar_single_iter_cost);
5072 : 604 : dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
5073 : : scalar_outside_cost);
5074 : 604 : dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
5075 : : vec_outside_cost);
5076 : 604 : dump_printf (MSG_NOTE, " prologue iterations: %d\n",
5077 : : peel_iters_prologue);
5078 : 604 : dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
5079 : : peel_iters_epilogue);
5080 : : }
5081 : :
5082 : : /* Calculate number of iterations required to make the vector version
5083 : : profitable, relative to the loop bodies only. The following condition
5084 : : must hold true:
5085 : : SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
5086 : : where
5087 : : SIC = scalar iteration cost, VIC = vector iteration cost,
5088 : : VOC = vector outside cost, VF = vectorization factor,
5089 : : NPEEL = prologue iterations + epilogue iterations,
5090 : : SOC = scalar outside cost for run time cost model check. */
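/* [Editorial worked example, added for illustration; not part of the
   measured source.]  With illustrative numbers SIC = 4, VIC = 12,
   VF = 8, VOC = 28 and NPEEL = SOC = 0 the condition

     4 * niters > 12 * (niters / 8) + 28

   first holds at niters = 12, so 12 is the raw profitability threshold;
   the code below then additionally makes sure the threshold lets the
   vectorized loop body execute at least once (at least VF plus the
   prologue peel iterations).  */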
5091 : :
5092 : 78687 : int saving_per_viter = (scalar_single_iter_cost * assumed_vf
5093 : 78687 : - vec_inside_cost);
5094 : 78687 : if (saving_per_viter <= 0)
5095 : : {
5096 : 24979 : if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
5097 : 0 : warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
5098 : : "vectorization did not happen for a simd loop");
5099 : :
5100 : 24979 : if (dump_enabled_p ())
5101 : 18 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5102 : : "cost model: the vector iteration cost = %d "
5103 : : "divided by the scalar iteration cost = %d "
5104 : : "is greater than or equal to the vectorization factor = %d"
5105 : : ".\n",
5106 : : vec_inside_cost, scalar_single_iter_cost, assumed_vf);
5107 : 24979 : *ret_min_profitable_niters = -1;
5108 : 24979 : *ret_min_profitable_estimate = -1;
5109 : 24979 : return;
5110 : : }
5111 : :
5112 : : /* ??? The "if" arm is written to handle all cases; see below for what
5113 : : we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
5114 : 53708 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5115 : : {
5116 : : /* Rewriting the condition above in terms of the number of
5117 : : vector iterations (vniters) rather than the number of
5118 : : scalar iterations (niters) gives:
5119 : :
5120 : : SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
5121 : :
5122 : : <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
5123 : :
5124 : : For integer N, X and Y when X > 0:
5125 : :
5126 : : N * X > Y <==> N >= (Y /[floor] X) + 1. */
5127 : 9 : int outside_overhead = (vec_outside_cost
5128 : 9 : - scalar_single_iter_cost * peel_iters_prologue
5129 : 9 : - scalar_single_iter_cost * peel_iters_epilogue
5130 : : - scalar_outside_cost);
5131 : : /* We're only interested in cases that require at least one
5132 : : vector iteration. */
5133 : 9 : int min_vec_niters = 1;
5134 : 9 : if (outside_overhead > 0)
5135 : 7 : min_vec_niters = outside_overhead / saving_per_viter + 1;
5136 : :
5137 : 9 : if (dump_enabled_p ())
5138 : 2 : dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
5139 : : min_vec_niters);
5140 : :
5141 : 9 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5142 : : {
5143 : : /* Now that we know the minimum number of vector iterations,
5144 : : find the minimum niters for which the scalar cost is larger:
5145 : :
5146 : : SIC * niters > VIC * vniters + VOC - SOC
5147 : :
5148 : : We know that the minimum niters is no more than
5149 : : vniters * VF + NPEEL, but it might be (and often is) less
5150 : : than that if a partial vector iteration is cheaper than the
5151 : : equivalent scalar code. */
5152 : 9 : int threshold = (vec_inside_cost * min_vec_niters
5153 : 9 : + vec_outside_cost
5154 : 9 : - scalar_outside_cost);
5155 : 9 : if (threshold <= 0)
5156 : : min_profitable_iters = 1;
5157 : : else
5158 : 9 : min_profitable_iters = threshold / scalar_single_iter_cost + 1;
5159 : : }
5160 : : else
5161 : : /* Convert the number of vector iterations into a number of
5162 : : scalar iterations. */
5163 : 0 : min_profitable_iters = (min_vec_niters * assumed_vf
5164 : 0 : + peel_iters_prologue
5165 : : + peel_iters_epilogue);
5166 : : }
5167 : : else
5168 : : {
5169 : 53699 : min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
5170 : 53699 : * assumed_vf
5171 : 53699 : - vec_inside_cost * peel_iters_prologue
5172 : 53699 : - vec_inside_cost * peel_iters_epilogue);
5173 : 53699 : if (min_profitable_iters <= 0)
5174 : : min_profitable_iters = 0;
5175 : : else
5176 : : {
5177 : 44909 : min_profitable_iters /= saving_per_viter;
5178 : :
5179 : 44909 : if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
5180 : 44909 : <= (((int) vec_inside_cost * min_profitable_iters)
5181 : 44909 : + (((int) vec_outside_cost - scalar_outside_cost)
5182 : : * assumed_vf)))
5183 : 44909 : min_profitable_iters++;
5184 : : }
5185 : : }
5186 : :
5187 : 53708 : if (dump_enabled_p ())
5188 : 586 : dump_printf (MSG_NOTE,
5189 : : " Calculated minimum iters for profitability: %d\n",
5190 : : min_profitable_iters);
5191 : :
5192 : 53708 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
5193 : 53699 : && min_profitable_iters < (assumed_vf + peel_iters_prologue))
5194 : : /* We want the vectorized loop to execute at least once. */
5195 : : min_profitable_iters = assumed_vf + peel_iters_prologue;
5196 : 9781 : else if (min_profitable_iters < peel_iters_prologue)
5197 : : /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
5198 : : vectorized loop executes at least once. */
5199 : : min_profitable_iters = peel_iters_prologue;
5200 : :
5201 : 53708 : if (dump_enabled_p ())
5202 : 586 : dump_printf_loc (MSG_NOTE, vect_location,
5203 : : " Runtime profitability threshold = %d\n",
5204 : : min_profitable_iters);
5205 : :
5206 : 53708 : *ret_min_profitable_niters = min_profitable_iters;
5207 : :
5208 : : /* Calculate number of iterations required to make the vector version
5209 : : profitable, relative to the loop bodies only.
5210 : :
5211 : : Non-vectorized variant is SIC * niters and it must win over vector
5212 : : The non-vectorized variant costs SIC * niters and it must win over the vector
5213 : : variant on the expected loop trip count. The following condition must hold true:
5214 : :
5215 : 53708 : if (vec_outside_cost <= 0)
5216 : : min_profitable_estimate = 0;
5217 : : /* ??? This "else if" arm is written to handle all cases; see below for
5218 : : what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
5219 : 48476 : else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5220 : : {
5221 : : /* This is a repeat of the code above, but with + SOC rather
5222 : : than - SOC. */
5223 : 9 : int outside_overhead = (vec_outside_cost
5224 : 9 : - scalar_single_iter_cost * peel_iters_prologue
5225 : 9 : - scalar_single_iter_cost * peel_iters_epilogue
5226 : : + scalar_outside_cost);
5227 : 9 : int min_vec_niters = 1;
5228 : 9 : if (outside_overhead > 0)
5229 : 9 : min_vec_niters = outside_overhead / saving_per_viter + 1;
5230 : :
5231 : 9 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5232 : : {
5233 : 9 : int threshold = (vec_inside_cost * min_vec_niters
5234 : 9 : + vec_outside_cost
5235 : 9 : + scalar_outside_cost);
5236 : 9 : min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
5237 : : }
5238 : : else
5239 : : min_profitable_estimate = (min_vec_niters * assumed_vf
5240 : : + peel_iters_prologue
5241 : : + peel_iters_epilogue);
5242 : : }
5243 : : else
5244 : : {
5245 : 48467 : min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
5246 : 48467 : * assumed_vf
5247 : 48467 : - vec_inside_cost * peel_iters_prologue
5248 : 48467 : - vec_inside_cost * peel_iters_epilogue)
5249 : 48467 : / ((scalar_single_iter_cost * assumed_vf)
5250 : : - vec_inside_cost);
5251 : : }
5252 : 53708 : min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
5253 : 53708 : if (dump_enabled_p ())
5254 : 586 : dump_printf_loc (MSG_NOTE, vect_location,
5255 : : " Static estimate profitability threshold = %d\n",
5256 : : min_profitable_estimate);
5257 : :
5258 : 53708 : *ret_min_profitable_estimate = min_profitable_estimate;
5259 : : }
5260 : :
5261 : : /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
5262 : : vector elements (not bits) for a vector with NELT elements. */
5263 : : static void
5264 : 2021 : calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
5265 : : vec_perm_builder *sel)
5266 : : {
5267 : : /* The encoding is a single stepped pattern. Any wrap-around is handled
5268 : : by vec_perm_indices. */
5269 : 2021 : sel->new_vector (nelt, 1, 3);
5270 : 8084 : for (unsigned int i = 0; i < 3; i++)
5271 : 6063 : sel->quick_push (i + offset);
5272 : 2021 : }
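/* [Editorial sketch, added for illustration; not part of the measured
   source.]  For nelt = 8 and offset = 4 the builder above encodes the
   single stepped pattern {4, 5, 6}, which vec_perm_indices extends to
   the full selector {4, 5, 6, 7, 8, 9, 10, 11}.  Applied to two input
   vectors this selects the upper half of the first vector followed by
   the lower half of the second, i.e. a whole-vector shift right by four
   elements; in the reduction epilogue the elements shifted in from the
   second vector do not affect the final reduced value in element 0.  */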
5273 : :
5274 : : /* Checks whether the target supports whole-vector shifts for vectors of mode
5275 : : MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
5276 : : it supports vec_perm_const with masks for all necessary shift amounts. */
5277 : : static bool
5278 : 6883 : have_whole_vector_shift (machine_mode mode)
5279 : : {
5280 : 6883 : if (can_implement_p (vec_shr_optab, mode))
5281 : : return true;
5282 : :
5283 : : /* Variable-length vectors should be handled via the optab. */
5284 : 55 : unsigned int nelt;
5285 : 110 : if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5286 : : return false;
5287 : :
5288 : 55 : vec_perm_builder sel;
5289 : 55 : vec_perm_indices indices;
5290 : 285 : for (unsigned int i = nelt / 2; i >= 1; i /= 2)
5291 : : {
5292 : 230 : calc_vec_perm_mask_for_shift (i, nelt, &sel);
5293 : 230 : indices.new_vector (sel, 2, nelt);
5294 : 230 : if (!can_vec_perm_const_p (mode, mode, indices, false))
5295 : : return false;
5296 : : }
5297 : : return true;
5298 : 55 : }
5299 : :
5300 : : /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
5301 : : multiplication operands have differing signs and (b) we intend
5302 : : to emulate the operation using a series of signed DOT_PROD_EXPRs.
5303 : : See vect_emulate_mixed_dot_prod for the actual sequence used. */
5304 : :
5305 : : static bool
5306 : 1799 : vect_is_emulated_mixed_dot_prod (stmt_vec_info stmt_info)
5307 : : {
5308 : 1799 : gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
5309 : 1531 : if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
5310 : : return false;
5311 : :
5312 : 462 : tree rhs1 = gimple_assign_rhs1 (assign);
5313 : 462 : tree rhs2 = gimple_assign_rhs2 (assign);
5314 : 462 : if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
5315 : : return false;
5316 : :
5317 : 99 : gcc_assert (STMT_VINFO_REDUC_VECTYPE_IN (stmt_info));
5318 : 99 : return !directly_supported_p (DOT_PROD_EXPR,
5319 : : STMT_VINFO_VECTYPE (stmt_info),
5320 : : STMT_VINFO_REDUC_VECTYPE_IN (stmt_info),
5321 : 99 : optab_vector_mixed_sign);
5322 : : }
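/* [Editorial sketch, added for illustration; not part of the measured
   source.]  The mixed-sign case corresponds to source like

     signed char s[N];
     unsigned char u[N];
     int res = 0;
     for (i = 0; i < N; i++)
       res += s[i] * u[i];

   which pattern recognition turns into a DOT_PROD_EXPR whose two
   multiplication operands differ in signedness.  If the target has no
   mixed-sign dot-product support, the operation is emulated with signed
   DOT_PROD_EXPRs; see vect_emulate_mixed_dot_prod.  */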
5323 : :
5324 : : /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
5325 : : functions. Design better to avoid maintenance issues. */
5326 : :
5327 : : /* Function vect_model_reduction_cost.
5328 : :
5329 : : Models cost for a reduction operation, including the vector ops
5330 : : generated within the strip-mine loop in some cases, the initial
5331 : : definition before the loop, and the epilogue code that must be generated. */
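/* [Editorial sketch, added for illustration; not part of the measured
   source.]  For a plain sum reduction on a target with a suitable
   reduction IFN, the costs recorded below amount to roughly one
   scalar_to_vec in the prologue (broadcasting the initial value),
   nothing extra in the loop body here (the body operation is costed
   elsewhere), and one vector_stmt plus one vec_to_scalar in the epilogue
   (reducing the final vector and extracting the scalar result).
   COND_REDUCTIONs add further prologue vectors and epilogue statements,
   as the branches below show.  */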
5332 : :
5333 : : static void
5334 : 43675 : vect_model_reduction_cost (loop_vec_info loop_vinfo,
5335 : : stmt_vec_info stmt_info, internal_fn reduc_fn,
5336 : : vect_reduction_type reduction_type,
5337 : : int ncopies, stmt_vector_for_cost *cost_vec)
5338 : : {
5339 : 43675 : int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
5340 : 43675 : tree vectype;
5341 : 43675 : machine_mode mode;
5342 : 43675 : class loop *loop = NULL;
5343 : :
5344 : 43675 : if (loop_vinfo)
5345 : 43675 : loop = LOOP_VINFO_LOOP (loop_vinfo);
5346 : :
5347 : : /* Condition reductions generate two reductions in the loop. */
5348 : 43675 : if (reduction_type == COND_REDUCTION)
5349 : 193 : ncopies *= 2;
5350 : :
5351 : 43675 : vectype = STMT_VINFO_VECTYPE (stmt_info);
5352 : 43675 : mode = TYPE_MODE (vectype);
5353 : 43675 : stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5354 : :
5355 : 43675 : gimple_match_op op;
5356 : 43675 : if (!gimple_extract_op (orig_stmt_info->stmt, &op))
5357 : 0 : gcc_unreachable ();
5358 : :
5359 : 43675 : if (reduction_type == EXTRACT_LAST_REDUCTION)
5360 : : /* No extra instructions are needed in the prologue. The loop body
5361 : : operations are costed in vectorizable_condition. */
5362 : : inside_cost = 0;
5363 : 43675 : else if (reduction_type == FOLD_LEFT_REDUCTION)
5364 : : {
5365 : : /* No extra instructions needed in the prologue. */
5366 : 4141 : prologue_cost = 0;
5367 : :
5368 : 4141 : if (reduc_fn != IFN_LAST)
5369 : : /* Count one reduction-like operation per vector. */
5370 : 0 : inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
5371 : : stmt_info, 0, vect_body);
5372 : : else
5373 : : {
5374 : : /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
5375 : 4141 : unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
5376 : 4141 : inside_cost = record_stmt_cost (cost_vec, nelements,
5377 : : vec_to_scalar, stmt_info, 0,
5378 : : vect_body);
5379 : 4141 : inside_cost += record_stmt_cost (cost_vec, nelements,
5380 : : scalar_stmt, stmt_info, 0,
5381 : : vect_body);
5382 : : }
5383 : : }
5384 : : else
5385 : : {
5386 : : /* Add in the cost of the initial definitions. */
5387 : 39534 : int prologue_stmts;
5388 : 39534 : if (reduction_type == COND_REDUCTION)
5389 : : /* For cond reductions we have four vectors: initial index, step,
5390 : : initial result of the data reduction, initial value of the index
5391 : : reduction. */
5392 : : prologue_stmts = 4;
5393 : : else
5394 : : /* We need the initial reduction value. */
5395 : 39341 : prologue_stmts = 1;
5396 : 39534 : prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
5397 : : scalar_to_vec, stmt_info, 0,
5398 : : vect_prologue);
5399 : : }
5400 : :
5401 : : /* Determine cost of epilogue code.
5402 : :
5403 : : We have a reduction operator that will reduce the vector in one statement.
5404 : : Also requires scalar extract. */
5405 : :
5406 : 43675 : if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
5407 : : {
5408 : 43526 : if (reduc_fn != IFN_LAST)
5409 : : {
5410 : 30910 : if (reduction_type == COND_REDUCTION)
5411 : : {
5412 : : /* An EQ stmt and an COND_EXPR stmt. */
5413 : 7 : epilogue_cost += record_stmt_cost (cost_vec, 2,
5414 : : vector_stmt, stmt_info, 0,
5415 : : vect_epilogue);
5416 : : /* Reduction of the max index and a reduction of the found
5417 : : values. */
5418 : 7 : epilogue_cost += record_stmt_cost (cost_vec, 2,
5419 : : vec_to_scalar, stmt_info, 0,
5420 : : vect_epilogue);
5421 : : /* A broadcast of the max value. */
5422 : 7 : epilogue_cost += record_stmt_cost (cost_vec, 1,
5423 : : scalar_to_vec, stmt_info, 0,
5424 : : vect_epilogue);
5425 : : }
5426 : : else
5427 : : {
5428 : 30903 : epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
5429 : : stmt_info, 0, vect_epilogue);
5430 : 30903 : epilogue_cost += record_stmt_cost (cost_vec, 1,
5431 : : vec_to_scalar, stmt_info, 0,
5432 : : vect_epilogue);
5433 : : }
5434 : : }
5435 : 12616 : else if (reduction_type == COND_REDUCTION)
5436 : : {
5437 : 186 : unsigned estimated_nunits = vect_nunits_for_cost (vectype);
5438 : : /* Extraction of scalar elements. */
5439 : 372 : epilogue_cost += record_stmt_cost (cost_vec,
5440 : 186 : 2 * estimated_nunits,
5441 : : vec_to_scalar, stmt_info, 0,
5442 : : vect_epilogue);
5443 : : /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
5444 : 186 : epilogue_cost += record_stmt_cost (cost_vec,
5445 : 186 : 2 * estimated_nunits - 3,
5446 : : scalar_stmt, stmt_info, 0,
5447 : : vect_epilogue);
5448 : : }
5449 : 12430 : else if (reduction_type == EXTRACT_LAST_REDUCTION
5450 : 12430 : || reduction_type == FOLD_LEFT_REDUCTION)
5451 : : /* No extra instructions are needed in the epilogue. */
5452 : : ;
5453 : : else
5454 : : {
5455 : 8289 : int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5456 : 8289 : tree bitsize = TYPE_SIZE (op.type);
5457 : 8289 : int element_bitsize = tree_to_uhwi (bitsize);
5458 : 8289 : int nelements = vec_size_in_bits / element_bitsize;
5459 : :
5460 : 8289 : if (op.code == COND_EXPR)
5461 : 28 : op.code = MAX_EXPR;
5462 : :
5463 : : /* We have a whole vector shift available. */
5464 : 763 : if (VECTOR_MODE_P (mode)
5465 : 8289 : && directly_supported_p (op.code, vectype)
5466 : 13456 : && have_whole_vector_shift (mode))
5467 : : {
5468 : : /* Final reduction via vector shifts and the reduction operator.
5469 : : Also requires scalar extract. */
5470 : 15501 : epilogue_cost += record_stmt_cost (cost_vec,
5471 : 10334 : exact_log2 (nelements) * 2,
5472 : : vector_stmt, stmt_info, 0,
5473 : : vect_epilogue);
5474 : 5167 : epilogue_cost += record_stmt_cost (cost_vec, 1,
5475 : : vec_to_scalar, stmt_info, 0,
5476 : : vect_epilogue);
5477 : : }
5478 : : else
5479 : : /* Use extracts and reduction op for final reduction. For N
5480 : : elements, we have N extracts and N-1 reduction ops. */
5481 : 3122 : epilogue_cost += record_stmt_cost (cost_vec,
5482 : 3122 : nelements + nelements - 1,
5483 : : vector_stmt, stmt_info, 0,
5484 : : vect_epilogue);
5485 : : }
5486 : : }
5487 : :
5488 : 43675 : if (dump_enabled_p ())
5489 : 2429 : dump_printf (MSG_NOTE,
5490 : : "vect_model_reduction_cost: inside_cost = %d, "
5491 : : "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
5492 : : prologue_cost, epilogue_cost);
5493 : 43675 : }
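
The shift-based epilogue scheme costed above can be pictured with a small standalone sketch (illustrative C only, not GCC internals): each halving step issues one whole-vector shift plus one application of the reduction operator, so N lanes need exact_log2 (N) * 2 vector statements followed by a single scalar extract.

  #include <stdio.h>

  /* Hypothetical 8-lane integer "vector" reduced the way the cost model
     above assumes: log2 (8) = 3 shift+add pairs, then one lane extract.  */
  static int
  reduce_by_shifts (int v[8])
  {
    for (int offset = 4; offset >= 1; offset /= 2)     /* 3 halving steps */
      for (int i = 0; i < 8; i++)
        {
          int shifted = i + offset < 8 ? v[i + offset] : 0; /* vector shift */
          v[i] += shifted;                                  /* reduction op */
        }
    return v[0];                                            /* scalar extract */
  }

  int
  main (void)
  {
    int v[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
    printf ("%d\n", reduce_by_shifts (v));      /* prints 36 */
    return 0;
  }
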
5494 : :
5495 : : /* SEQ is a sequence of instructions that initialize the reduction
5496 : : described by REDUC_INFO. Emit them in the appropriate place. */
5497 : :
5498 : : static void
5499 : 397 : vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
5500 : : stmt_vec_info reduc_info, gimple *seq)
5501 : : {
5502 : 397 : if (reduc_info->reused_accumulator)
5503 : : {
5504 : : /* When reusing an accumulator from the main loop, we only need
5505 : : initialization instructions if the main loop can be skipped.
5506 : : In that case, emit the initialization instructions at the end
5507 : : of the guard block that does the skip. */
5508 : 21 : edge skip_edge = loop_vinfo->skip_main_loop_edge;
5509 : 21 : gcc_assert (skip_edge);
5510 : 21 : gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
5511 : 21 : gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5512 : : }
5513 : : else
5514 : : {
5515 : : /* The normal case: emit the initialization instructions on the
5516 : : preheader edge. */
5517 : 376 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5518 : 376 : gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
5519 : : }
5520 : 397 : }
5521 : :
5522 : : /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
5523 : : which performs a reduction involving GROUP_SIZE scalar statements.
5524 : : NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
5525 : : is nonnull, introducing extra elements of that value will not change the
5526 : : result. */
5527 : :
5528 : : static void
5529 : 20877 : get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
5530 : : stmt_vec_info reduc_info,
5531 : : vec<tree> *vec_oprnds,
5532 : : unsigned int number_of_vectors,
5533 : : unsigned int group_size, tree neutral_op)
5534 : : {
5535 : 20877 : vec<tree> &initial_values = reduc_info->reduc_initial_values;
5536 : 20877 : unsigned HOST_WIDE_INT nunits;
5537 : 20877 : unsigned j, number_of_places_left_in_vector;
5538 : 20877 : tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
5539 : 20877 : unsigned int i;
5540 : :
5541 : 41754 : gcc_assert (group_size == initial_values.length () || neutral_op);
5542 : :
5543 : : /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5544 : : created vectors. It is greater than 1 if unrolling is performed.
5545 : :
5546 : : For example, we have two scalar operands, s1 and s2 (e.g., group of
5547 : : strided accesses of size two), while NUNITS is four (i.e., four scalars
5548 : : of this type can be packed in a vector). The output vector will contain
5549 : : two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
5550 : : will be 2).
5551 : :
5552 : : If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
5553 : : vectors containing the operands.
5554 : :
5555 : : For example, NUNITS is four as before, and the group size is 8
5556 : : (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
5557 : : {s5, s6, s7, s8}. */
5558 : :
5559 : 20877 : if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5560 : : nunits = group_size;
5561 : :
5562 : 20877 : number_of_places_left_in_vector = nunits;
5563 : 20877 : bool constant_p = true;
5564 : 20877 : tree_vector_builder elts (vector_type, nunits, 1);
5565 : 20877 : elts.quick_grow (nunits);
5566 : 20877 : gimple_seq ctor_seq = NULL;
5567 : 20877 : if (neutral_op
5568 : 41667 : && !useless_type_conversion_p (TREE_TYPE (vector_type),
5569 : 20790 : TREE_TYPE (neutral_op)))
5570 : 1 : neutral_op = gimple_convert (&ctor_seq,
5571 : 1 : TREE_TYPE (vector_type),
5572 : : neutral_op);
5573 : 196277 : for (j = 0; j < nunits * number_of_vectors; ++j)
5574 : : {
5575 : 175400 : tree op;
5576 : 175400 : i = j % group_size;
5577 : :
5578 : :       /* Get the def before the loop.  In a reduction chain we have only one
5579 : :          initial value; otherwise we have as many initial values as PHIs.  */
5580 : 175400 : if (i >= initial_values.length () || (j > i && neutral_op))
5581 : : op = neutral_op;
5582 : : else
5583 : : {
5584 : 43094 : if (!useless_type_conversion_p (TREE_TYPE (vector_type),
5585 : 21547 : TREE_TYPE (initial_values[i])))
5586 : 6 : initial_values[i] = gimple_convert (&ctor_seq,
5587 : 3 : TREE_TYPE (vector_type),
5588 : 3 : initial_values[i]);
5589 : 21547 : op = initial_values[i];
5590 : : }
5591 : :
5592 : : /* Create 'vect_ = {op0,op1,...,opn}'. */
5593 : 175400 : number_of_places_left_in_vector--;
5594 : 175400 : elts[nunits - number_of_places_left_in_vector - 1] = op;
5595 : 175400 : if (!CONSTANT_CLASS_P (op))
5596 : 2199 : constant_p = false;
5597 : :
5598 : 175400 : if (number_of_places_left_in_vector == 0)
5599 : : {
5600 : 21429 : tree init;
5601 : 42858 : if (constant_p && !neutral_op
5602 : 42807 : ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5603 : 21429 : : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5604 : : /* Build the vector directly from ELTS. */
5605 : 21429 : init = gimple_build_vector (&ctor_seq, &elts);
5606 : 0 : else if (neutral_op)
5607 : : {
5608 : : /* Build a vector of the neutral value and shift the
5609 : : other elements into place. */
5610 : 0 : init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5611 : : neutral_op);
5612 : 0 : int k = nunits;
5613 : 0 : while (k > 0 && operand_equal_p (elts[k - 1], neutral_op))
5614 : : k -= 1;
5615 : 0 : while (k > 0)
5616 : : {
5617 : 0 : k -= 1;
5618 : 0 : init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5619 : 0 : vector_type, init, elts[k]);
5620 : : }
5621 : : }
5622 : : else
5623 : : {
5624 : : /* First time round, duplicate ELTS to fill the
5625 : : required number of vectors. */
5626 : 0 : duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5627 : : elts, number_of_vectors, *vec_oprnds);
5628 : 0 : break;
5629 : : }
5630 : 21429 : vec_oprnds->quick_push (init);
5631 : :
5632 : 21429 : number_of_places_left_in_vector = nunits;
5633 : 21429 : elts.new_vector (vector_type, nunits, 1);
5634 : 21429 : elts.quick_grow (nunits);
5635 : 21429 : constant_p = true;
5636 : : }
5637 : : }
5638 : 20877 : if (ctor_seq != NULL)
5639 : 397 : vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
5640 : 20877 : }
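
As a hedged illustration of the packing described in the comment above (standalone C++, not the tree-building code itself): with a group of two scalars and four lanes per vector the initial values repeat as {s1, s2, s1, s2}, while when a neutral value is available every lane past the first copy of each scalar can take the neutral element instead.

  #include <vector>

  /* Sketch: distribute GROUP_SIZE initial scalars over NUMBER_OF_VECTORS
     vectors of NUNITS lanes, optionally padding with a neutral value, the
     same way the lane-filling loop above does.  */
  static std::vector<std::vector<int>>
  build_initial_defs (const std::vector<int> &init, unsigned nunits,
                      unsigned number_of_vectors, const int *neutral)
  {
    std::vector<std::vector<int>> result;
    std::vector<int> cur;
    unsigned group_size = init.size ();
    for (unsigned j = 0; j < nunits * number_of_vectors; ++j)
      {
        unsigned i = j % group_size;
        /* Lanes beyond the first occurrence of each value (j > i) may use
           the neutral element when one exists.  */
        cur.push_back (neutral && j > i ? *neutral : init[i]);
        if (cur.size () == nunits)
          {
            result.push_back (cur);
            cur.clear ();
          }
      }
    return result;
  }
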
5641 : :
5642 : : /* For a statement STMT_INFO taking part in a reduction operation return
5643 : : the stmt_vec_info the meta information is stored on. */
5644 : :
5645 : : stmt_vec_info
5646 : 124379 : info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
5647 : : {
5648 : 124379 : stmt_info = vect_orig_stmt (stmt_info);
5649 : 124379 : gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
5650 : 124379 : if (!is_a <gphi *> (stmt_info->stmt)
5651 : 124379 : || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
5652 : 53284 : stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5653 : 124379 : gphi *phi = as_a <gphi *> (stmt_info->stmt);
5654 : 124379 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5655 : : {
5656 : 580 : if (gimple_phi_num_args (phi) == 1)
5657 : 241 : stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5658 : : }
5659 : 123799 : else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5660 : : {
5661 : 2586 : stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
5662 : 2586 : if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
5663 : 124379 : stmt_info = info;
5664 : : }
5665 : 124379 : return stmt_info;
5666 : : }
5667 : :
5668 : : /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5669 : : REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
5670 : : return false. */
5671 : :
5672 : : static bool
5673 : 20885 : vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5674 : : stmt_vec_info reduc_info)
5675 : : {
5676 : 20885 : loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5677 : 20885 : if (!main_loop_vinfo)
5678 : : return false;
5679 : :
5680 : 4723 : if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5681 : : return false;
5682 : :
5683 : 4705 : unsigned int num_phis = reduc_info->reduc_initial_values.length ();
5684 : 4705 : auto_vec<tree, 16> main_loop_results (num_phis);
5685 : 4705 : auto_vec<tree, 16> initial_values (num_phis);
5686 : 4705 : if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5687 : : {
5688 : : /* The epilogue loop can be entered either from the main loop or
5689 : : from an earlier guard block. */
5690 : 4524 : edge skip_edge = loop_vinfo->skip_main_loop_edge;
5691 : 18116 : for (tree incoming_value : reduc_info->reduc_initial_values)
5692 : : {
5693 : : /* Look for:
5694 : :
5695 : : INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5696 : : INITIAL_VALUE(guard block)>. */
5697 : 4544 : gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5698 : :
5699 : 4544 : gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5700 : 4544 : gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5701 : :
5702 : 4544 : tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5703 : 4544 : tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5704 : :
5705 : 4544 : main_loop_results.quick_push (from_main_loop);
5706 : 4544 : initial_values.quick_push (from_skip);
5707 : : }
5708 : : }
5709 : : else
5710 : : /* The main loop dominates the epilogue loop. */
5711 : 181 : main_loop_results.splice (reduc_info->reduc_initial_values);
5712 : :
5713 : : /* See if the main loop has the kind of accumulator we need. */
5714 : 4705 : vect_reusable_accumulator *accumulator
5715 : 4705 : = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5716 : 4705 : if (!accumulator
5717 : 9396 : || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
5718 : 14101 : || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5719 : : accumulator->reduc_info->reduc_scalar_results.begin ()))
5720 : : return false;
5721 : :
5722 : : /* Handle the case where we can reduce wider vectors to narrower ones. */
5723 : 4692 : tree vectype = STMT_VINFO_VECTYPE (reduc_info);
5724 : 4692 : tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5725 : 4692 : unsigned HOST_WIDE_INT m;
5726 : 4692 : if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5727 : 4692 : TYPE_VECTOR_SUBPARTS (vectype), &m))
5728 : 0 : return false;
5729 : : /* Check the intermediate vector types and operations are available. */
5730 : 4692 : tree prev_vectype = old_vectype;
5731 : 4692 : poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5732 : 13556 : while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5733 : : {
5734 : 4692 : intermediate_nunits = exact_div (intermediate_nunits, 2);
5735 : 4692 : tree intermediate_vectype = get_related_vectype_for_scalar_type
5736 : 4692 : (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5737 : 4692 : if (!intermediate_vectype
5738 : 4692 : || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
5739 : : intermediate_vectype)
5740 : 8864 : || !can_vec_extract (TYPE_MODE (prev_vectype),
5741 : 4172 : TYPE_MODE (intermediate_vectype)))
5742 : : return false;
5743 : : prev_vectype = intermediate_vectype;
5744 : : }
5745 : :
5746 : : /* Non-SLP reductions might apply an adjustment after the reduction
5747 : : operation, in order to simplify the initialization of the accumulator.
5748 : : If the epilogue loop carries on from where the main loop left off,
5749 : : it should apply the same adjustment to the final reduction result.
5750 : :
5751 : : If the epilogue loop can also be entered directly (rather than via
5752 : : the main loop), we need to be able to handle that case in the same way,
5753 : : with the same adjustment. (In principle we could add a PHI node
5754 : : to select the correct adjustment, but in practice that shouldn't be
5755 : : necessary.) */
5756 : 4172 : tree main_adjustment
5757 : 4172 : = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5758 : 4172 : if (loop_vinfo->main_loop_edge && main_adjustment)
5759 : : {
5760 : 3604 : gcc_assert (num_phis == 1);
5761 : 3604 : tree initial_value = initial_values[0];
5762 : : /* Check that we can use INITIAL_VALUE as the adjustment and
5763 : : initialize the accumulator with a neutral value instead. */
5764 : 3604 : if (!operand_equal_p (initial_value, main_adjustment))
5765 : 106 : return false;
5766 : 3498 : code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5767 : 3498 : initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5768 : : code, initial_value);
5769 : : }
5770 : 4066 : STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5771 : 4066 : reduc_info->reduc_initial_values.truncate (0);
5772 : 4066 : reduc_info->reduc_initial_values.splice (initial_values);
5773 : 4066 : reduc_info->reused_accumulator = accumulator;
5774 : 4066 : return true;
5775 : 4705 : }
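
In scalar terms the reuse set up above looks roughly like the following sketch (an assumed illustration, not generated code): the epilogue continues the reduction from the main loop's partial result, starts from the neutral value rather than the user's initial value, and the initial value is applied exactly once as the final adjustment.

  /* Hypothetical picture of accumulator reuse for "sum = init; sum += a[i]":
     the main loop handles the first N / VF * VF elements, the epilogue loop
     carries on from the same accumulator, and INIT (the recorded epilogue
     adjustment) is added only at the end.  */
  static int
  sum_with_reused_accumulator (const int *a, int n, int vf, int init)
  {
    int acc = 0;                          /* neutral value, not INIT */
    int main_iters = n / vf * vf;
    for (int i = 0; i < main_iters; i++)  /* "main loop" */
      acc += a[i];
    for (int i = main_iters; i < n; i++)  /* "epilogue loop" reuses ACC */
      acc += a[i];
    return acc + init;                    /* adjustment applied once */
  }
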
5776 : :
5777 : : /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5778 : : CODE emitting stmts before GSI. Returns a vector def of VECTYPE. */
5779 : :
5780 : : static tree
5781 : 5780 : vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5782 : : gimple_seq *seq)
5783 : : {
5784 : 5780 : unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5785 : 5780 : unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5786 : 5780 : tree stype = TREE_TYPE (vectype);
5787 : 5780 : tree new_temp = vec_def;
5788 : 9883 : while (nunits > nunits1)
5789 : : {
5790 : 4103 : nunits /= 2;
5791 : 4103 : tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5792 : 4103 : stype, nunits);
5793 : 4103 : unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5794 : :
5795 : : /* The target has to make sure we support lowpart/highpart
5796 : : extraction, either via direct vector extract or through
5797 : :          integer mode punning.  */
5798 : 4103 : tree dst1, dst2;
5799 : 4103 : gimple *epilog_stmt;
5800 : 4103 : if (convert_optab_handler (vec_extract_optab,
5801 : 4103 : TYPE_MODE (TREE_TYPE (new_temp)),
5802 : 4103 : TYPE_MODE (vectype1))
5803 : : != CODE_FOR_nothing)
5804 : : {
5805 : : /* Extract sub-vectors directly once vec_extract becomes
5806 : : a conversion optab. */
5807 : 2670 : dst1 = make_ssa_name (vectype1);
5808 : 2670 : epilog_stmt
5809 : 5340 : = gimple_build_assign (dst1, BIT_FIELD_REF,
5810 : : build3 (BIT_FIELD_REF, vectype1,
5811 : 2670 : new_temp, TYPE_SIZE (vectype1),
5812 : : bitsize_int (0)));
5813 : 2670 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5814 : 2670 : dst2 = make_ssa_name (vectype1);
5815 : 2670 : epilog_stmt
5816 : 2670 : = gimple_build_assign (dst2, BIT_FIELD_REF,
5817 : : build3 (BIT_FIELD_REF, vectype1,
5818 : 2670 : new_temp, TYPE_SIZE (vectype1),
5819 : 2670 : bitsize_int (bitsize)));
5820 : 2670 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5821 : : }
5822 : : else
5823 : : {
5824 : :           /* Extract via punning to an appropriately sized integer mode
5825 : : vector. */
5826 : 1433 : tree eltype = build_nonstandard_integer_type (bitsize, 1);
5827 : 1433 : tree etype = build_vector_type (eltype, 2);
5828 : 2866 : gcc_assert (convert_optab_handler (vec_extract_optab,
5829 : : TYPE_MODE (etype),
5830 : : TYPE_MODE (eltype))
5831 : : != CODE_FOR_nothing);
5832 : 1433 : tree tem = make_ssa_name (etype);
5833 : 1433 : epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5834 : : build1 (VIEW_CONVERT_EXPR,
5835 : : etype, new_temp));
5836 : 1433 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5837 : 1433 : new_temp = tem;
5838 : 1433 : tem = make_ssa_name (eltype);
5839 : 1433 : epilog_stmt
5840 : 2866 : = gimple_build_assign (tem, BIT_FIELD_REF,
5841 : : build3 (BIT_FIELD_REF, eltype,
5842 : 1433 : new_temp, TYPE_SIZE (eltype),
5843 : : bitsize_int (0)));
5844 : 1433 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5845 : 1433 : dst1 = make_ssa_name (vectype1);
5846 : 1433 : epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5847 : : build1 (VIEW_CONVERT_EXPR,
5848 : : vectype1, tem));
5849 : 1433 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5850 : 1433 : tem = make_ssa_name (eltype);
5851 : 1433 : epilog_stmt
5852 : 1433 : = gimple_build_assign (tem, BIT_FIELD_REF,
5853 : : build3 (BIT_FIELD_REF, eltype,
5854 : 1433 : new_temp, TYPE_SIZE (eltype),
5855 : 1433 : bitsize_int (bitsize)));
5856 : 1433 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5857 : 1433 : dst2 = make_ssa_name (vectype1);
5858 : 1433 : epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5859 : : build1 (VIEW_CONVERT_EXPR,
5860 : : vectype1, tem));
5861 : 1433 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5862 : : }
5863 : :
5864 : 4103 : new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5865 : : }
5866 : :
5867 : 5780 : return new_temp;
5868 : : }
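
A minimal array-based sketch (an assumption for illustration, not GIMPLE) of what the halving loop above produces for a PLUS reduction: repeatedly split the current vector into low and high halves and combine them with the reduction code until the requested narrower width is reached.

  #include <vector>

  /* Reduce a vector of lanes down to NUNITS1 lanes by combining lowpart and
     highpart element-wise, mirroring vect_create_partial_epilog for addition.
     Assumes both widths are powers of two.  */
  static std::vector<int>
  partial_epilog (std::vector<int> v, unsigned nunits1)
  {
    while (v.size () > nunits1)
      {
        unsigned half = v.size () / 2;
        std::vector<int> lo (v.begin (), v.begin () + half);  /* lowpart */
        std::vector<int> hi (v.begin () + half, v.end ());    /* highpart */
        for (unsigned i = 0; i < half; i++)
          lo[i] += hi[i];                                     /* reduc op */
        v = lo;
      }
    return v;
  }
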
5869 : :
5870 : : /* Function vect_create_epilog_for_reduction
5871 : :
5872 : : Create code at the loop-epilog to finalize the result of a reduction
5873 : : computation.
5874 : :
5875 : : STMT_INFO is the scalar reduction stmt that is being vectorized.
5876 : : SLP_NODE is an SLP node containing a group of reduction statements. The
5877 : : first one in this group is STMT_INFO.
5878 : : SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5879 : : REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5880 : : (counting from 0)
5881 : : LOOP_EXIT is the edge to update in the merge block. In the case of a single
5882 : : exit this edge is always the main loop exit.
5883 : :
5884 : : This function:
5885 : : 1. Completes the reduction def-use cycles.
5886 : : 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5887 : : by calling the function specified by REDUC_FN if available, or by
5888 : : other means (whole-vector shifts or a scalar loop).
5889 : : The function also creates a new phi node at the loop exit to preserve
5890 : : loop-closed form, as illustrated below.
5891 : :
5892 : : The flow at the entry to this function:
5893 : :
5894 : : loop:
5895 : : vec_def = phi <vec_init, null> # REDUCTION_PHI
5896 : : VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5897 : : s_loop = scalar_stmt # (scalar) STMT_INFO
5898 : : loop_exit:
5899 : : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5900 : : use <s_out0>
5901 : : use <s_out0>
5902 : :
5903 : : The above is transformed by this function into:
5904 : :
5905 : : loop:
5906 : : vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5907 : : VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5908 : : s_loop = scalar_stmt # (scalar) STMT_INFO
5909 : : loop_exit:
5910 : : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5911 : : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5912 : : v_out2 = reduce <v_out1>
5913 : : s_out3 = extract_field <v_out2, 0>
5914 : : s_out4 = adjust_result <s_out3>
5915 : : use <s_out4>
5916 : : use <s_out4>
5917 : : */
5918 : :
5919 : : static void
5920 : 21196 : vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5921 : : stmt_vec_info stmt_info,
5922 : : slp_tree slp_node,
5923 : : slp_instance slp_node_instance,
5924 : : edge loop_exit)
5925 : : {
5926 : 21196 : stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5927 : 21196 : gcc_assert (reduc_info->is_reduc_info);
5928 : : /* For double reductions we need to get at the inner loop reduction
5929 : : stmt which has the meta info attached. Our stmt_info is that of the
5930 : : loop-closed PHI of the inner loop which we remember as
5931 : : def for the reduction PHI generation. */
5932 : 21196 : bool double_reduc = false;
5933 : 21196 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5934 : : {
5935 : 66 : double_reduc = true;
5936 : 66 : stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5937 : 66 : (stmt_info->stmt, 0));
5938 : 66 : stmt_info = vect_stmt_to_vectorize (stmt_info);
5939 : : }
5940 : 21196 : code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5941 : 21196 : internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5942 : 21196 : tree vectype;
5943 : 21196 : machine_mode mode;
5944 : 21196 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5945 : 21196 : basic_block exit_bb;
5946 : 21196 : tree scalar_dest;
5947 : 21196 : tree scalar_type;
5948 : 21196 : gimple *new_phi = NULL, *phi = NULL;
5949 : 21196 : gimple_stmt_iterator exit_gsi;
5950 : 21196 : tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5951 : 21196 : gimple *epilog_stmt = NULL;
5952 : 21196 : gimple *exit_phi;
5953 : 21196 : tree bitsize;
5954 : 21196 : tree def;
5955 : 21196 : tree orig_name, scalar_result;
5956 : 21196 : imm_use_iterator imm_iter, phi_imm_iter;
5957 : 21196 : use_operand_p use_p, phi_use_p;
5958 : 21196 : gimple *use_stmt;
5959 : 21196 : auto_vec<tree> reduc_inputs;
5960 : 21196 : int j, i;
5961 : 21196 : vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5962 : 21196 : unsigned int k;
5963 : : /* SLP reduction without reduction chain, e.g.,
5964 : : # a1 = phi <a2, a0>
5965 : : # b1 = phi <b2, b0>
5966 : : a2 = operation (a1)
5967 : : b2 = operation (b1) */
5968 : 21196 : bool slp_reduc
5969 : 21196 : = !REDUC_GROUP_FIRST_ELEMENT (STMT_VINFO_REDUC_DEF (reduc_info));
5970 : 21196 : bool direct_slp_reduc;
5971 : 21196 : tree induction_index = NULL_TREE;
5972 : :
5973 : 21196 : unsigned int group_size = SLP_TREE_LANES (slp_node);
5974 : :
5975 : 21196 : if (nested_in_vect_loop_p (loop, stmt_info))
5976 : : {
5977 : 66 : outer_loop = loop;
5978 : 66 : loop = loop->inner;
5979 : 66 : gcc_assert (double_reduc);
5980 : : }
5981 : :
5982 : 21196 : vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5983 : 21196 : gcc_assert (vectype);
5984 : 21196 : mode = TYPE_MODE (vectype);
5985 : :
5986 : 21196 : tree induc_val = NULL_TREE;
5987 : 21196 : tree adjustment_def = NULL;
5988 : : /* Optimize: for induction condition reduction, if we can't use zero
5989 : : for induc_val, use initial_def. */
5990 : 21196 : if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5991 : 66 : induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5992 : 21130 : else if (double_reduc)
5993 : : ;
5994 : : else
5995 : 21064 : adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5996 : :
5997 : 21196 : stmt_vec_info single_live_out_stmt[] = { stmt_info };
5998 : 21196 : array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5999 : 21196 : if (slp_reduc)
6000 : : /* All statements produce live-out values. */
6001 : 42058 : live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
6002 : :
6003 : 21196 : unsigned vec_num
6004 : 21196 : = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length ();
6005 : :
6006 : : /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
6007 : : which is updated with the current index of the loop for every match of
6008 : : the original loop's cond_expr (VEC_STMT). This results in a vector
6009 : : containing the last time the condition passed for that vector lane.
6010 : : The first match will be a 1 to allow 0 to be used for non-matching
6011 : : indexes. If there are no matches at all then the vector will be all
6012 : : zeroes.
6013 : :
6014 : : PR92772: This algorithm is broken for architectures that support
6015 : : masked vectors, but do not provide fold_extract_last. */
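
A scalar model of the index vector described above may help (hedged illustration; the four-lane width and the comparison are made up): each lane remembers the value of the {1,2,3,...} induction variable at the last iteration whose condition matched, and stays zero if that lane never matched.

  /* Track, per lane, the 1-based position of the last match of the loop's
     cond_expr, starting from an all-zero vector as in the code below.
     Assumes N is a multiple of four for brevity.  */
  static void
  cond_reduction_index (const int *a, int n, int threshold, int last_match[4])
  {
    for (int j = 0; j < 4; j++)
      last_match[j] = 0;                 /* initial all-zero PHI value */
    for (int i = 0; i < n; i += 4)       /* one vector per iteration */
      for (int j = 0; j < 4; j++)
        if (a[i + j] > threshold)        /* the original loop's condition */
          last_match[j] = i + j + 1;     /* current value of the IV lane */
  }
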
6016 : 21196 : if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
6017 : : {
6018 : 73 : auto_vec<std::pair<tree, bool>, 2> ccompares;
6019 : 73 : slp_tree cond_node = slp_node_instance->root;
6020 : 167 : while (cond_node != slp_node_instance->reduc_phis)
6021 : : {
6022 : 94 : stmt_vec_info cond_info = SLP_TREE_REPRESENTATIVE (cond_node);
6023 : 94 : int slp_reduc_idx;
6024 : 94 : if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
6025 : : {
6026 : 82 : gimple *vec_stmt
6027 : 82 : = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (cond_node)[0]);
6028 : 82 : gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
6029 : 82 : ccompares.safe_push
6030 : 82 : (std::make_pair (gimple_assign_rhs1 (vec_stmt),
6031 : 82 : STMT_VINFO_REDUC_IDX (cond_info) == 2));
6032 : : /* ??? We probably want to have REDUC_IDX on the SLP node?
6033 : :                 COND_EXPR nodes have either three or four children,
6034 : :                 depending on whether the comparison is still embedded
6035 : :                 as GENERIC.  So work backwards.  */
6036 : 82 : slp_reduc_idx = (SLP_TREE_CHILDREN (cond_node).length () - 3
6037 : 82 : + STMT_VINFO_REDUC_IDX (cond_info));
6038 : : }
6039 : : else
6040 : 12 : slp_reduc_idx = STMT_VINFO_REDUC_IDX (cond_info);
6041 : 94 : cond_node = SLP_TREE_CHILDREN (cond_node)[slp_reduc_idx];
6042 : : }
6043 : 73 : gcc_assert (ccompares.length () != 0);
6044 : :
6045 : 73 : tree indx_before_incr, indx_after_incr;
6046 : 73 : poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
6047 : 73 : int scalar_precision
6048 : 73 : = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
6049 : 73 : tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
6050 : 73 : tree cr_index_vector_type = get_related_vectype_for_scalar_type
6051 : 73 : (TYPE_MODE (vectype), cr_index_scalar_type,
6052 : : TYPE_VECTOR_SUBPARTS (vectype));
6053 : :
6054 : : /* First we create a simple vector induction variable which starts
6055 : : with the values {1,2,3,...} (SERIES_VECT) and increments by the
6056 : : vector size (STEP). */
6057 : :
6058 : : /* Create a {1,2,3,...} vector. */
6059 : 73 : tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
6060 : :
6061 : : /* Create a vector of the step value. */
6062 : 73 : tree step = build_int_cst (cr_index_scalar_type, nunits_out);
6063 : 73 : tree vec_step = build_vector_from_val (cr_index_vector_type, step);
6064 : :
6065 : : /* Create an induction variable. */
6066 : 73 : gimple_stmt_iterator incr_gsi;
6067 : 73 : bool insert_after;
6068 : 73 : vect_iv_increment_position (LOOP_VINFO_IV_EXIT (loop_vinfo),
6069 : : &incr_gsi, &insert_after);
6070 : 73 : create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
6071 : : insert_after, &indx_before_incr, &indx_after_incr);
6072 : :
6073 : : /* Next create a new phi node vector (NEW_PHI_TREE) which starts
6074 : : filled with zeros (VEC_ZERO). */
6075 : :
6076 : : /* Create a vector of 0s. */
6077 : 73 : tree zero = build_zero_cst (cr_index_scalar_type);
6078 : 73 : tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
6079 : :
6080 : : /* Create a vector phi node. */
6081 : 73 : tree new_phi_tree = make_ssa_name (cr_index_vector_type);
6082 : 73 : new_phi = create_phi_node (new_phi_tree, loop->header);
6083 : 73 : add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
6084 : : loop_preheader_edge (loop), UNKNOWN_LOCATION);
6085 : :
6086 : :       /* Now take the condition from the loop's original cond_exprs
6087 : :          and produce a new cond_expr (INDEX_COND_EXPR) which for
6088 : : every match uses values from the induction variable
6089 : : (INDEX_BEFORE_INCR) otherwise uses values from the phi node
6090 : : (NEW_PHI_TREE).
6091 : : Finally, we update the phi (NEW_PHI_TREE) to take the value of
6092 : : the new cond_expr (INDEX_COND_EXPR). */
6093 : 73 : gimple_seq stmts = NULL;
6094 : 228 : for (int i = ccompares.length () - 1; i != -1; --i)
6095 : : {
6096 : 82 : tree ccompare = ccompares[i].first;
6097 : 82 : if (ccompares[i].second)
6098 : 69 : new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6099 : : cr_index_vector_type,
6100 : : ccompare,
6101 : : indx_before_incr, new_phi_tree);
6102 : : else
6103 : 13 : new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6104 : : cr_index_vector_type,
6105 : : ccompare,
6106 : : new_phi_tree, indx_before_incr);
6107 : : }
6108 : 73 : gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
6109 : :
6110 : : /* Update the phi with the vec cond. */
6111 : 73 : induction_index = new_phi_tree;
6112 : 73 : add_phi_arg (as_a <gphi *> (new_phi), induction_index,
6113 : : loop_latch_edge (loop), UNKNOWN_LOCATION);
6114 : 73 : }
6115 : :
6116 : : /* 2. Create epilog code.
6117 : : The reduction epilog code operates across the elements of the vector
6118 : : of partial results computed by the vectorized loop.
6119 : : The reduction epilog code consists of:
6120 : :
6121 : : step 1: compute the scalar result in a vector (v_out2)
6122 : : step 2: extract the scalar result (s_out3) from the vector (v_out2)
6123 : : step 3: adjust the scalar result (s_out3) if needed.
6124 : :
6125 : :      Step 1 can be accomplished using one of the following three schemes:
6126 : : (scheme 1) using reduc_fn, if available.
6127 : : (scheme 2) using whole-vector shifts, if available.
6128 : : (scheme 3) using a scalar loop. In this case steps 1+2 above are
6129 : : combined.
6130 : :
6131 : : The overall epilog code looks like this:
6132 : :
6133 : : s_out0 = phi <s_loop> # original EXIT_PHI
6134 : : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6135 : : v_out2 = reduce <v_out1> # step 1
6136 : : s_out3 = extract_field <v_out2, 0> # step 2
6137 : : s_out4 = adjust_result <s_out3> # step 3
6138 : :
6139 : : (step 3 is optional, and steps 1 and 2 may be combined).
6140 : : Lastly, the uses of s_out0 are replaced by s_out4. */
6141 : :
6142 : :
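
For a concrete (hypothetical) plus-reduction over four lanes, the three steps above collapse to something like the following scalar sketch, where step 1 folds the vector of partial sums, step 2 extracts the scalar, and step 3 applies the epilogue adjustment:

  /* Steps 1-3 for v_out1 = {p0, p1, p2, p3} and a recorded adjustment.  */
  static int
  epilog_steps (const int v_out1[4], int adjustment)
  {
    int v_out2 = v_out1[0] + v_out1[1] + v_out1[2] + v_out1[3]; /* step 1 */
    int s_out3 = v_out2;                                        /* step 2 */
    int s_out4 = s_out3 + adjustment;                           /* step 3 */
    return s_out4;
  }
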
6143 : : /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
6144 : : v_out1 = phi <VECT_DEF>
6145 : : Store them in NEW_PHIS. */
6146 : 21196 : if (double_reduc)
6147 : 66 : loop = outer_loop;
6148 : : /* We need to reduce values in all exits. */
6149 : 21196 : exit_bb = loop_exit->dest;
6150 : 21196 : exit_gsi = gsi_after_labels (exit_bb);
6151 : 21196 : reduc_inputs.create (vec_num);
6152 : 42948 : for (unsigned i = 0; i < vec_num; i++)
6153 : : {
6154 : 21752 : gimple_seq stmts = NULL;
6155 : 21752 : def = vect_get_slp_vect_def (slp_node, i);
6156 : 21752 : tree new_def = copy_ssa_name (def);
6157 : 21752 : phi = create_phi_node (new_def, exit_bb);
6158 : 21752 : if (LOOP_VINFO_IV_EXIT (loop_vinfo) == loop_exit)
6159 : 21725 : SET_PHI_ARG_DEF (phi, loop_exit->dest_idx, def);
6160 : : else
6161 : : {
6162 : 57 : for (unsigned k = 0; k < gimple_phi_num_args (phi); k++)
6163 : 30 : SET_PHI_ARG_DEF (phi, k, def);
6164 : : }
6165 : 21752 : new_def = gimple_convert (&stmts, vectype, new_def);
6166 : 21752 : reduc_inputs.quick_push (new_def);
6167 : 21752 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6168 : : }
6169 : :
6170 : : /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
6171 : : (i.e. when reduc_fn is not available) and in the final adjustment
6172 : : code (if needed). Also get the original scalar reduction variable as
6173 : : defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
6174 : : represents a reduction pattern), the tree-code and scalar-def are
6175 : : taken from the original stmt that the pattern-stmt (STMT) replaces.
6176 : : Otherwise (it is a regular reduction) - the tree-code and scalar-def
6177 : : are taken from STMT. */
6178 : :
6179 : 21196 : stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6180 : 21196 : if (orig_stmt_info != stmt_info)
6181 : : {
6182 : : /* Reduction pattern */
6183 : 609 : gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6184 : 609 : gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
6185 : : }
6186 : :
6187 : 21196 : scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
6188 : 21196 : scalar_type = TREE_TYPE (scalar_dest);
6189 : 21196 : scalar_results.truncate (0);
6190 : 21196 : scalar_results.reserve_exact (group_size);
6191 : 21196 : new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
6192 : 21196 : bitsize = TYPE_SIZE (scalar_type);
6193 : :
6194 : : /* True if we should implement SLP_REDUC using native reduction operations
6195 : : instead of scalar operations. */
6196 : 42392 : direct_slp_reduc = (reduc_fn != IFN_LAST
6197 : 21196 : && slp_reduc
6198 : 21196 : && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
6199 : :
6200 : : /* In case of reduction chain, e.g.,
6201 : : # a1 = phi <a3, a0>
6202 : : a2 = operation (a1)
6203 : : a3 = operation (a2),
6204 : :
6205 : : we may end up with more than one vector result. Here we reduce them
6206 : : to one vector.
6207 : :
6208 : : The same is true for a SLP reduction, e.g.,
6209 : : # a1 = phi <a2, a0>
6210 : : # b1 = phi <b2, b0>
6211 : : a2 = operation (a1)
6212 : : b2 = operation (a2),
6213 : :        b2 = operation (b1),
6214 : : where we can end up with more than one vector as well. We can
6215 : : easily accumulate vectors when the number of vector elements is
6216 : : a multiple of the SLP group size.
6217 : :
6218 : : The same is true if we couldn't use a single defuse cycle. */
6219 : 21196 : if (REDUC_GROUP_FIRST_ELEMENT (STMT_VINFO_REDUC_DEF (reduc_info))
6220 : : || direct_slp_reduc
6221 : 21196 : || (slp_reduc
6222 : 21029 : && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size)))
6223 : : {
6224 : 21170 : gimple_seq stmts = NULL;
6225 : 21170 : tree single_input = reduc_inputs[0];
6226 : 21681 : for (k = 1; k < reduc_inputs.length (); k++)
6227 : 1022 : single_input = gimple_build (&stmts, code, vectype,
6228 : 511 : single_input, reduc_inputs[k]);
6229 : 21170 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6230 : :
6231 : 21170 : reduc_inputs.truncate (0);
6232 : 21170 : reduc_inputs.safe_push (single_input);
6233 : : }
6234 : :
6235 : 21196 : tree orig_reduc_input = reduc_inputs[0];
6236 : :
6237 : : /* If this loop is an epilogue loop that can be skipped after the
6238 : : main loop, we can only share a reduction operation between the
6239 : : main loop and the epilogue if we put it at the target of the
6240 : : skip edge.
6241 : :
6242 : : We can still reuse accumulators if this check fails. Doing so has
6243 : : the minor(?) benefit of making the epilogue loop's scalar result
6244 : : independent of the main loop's scalar result. */
6245 : 21196 : bool unify_with_main_loop_p = false;
6246 : 21196 : if (reduc_info->reused_accumulator
6247 : 4066 : && loop_vinfo->skip_this_loop_edge
6248 : 3872 : && single_succ_p (exit_bb)
6249 : 21211 : && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
6250 : : {
6251 : 15 : unify_with_main_loop_p = true;
6252 : :
6253 : 15 : basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
6254 : 15 : reduc_inputs[0] = make_ssa_name (vectype);
6255 : 15 : gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
6256 : 15 : add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
6257 : : UNKNOWN_LOCATION);
6258 : 15 : add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
6259 : : loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
6260 : 15 : exit_gsi = gsi_after_labels (reduc_block);
6261 : : }
6262 : :
6263 : : /* Shouldn't be used beyond this point. */
6264 : 21196 : exit_bb = nullptr;
6265 : :
6266 : 21196 : if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6267 : 73 : && reduc_fn != IFN_LAST)
6268 : : {
6269 : : /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
6270 : : various data values where the condition matched and another vector
6271 : : (INDUCTION_INDEX) containing all the indexes of those matches. We
6272 : : need to extract the last matching index (which will be the index with
6273 : : highest value) and use this to index into the data vector.
6274 : : For the case where there were no matches, the data vector will contain
6275 : : all default values and the index vector will be all zeros. */
6276 : :
6277 : : /* Get various versions of the type of the vector of indexes. */
6278 : 4 : tree index_vec_type = TREE_TYPE (induction_index);
6279 : 4 : gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
6280 : 4 : tree index_scalar_type = TREE_TYPE (index_vec_type);
6281 : 4 : tree index_vec_cmp_type = truth_type_for (index_vec_type);
6282 : :
6283 : : /* Get an unsigned integer version of the type of the data vector. */
6284 : 4 : int scalar_precision
6285 : 4 : = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6286 : 4 : tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
6287 : 4 : tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
6288 : : vectype);
6289 : :
6290 : : /* First we need to create a vector (ZERO_VEC) of zeros and another
6291 : : vector (MAX_INDEX_VEC) filled with the last matching index, which we
6292 : : can create using a MAX reduction and then expanding.
6293 : : In the case where the loop never made any matches, the max index will
6294 : : be zero. */
6295 : :
6296 : : /* Vector of {0, 0, 0,...}. */
6297 : 4 : tree zero_vec = build_zero_cst (vectype);
6298 : :
6299 : : /* Find maximum value from the vector of found indexes. */
6300 : 4 : tree max_index = make_ssa_name (index_scalar_type);
6301 : 4 : gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6302 : : 1, induction_index);
6303 : 4 : gimple_call_set_lhs (max_index_stmt, max_index);
6304 : 4 : gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
6305 : :
6306 : : /* Vector of {max_index, max_index, max_index,...}. */
6307 : 4 : tree max_index_vec = make_ssa_name (index_vec_type);
6308 : 4 : tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
6309 : : max_index);
6310 : 4 : gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
6311 : : max_index_vec_rhs);
6312 : 4 : gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
6313 : :
6314 : : /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
6315 : : with the vector (INDUCTION_INDEX) of found indexes, choosing values
6316 : : from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
6317 : : otherwise. Only one value should match, resulting in a vector
6318 : : (VEC_COND) with one data value and the rest zeros.
6319 : : In the case where the loop never made any matches, every index will
6320 : : match, resulting in a vector with all data values (which will all be
6321 : : the default value). */
6322 : :
6323 : : /* Compare the max index vector to the vector of found indexes to find
6324 : : the position of the max value. */
6325 : 4 : tree vec_compare = make_ssa_name (index_vec_cmp_type);
6326 : 4 : gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
6327 : : induction_index,
6328 : : max_index_vec);
6329 : 4 : gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
6330 : :
6331 : : /* Use the compare to choose either values from the data vector or
6332 : : zero. */
6333 : 4 : tree vec_cond = make_ssa_name (vectype);
6334 : 4 : gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
6335 : : vec_compare,
6336 : 4 : reduc_inputs[0],
6337 : : zero_vec);
6338 : 4 : gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
6339 : :
6340 : : /* Finally we need to extract the data value from the vector (VEC_COND)
6341 : :          into a scalar (MATCHED_DATA_REDUC).  Logically we want to do an OR
6342 : : reduction, but because this doesn't exist, we can use a MAX reduction
6343 : : instead. The data value might be signed or a float so we need to cast
6344 : : it first.
6345 : : In the case where the loop never made any matches, the data values are
6346 : : all identical, and so will reduce down correctly. */
6347 : :
6348 : : /* Make the matched data values unsigned. */
6349 : 4 : tree vec_cond_cast = make_ssa_name (vectype_unsigned);
6350 : 4 : tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
6351 : : vec_cond);
6352 : 4 : gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
6353 : : VIEW_CONVERT_EXPR,
6354 : : vec_cond_cast_rhs);
6355 : 4 : gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
6356 : :
6357 : : /* Reduce down to a scalar value. */
6358 : 4 : tree data_reduc = make_ssa_name (scalar_type_unsigned);
6359 : 4 : gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6360 : : 1, vec_cond_cast);
6361 : 4 : gimple_call_set_lhs (data_reduc_stmt, data_reduc);
6362 : 4 : gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
6363 : :
6364 : : /* Convert the reduced value back to the result type and set as the
6365 : : result. */
6366 : 4 : gimple_seq stmts = NULL;
6367 : 4 : new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
6368 : : data_reduc);
6369 : 4 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6370 : 4 : scalar_results.safe_push (new_temp);
6371 : 4 : }
6372 : 21192 : else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6373 : 69 : && reduc_fn == IFN_LAST)
6374 : : {
6375 : : /* Condition reduction without supported IFN_REDUC_MAX. Generate
6376 : : idx = 0;
6377 : : idx_val = induction_index[0];
6378 : : val = data_reduc[0];
6379 : : for (idx = 0, val = init, i = 0; i < nelts; ++i)
6380 : : if (induction_index[i] > idx_val)
6381 : : val = data_reduc[i], idx_val = induction_index[i];
6382 : : return val; */
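
The pseudocode above translates almost directly into the following hedged scalar helper (illustrative only; the four-lane width is assumed):

  /* Pick the data value whose recorded index is the largest, i.e. the value
     stored by the latest matching iteration; VAL falls back to lane 0 when
     no later lane has a larger index.  */
  static int
  latest_match (const int induction_index[4], const int data_reduc[4])
  {
    int idx_val = induction_index[0];
    int val = data_reduc[0];
    for (int i = 1; i < 4; i++)
      if (induction_index[i] > idx_val)
        {
          idx_val = induction_index[i];
          val = data_reduc[i];
        }
    return val;
  }
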
6383 : :
6384 : 69 : tree data_eltype = TREE_TYPE (vectype);
6385 : 69 : tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
6386 : 69 : unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
6387 : 69 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
6388 : : /* Enforced by vectorizable_reduction, which ensures we have target
6389 : : support before allowing a conditional reduction on variable-length
6390 : : vectors. */
6391 : 69 : unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
6392 : 69 : tree idx_val = NULL_TREE, val = NULL_TREE;
6393 : 461 : for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
6394 : : {
6395 : 392 : tree old_idx_val = idx_val;
6396 : 392 : tree old_val = val;
6397 : 392 : idx_val = make_ssa_name (idx_eltype);
6398 : 392 : epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
6399 : : build3 (BIT_FIELD_REF, idx_eltype,
6400 : : induction_index,
6401 : 392 : bitsize_int (el_size),
6402 : 392 : bitsize_int (off)));
6403 : 392 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6404 : 392 : val = make_ssa_name (data_eltype);
6405 : 784 : epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
6406 : : build3 (BIT_FIELD_REF,
6407 : : data_eltype,
6408 : 392 : reduc_inputs[0],
6409 : 392 : bitsize_int (el_size),
6410 : 392 : bitsize_int (off)));
6411 : 392 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6412 : 392 : if (off != 0)
6413 : : {
6414 : 323 : tree new_idx_val = idx_val;
6415 : 323 : if (off != v_size - el_size)
6416 : : {
6417 : 254 : new_idx_val = make_ssa_name (idx_eltype);
6418 : 254 : epilog_stmt = gimple_build_assign (new_idx_val,
6419 : : MAX_EXPR, idx_val,
6420 : : old_idx_val);
6421 : 254 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6422 : : }
6423 : 323 : tree cond = make_ssa_name (boolean_type_node);
6424 : 323 : epilog_stmt = gimple_build_assign (cond, GT_EXPR,
6425 : : idx_val, old_idx_val);
6426 : 323 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6427 : 323 : tree new_val = make_ssa_name (data_eltype);
6428 : 323 : epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
6429 : : cond, val, old_val);
6430 : 323 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6431 : 323 : idx_val = new_idx_val;
6432 : 323 : val = new_val;
6433 : : }
6434 : : }
6435 : : /* Convert the reduced value back to the result type and set as the
6436 : : result. */
6437 : 69 : gimple_seq stmts = NULL;
6438 : 69 : val = gimple_convert (&stmts, scalar_type, val);
6439 : 69 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6440 : 69 : scalar_results.safe_push (val);
6441 : 69 : }
6442 : :
6443 : : /* 2.3 Create the reduction code, using one of the three schemes described
6444 : : above. In SLP we simply need to extract all the elements from the
6445 : : vector (without reducing them), so we use scalar shifts. */
6446 : 21123 : else if (reduc_fn != IFN_LAST && (!slp_reduc || group_size == 1))
6447 : : {
6448 : 19407 : tree tmp;
6449 : 19407 : tree vec_elem_type;
6450 : :
6451 : : /* Case 1: Create:
6452 : : v_out2 = reduc_expr <v_out1> */
6453 : :
6454 : 19407 : if (dump_enabled_p ())
6455 : 1272 : dump_printf_loc (MSG_NOTE, vect_location,
6456 : : "Reduce using direct vector reduction.\n");
6457 : :
6458 : 19407 : gimple_seq stmts = NULL;
6459 : 19407 : vec_elem_type = TREE_TYPE (vectype);
6460 : 19407 : new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
6461 : 19407 : vec_elem_type, reduc_inputs[0]);
6462 : 19407 : new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6463 : 19407 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6464 : :
6465 : 19407 : if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6466 : 66 : && induc_val)
6467 : : {
6468 : :          /* Earlier we set the initial value to be a vector of induc_val
6469 : : values. Check the result and if it is induc_val then replace
6470 : : with the original initial value, unless induc_val is
6471 : : the same as initial_def already. */
6472 : 63 : tree zcompare = make_ssa_name (boolean_type_node);
6473 : 63 : epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6474 : : new_temp, induc_val);
6475 : 63 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6476 : 63 : tree initial_def = reduc_info->reduc_initial_values[0];
6477 : 63 : tmp = make_ssa_name (new_scalar_dest);
6478 : 63 : epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6479 : : initial_def, new_temp);
6480 : 63 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6481 : 63 : new_temp = tmp;
6482 : : }
6483 : :
6484 : 19407 : scalar_results.safe_push (new_temp);
6485 : 19407 : }
6486 : 1562 : else if (direct_slp_reduc)
6487 : : {
6488 : : /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
6489 : : with the elements for other SLP statements replaced with the
6490 : : neutral value. We can then do a normal reduction on each vector. */
6491 : :
6492 : : /* Enforced by vectorizable_reduction. */
6493 : : gcc_assert (reduc_inputs.length () == 1);
6494 : : gcc_assert (pow2p_hwi (group_size));
6495 : :
6496 : : gimple_seq seq = NULL;
6497 : :
6498 : : /* Build a vector {0, 1, 2, ...}, with the same number of elements
6499 : : and the same element size as VECTYPE. */
6500 : : tree index = build_index_vector (vectype, 0, 1);
6501 : : tree index_type = TREE_TYPE (index);
6502 : : tree index_elt_type = TREE_TYPE (index_type);
6503 : : tree mask_type = truth_type_for (index_type);
6504 : :
6505 : : /* Create a vector that, for each element, identifies which of
6506 : : the REDUC_GROUP_SIZE results should use it. */
6507 : : tree index_mask = build_int_cst (index_elt_type, group_size - 1);
6508 : : index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
6509 : : build_vector_from_val (index_type, index_mask));
6510 : :
6511 : : /* Get a neutral vector value. This is simply a splat of the neutral
6512 : : scalar value if we have one, otherwise the initial scalar value
6513 : : is itself a neutral value. */
6514 : : tree vector_identity = NULL_TREE;
6515 : : tree neutral_op = NULL_TREE;
6516 : : if (1)
6517 : : {
6518 : : tree initial_value = NULL_TREE;
6519 : : if (REDUC_GROUP_FIRST_ELEMENT (STMT_VINFO_REDUC_DEF (reduc_info)))
6520 : : initial_value = reduc_info->reduc_initial_values[0];
6521 : : neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
6522 : : initial_value, false);
6523 : : }
6524 : : if (neutral_op)
6525 : : vector_identity = gimple_build_vector_from_val (&seq, vectype,
6526 : : neutral_op);
6527 : : for (unsigned int i = 0; i < group_size; ++i)
6528 : : {
6529 : :          /* If there's no universal neutral value, we can use the
6530 : : initial scalar value from the original PHI. This is used
6531 : : for MIN and MAX reduction, for example. */
6532 : : if (!neutral_op)
6533 : : {
6534 : : tree scalar_value = reduc_info->reduc_initial_values[i];
6535 : : scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
6536 : : scalar_value);
6537 : : vector_identity = gimple_build_vector_from_val (&seq, vectype,
6538 : : scalar_value);
6539 : : }
6540 : :
6541 : : /* Calculate the equivalent of:
6542 : :
6543 : : sel[j] = (index[j] == i);
6544 : :
6545 : : which selects the elements of REDUC_INPUTS[0] that should
6546 : : be included in the result. */
6547 : : tree compare_val = build_int_cst (index_elt_type, i);
6548 : : compare_val = build_vector_from_val (index_type, compare_val);
6549 : : tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
6550 : : index, compare_val);
6551 : :
6552 : : /* Calculate the equivalent of:
6553 : :
6554 : :               vec = sel ? reduc_inputs[0] : vector_identity;
6555 : :
6556 : : VEC is now suitable for a full vector reduction. */
6557 : : tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
6558 : : sel, reduc_inputs[0], vector_identity);
6559 : :
6560 : : /* Do the reduction and convert it to the appropriate type. */
6561 : : tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
6562 : : TREE_TYPE (vectype), vec);
6563 : : scalar = gimple_convert (&seq, scalar_type, scalar);
6564 : : scalar_results.safe_push (scalar);
6565 : : }
6566 : : gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
6567 : : }
6568 : : else
6569 : : {
6570 : 1562 : bool reduce_with_shift;
6571 : 1562 : tree vec_temp;
6572 : :
6573 : 1562 : gcc_assert (slp_reduc || reduc_inputs.length () == 1);
6574 : :
6575 : : /* See if the target wants to do the final (shift) reduction
6576 : : in a vector mode of smaller size and first reduce upper/lower
6577 : : halves against each other. */
6578 : 1716 : enum machine_mode mode1 = mode;
6579 : 1716 : tree stype = TREE_TYPE (vectype);
6580 : 1716 : unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6581 : 1716 : unsigned nunits1 = nunits;
6582 : 1716 : if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
6583 : 1716 : && reduc_inputs.length () == 1)
6584 : : {
6585 : 39 : nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6586 : :          /* For SLP reductions we have to make sure lanes match up, but
6587 : :             since we're doing an individual element final reduction,
6588 : :             reducing the vector width here is even more important.
6589 : :             ??? We can also separate lanes with permutes; for the common
6590 : :             case of a power-of-two group size, odd/even extracts would work.  */
6591 : 39 : if (slp_reduc && nunits != nunits1)
6592 : : {
6593 : 39 : nunits1 = least_common_multiple (nunits1, group_size);
6594 : 78 : gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6595 : : }
6596 : : }
6597 : 1716 : if (!slp_reduc
6598 : 1716 : && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6599 : 0 : nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6600 : :
6601 : 1716 : tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6602 : 1716 : stype, nunits1);
6603 : 1716 : reduce_with_shift = have_whole_vector_shift (mode1);
6604 : 708 : if (!VECTOR_MODE_P (mode1)
6605 : 2424 : || !directly_supported_p (code, vectype1))
6606 : : reduce_with_shift = false;
6607 : :
6608 : :       /* First reduce the vector to the vector size we should do the
6609 : :          shift reduction on, by combining upper and lower halves.  */
6610 : 1716 : gimple_seq stmts = NULL;
6611 : 1716 : new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6612 : : code, &stmts);
6613 : 1716 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6614 : 1716 : reduc_inputs[0] = new_temp;
6615 : :
6616 : 1716 : if (reduce_with_shift && (!slp_reduc || group_size == 1))
6617 : : {
6618 : 1521 : int element_bitsize = tree_to_uhwi (bitsize);
6619 : : /* Enforced by vectorizable_reduction, which disallows SLP reductions
6620 : : for variable-length vectors and also requires direct target support
6621 : : for loop reductions. */
6622 : 1521 : int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6623 : 1521 : int nelements = vec_size_in_bits / element_bitsize;
6624 : 1521 : vec_perm_builder sel;
6625 : 1521 : vec_perm_indices indices;
6626 : :
6627 : 1521 : int elt_offset;
6628 : :
6629 : 1521 : tree zero_vec = build_zero_cst (vectype1);
6630 : : /* Case 2: Create:
6631 : : for (offset = nelements/2; offset >= 1; offset/=2)
6632 : : {
6633 : : Create: va' = vec_shift <va, offset>
6634 : : Create: va = vop <va, va'>
6635 : : } */
6636 : :
6637 : 1521 : tree rhs;
6638 : :
6639 : 1521 : if (dump_enabled_p ())
6640 : 313 : dump_printf_loc (MSG_NOTE, vect_location,
6641 : : "Reduce using vector shifts\n");
6642 : :
6643 : 1521 : gimple_seq stmts = NULL;
6644 : 1521 : new_temp = gimple_convert (&stmts, vectype1, new_temp);
6645 : 1521 : for (elt_offset = nelements / 2;
6646 : 3312 : elt_offset >= 1;
6647 : 1791 : elt_offset /= 2)
6648 : : {
6649 : 1791 : calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6650 : 1791 : indices.new_vector (sel, 2, nelements);
6651 : 1791 : tree mask = vect_gen_perm_mask_any (vectype1, indices);
6652 : 1791 : new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6653 : : new_temp, zero_vec, mask);
6654 : 1791 : new_temp = gimple_build (&stmts, code,
6655 : : vectype1, new_name, new_temp);
6656 : : }
6657 : 1521 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6658 : :
6659 : : /* 2.4 Extract the final scalar result. Create:
6660 : : s_out3 = extract_field <v_out2, bitpos> */
6661 : :
6662 : 1521 : if (dump_enabled_p ())
6663 : 313 : dump_printf_loc (MSG_NOTE, vect_location,
6664 : : "extract scalar result\n");
6665 : :
6666 : 1521 : rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6667 : : bitsize, bitsize_zero_node);
6668 : 1521 : epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
6669 : 1521 : new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
6670 : 1521 : gimple_assign_set_lhs (epilog_stmt, new_temp);
6671 : 1521 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6672 : 1521 : scalar_results.safe_push (new_temp);
6673 : 1521 : }
6674 : : else
6675 : : {
6676 : : /* Case 3: Create:
6677 : : s = extract_field <v_out2, 0>
6678 : : for (offset = element_size;
6679 : : offset < vector_size;
6680 : : offset += element_size;)
6681 : : {
6682 : : Create: s' = extract_field <v_out2, offset>
6683 : : Create: s = op <s, s'> // For non SLP cases
6684 : : } */
6685 : :
6686 : 195 : if (dump_enabled_p ())
6687 : 120 : dump_printf_loc (MSG_NOTE, vect_location,
6688 : : "Reduce using scalar code.\n");
6689 : :
6690 : 195 : int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6691 : 195 : int element_bitsize = tree_to_uhwi (bitsize);
6692 : 195 : tree compute_type = TREE_TYPE (vectype);
6693 : 195 : gimple_seq stmts = NULL;
6694 : 435 : FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6695 : : {
6696 : 240 : int bit_offset;
6697 : 480 : new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6698 : 240 : vec_temp, bitsize, bitsize_zero_node);
6699 : :
6700 : :              /* In SLP we don't need to apply the reduction operation, so we just
6701 : : collect s' values in SCALAR_RESULTS. */
6702 : 240 : if (slp_reduc)
6703 : 230 : scalar_results.safe_push (new_temp);
6704 : :
6705 : 520 : for (bit_offset = element_bitsize;
6706 : 760 : bit_offset < vec_size_in_bits;
6707 : 520 : bit_offset += element_bitsize)
6708 : : {
6709 : 520 : tree bitpos = bitsize_int (bit_offset);
6710 : 520 : new_name = gimple_build (&stmts, BIT_FIELD_REF,
6711 : : compute_type, vec_temp,
6712 : : bitsize, bitpos);
6713 : 520 : if (slp_reduc)
6714 : : {
6715 : :                  /* In SLP we don't need to apply the reduction operation, so
6716 : : we just collect s' values in SCALAR_RESULTS. */
6717 : 510 : new_temp = new_name;
6718 : 510 : scalar_results.safe_push (new_name);
6719 : : }
6720 : : else
6721 : 10 : new_temp = gimple_build (&stmts, code, compute_type,
6722 : : new_name, new_temp);
6723 : : }
6724 : : }
6725 : :
6726 : :          /* The only case where we need to reduce scalar results in SLP is
6727 : :             unrolling.  If the size of SCALAR_RESULTS is greater than
6728 : :             REDUC_GROUP_SIZE, we reduce them by combining elements modulo
6729 : :             REDUC_GROUP_SIZE.  */
6730 : 195 : if (slp_reduc)
6731 : : {
6732 : 185 : tree res, first_res, new_res;
6733 : :
6734 : : /* Reduce multiple scalar results in case of SLP unrolling. */
6735 : 432 : for (j = group_size; scalar_results.iterate (j, &res);
6736 : : j++)
6737 : : {
6738 : 247 : first_res = scalar_results[j % group_size];
6739 : 247 : new_res = gimple_build (&stmts, code, compute_type,
6740 : : first_res, res);
6741 : 247 : scalar_results[j % group_size] = new_res;
6742 : : }
6743 : 185 : scalar_results.truncate (group_size);
6744 : 863 : for (k = 0; k < group_size; k++)
6745 : 986 : scalar_results[k] = gimple_convert (&stmts, scalar_type,
6746 : 493 : scalar_results[k]);
6747 : : }
6748 : : else
6749 : : {
6750 : : /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
6751 : 10 : new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6752 : 10 : scalar_results.safe_push (new_temp);
6753 : : }
6754 : :
6755 : 195 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6756 : : }
6757 : :
6758 : 1716 : if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6759 : 0 : && induc_val)
6760 : : {
6761 : : /* Earlier we set the initial value to be a vector if induc_val
6762 : : values. Check the result and if it is induc_val then replace
6763 : : with the original initial value, unless induc_val is
6764 : : the same as initial_def already. */
6765 : 0 : tree zcompare = make_ssa_name (boolean_type_node);
6766 : 0 : epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6767 : 0 : scalar_results[0], induc_val);
6768 : 0 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6769 : 0 : tree initial_def = reduc_info->reduc_initial_values[0];
6770 : 0 : tree tmp = make_ssa_name (new_scalar_dest);
6771 : 0 : epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6772 : 0 : initial_def, scalar_results[0]);
6773 : 0 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6774 : 0 : scalar_results[0] = tmp;
6775 : : }
6776 : : }
6777 : :
6778 : : /* 2.5 Adjust the final result by the initial value of the reduction
6779 : : variable. (When such adjustment is not needed, then
6780 : : 'adjustment_def' is zero). For example, if code is PLUS we create:
6781 : : new_temp = loop_exit_def + adjustment_def */
6782 : :
6783 : 21196 : if (adjustment_def)
6784 : : {
6785 : 15675 : gcc_assert (!slp_reduc || group_size == 1);
6786 : 15675 : gimple_seq stmts = NULL;
6787 : 15675 : if (double_reduc)
6788 : : {
6789 : 0 : gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6790 : 0 : adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6791 : 0 : new_temp = gimple_build (&stmts, code, vectype,
6792 : 0 : reduc_inputs[0], adjustment_def);
6793 : : }
6794 : : else
6795 : : {
6796 : 15675 : new_temp = scalar_results[0];
6797 : 15675 : gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6798 : 15675 : adjustment_def = gimple_convert (&stmts, TREE_TYPE (vectype),
6799 : : adjustment_def);
6800 : 15675 : new_temp = gimple_convert (&stmts, TREE_TYPE (vectype), new_temp);
6801 : 15675 : new_temp = gimple_build (&stmts, code, TREE_TYPE (vectype),
6802 : : new_temp, adjustment_def);
6803 : 15675 : new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6804 : : }
6805 : :
6806 : 15675 : epilog_stmt = gimple_seq_last_stmt (stmts);
6807 : 15675 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6808 : 15675 : scalar_results[0] = new_temp;
6809 : : }
6810 : :
6811 : : /* Record this operation if it could be reused by the epilogue loop. */
6812 : 21196 : if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
6813 : 21196 : && reduc_inputs.length () == 1)
6814 : 21022 : loop_vinfo->reusable_accumulators.put (scalar_results[0],
6815 : : { orig_reduc_input, reduc_info });
6816 : :
6817 : 21196 : if (double_reduc)
6818 : 66 : loop = outer_loop;
6819 : :
6820 : : /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6821 : : phis with new adjusted scalar results, i.e., replace use <s_out0>
6822 : : with use <s_out4>.
6823 : :
6824 : : Transform:
6825 : : loop_exit:
6826 : : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6827 : : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6828 : : v_out2 = reduce <v_out1>
6829 : : s_out3 = extract_field <v_out2, 0>
6830 : : s_out4 = adjust_result <s_out3>
6831 : : use <s_out0>
6832 : : use <s_out0>
6833 : :
6834 : : into:
6835 : :
6836 : : loop_exit:
6837 : : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6838 : : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6839 : : v_out2 = reduce <v_out1>
6840 : : s_out3 = extract_field <v_out2, 0>
6841 : : s_out4 = adjust_result <s_out3>
6842 : : use <s_out4>
6843 : : use <s_out4> */
6844 : :
6845 : 42392 : gcc_assert (live_out_stmts.size () == scalar_results.length ());
6846 : 21196 : auto_vec<gimple *> phis;
6847 : 42700 : for (k = 0; k < live_out_stmts.size (); k++)
6848 : : {
6849 : 21504 : stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6850 : 21504 : scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6851 : :
6852 : : /* Find the loop-closed-use at the loop exit of the original scalar
6853 : : result. (The reduction result is expected to have two immediate uses,
6854 : : one at the latch block, and one at the loop exit). For double
6855 : : reductions we are looking for exit phis of the outer loop. */
6856 : 88685 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6857 : : {
6858 : 67181 : if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6859 : : {
6860 : 21499 : if (!is_gimple_debug (USE_STMT (use_p))
6861 : 21499 : && gimple_bb (USE_STMT (use_p)) == loop_exit->dest)
6862 : 21491 : phis.safe_push (USE_STMT (use_p));
6863 : : }
6864 : : else
6865 : : {
6866 : 45682 : if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6867 : : {
6868 : 66 : tree phi_res = PHI_RESULT (USE_STMT (use_p));
6869 : :
6870 : 132 : FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6871 : : {
6872 : 66 : if (!flow_bb_inside_loop_p (loop,
6873 : 66 : gimple_bb (USE_STMT (phi_use_p)))
6874 : 66 : && !is_gimple_debug (USE_STMT (phi_use_p)))
6875 : 0 : phis.safe_push (USE_STMT (phi_use_p));
6876 : : }
6877 : : }
6878 : : }
6879 : : }
6880 : :
6881 : 42995 : FOR_EACH_VEC_ELT (phis, i, exit_phi)
6882 : : {
6883 : : /* Replace the uses: */
6884 : 21491 : orig_name = PHI_RESULT (exit_phi);
6885 : :
6886 : : /* Look for a single use at the target of the skip edge. */
6887 : 21491 : if (unify_with_main_loop_p)
6888 : : {
6889 : 31 : use_operand_p use_p;
6890 : 31 : gimple *user;
6891 : 31 : if (!single_imm_use (orig_name, &use_p, &user))
6892 : 0 : gcc_unreachable ();
6893 : 31 : orig_name = gimple_get_lhs (user);
6894 : : }
6895 : :
6896 : 21491 : scalar_result = scalar_results[k];
6897 : 58631 : FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6898 : : {
6899 : 111464 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6900 : 37162 : SET_USE (use_p, scalar_result);
6901 : 37140 : update_stmt (use_stmt);
6902 : 21491 : }
6903 : : }
6904 : :
6905 : 21504 : phis.truncate (0);
6906 : : }
6907 : 21196 : }
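The two epilogue strategies above (reduce with a target reduction operation,
or "Case 3": extract each element and combine with scalar code), followed by
the step 2.5 adjustment, can be pictured with a small scalar model. This
sketch is added commentary, not part of tree-vect-loop.cc; the 4-lane int
vector and the PLUS reduction are assumptions for illustration only.

static int
reduction_epilogue_example (const int v_out2[4], int adjustment_def)
{
  /* "Case 3" above: extract each element and combine with the scalar op.  */
  int s = v_out2[0];
  for (int lane = 1; lane < 4; lane++)
    s = s + v_out2[lane];
  /* Step 2.5 above: fold the initial value of the reduction back in.  */
  return s + adjustment_def;
}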
6908 : :
6909 : : /* Return a vector of type VECTYPE that is equal to the vector select
6910 : : operation "MASK ? VEC : IDENTITY". Insert the select statements
6911 : : before GSI. */
6912 : :
6913 : : static tree
6914 : 0 : merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6915 : : tree vec, tree identity)
6916 : : {
6917 : 0 : tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6918 : 0 : gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6919 : : mask, vec, identity);
6920 : 0 : gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6921 : 0 : return cond;
6922 : : }
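A rough scalar model of the VEC_COND_EXPR that merge_with_identity builds
(added commentary, not GCC code; a fixed 4-lane width is assumed):

static void
merge_with_identity_example (const int mask[4], const int vec[4],
			     const int identity[4], int out[4])
{
  /* Lanes where the mask is false receive the identity value, so a later
     unconditional reduction over OUT ignores the inactive lanes.  */
  for (int lane = 0; lane < 4; lane++)
    out[lane] = mask[lane] ? vec[lane] : identity[lane];
}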
6923 : :
6924 : : /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6925 : : order, starting with LHS. Insert the extraction statements before GSI and
6926 : : associate the new scalar SSA names with variable SCALAR_DEST.
6927 : : If MASK is nonzero, mask the input and then operate on it unconditionally.
6928 : : Return the SSA name for the result. */
6929 : :
6930 : : static tree
6931 : 995 : vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6932 : : tree_code code, tree lhs, tree vector_rhs,
6933 : : tree mask)
6934 : : {
6935 : 995 : tree vectype = TREE_TYPE (vector_rhs);
6936 : 995 : tree scalar_type = TREE_TYPE (vectype);
6937 : 995 : tree bitsize = TYPE_SIZE (scalar_type);
6938 : 995 : unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6939 : 995 : unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6940 : :
6941 : : /* Re-create a VEC_COND_EXPR to mask the input here in order to be able
6942 : : to perform an unconditional element-wise reduction of it. */
6943 : 995 : if (mask)
6944 : : {
6945 : 7 : tree masked_vector_rhs = make_temp_ssa_name (vectype, NULL,
6946 : : "masked_vector_rhs");
6947 : 7 : tree neutral_op = neutral_op_for_reduction (scalar_type, code, NULL_TREE,
6948 : : false);
6949 : 7 : tree vector_identity = build_vector_from_val (vectype, neutral_op);
6950 : 7 : gassign *select = gimple_build_assign (masked_vector_rhs, VEC_COND_EXPR,
6951 : : mask, vector_rhs, vector_identity);
6952 : 7 : gsi_insert_before (gsi, select, GSI_SAME_STMT);
6953 : 7 : vector_rhs = masked_vector_rhs;
6954 : : }
6955 : :
6956 : 995 : for (unsigned HOST_WIDE_INT bit_offset = 0;
6957 : 4259 : bit_offset < vec_size_in_bits;
6958 : 3264 : bit_offset += element_bitsize)
6959 : : {
6960 : 3264 : tree bitpos = bitsize_int (bit_offset);
6961 : 3264 : tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6962 : : bitsize, bitpos);
6963 : :
6964 : 3264 : gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6965 : 3264 : rhs = make_ssa_name (scalar_dest, stmt);
6966 : 3264 : gimple_assign_set_lhs (stmt, rhs);
6967 : 3264 : gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6968 : : /* Fold the vector extract, combining it with a previous reversal
6969 : : like seen in PR90579. */
6970 : 3264 : auto gsi2 = gsi_for_stmt (stmt);
6971 : 3264 : if (fold_stmt (&gsi2, follow_all_ssa_edges))
6972 : 356 : update_stmt (gsi_stmt (gsi2));
6973 : :
6974 : 3264 : stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6975 : 3264 : tree new_name = make_ssa_name (scalar_dest, stmt);
6976 : 3264 : gimple_assign_set_lhs (stmt, new_name);
6977 : 3264 : gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6978 : 3264 : lhs = new_name;
6979 : : }
6980 : 995 : return lhs;
6981 : : }
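The expansion above preserves the scalar evaluation order, which is what an
in-order (fold-left) reduction requires for types whose operations cannot be
reassociated, such as floating point. A scalar model (added commentary, not
GCC code; a 4-lane vector and PLUS_EXPR are assumed):

static float
fold_left_plus_example (float lhs, const float rhs[4])
{
  /* Computes (((lhs + rhs[0]) + rhs[1]) + rhs[2]) + rhs[3], strictly
     left-to-right, matching the BIT_FIELD_REF extraction loop above.  */
  for (int lane = 0; lane < 4; lane++)
    lhs = lhs + rhs[lane];
  return lhs;
}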
6982 : :
6983 : : /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6984 : : type of the vector input. */
6985 : :
6986 : : static internal_fn
6987 : 842 : get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6988 : : {
6989 : 842 : internal_fn mask_reduc_fn;
6990 : 842 : internal_fn mask_len_reduc_fn;
6991 : :
6992 : 842 : switch (reduc_fn)
6993 : : {
6994 : 0 : case IFN_FOLD_LEFT_PLUS:
6995 : 0 : mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6996 : 0 : mask_len_reduc_fn = IFN_MASK_LEN_FOLD_LEFT_PLUS;
6997 : 0 : break;
6998 : :
6999 : : default:
7000 : : return IFN_LAST;
7001 : : }
7002 : :
7003 : 0 : if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
7004 : : OPTIMIZE_FOR_SPEED))
7005 : : return mask_reduc_fn;
7006 : 0 : if (direct_internal_fn_supported_p (mask_len_reduc_fn, vectype_in,
7007 : : OPTIMIZE_FOR_SPEED))
7008 : : return mask_len_reduc_fn;
7009 : : return IFN_LAST;
7010 : : }
7011 : :
7012 : : /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
7013 : : statement that sets the live-out value. REDUC_DEF_STMT is the phi
7014 : : statement. CODE is the operation performed by STMT_INFO and OPS are
7015 : : its scalar operands. REDUC_INDEX is the index of the operand in
7016 : : OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
7017 : : implements in-order reduction, or IFN_LAST if we should open-code it.
7018 : : VECTYPE_IN is the type of the vector input. MASKS specifies the masks
7019 : : that should be used to control the operation in a fully-masked loop. */
7020 : :
7021 : : static bool
7022 : 834 : vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
7023 : : stmt_vec_info stmt_info,
7024 : : gimple_stmt_iterator *gsi,
7025 : : slp_tree slp_node,
7026 : : gimple *reduc_def_stmt,
7027 : : code_helper code, internal_fn reduc_fn,
7028 : : int num_ops, tree vectype_in,
7029 : : int reduc_index, vec_loop_masks *masks,
7030 : : vec_loop_lens *lens)
7031 : : {
7032 : 834 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7033 : 834 : tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7034 : 834 : internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
7035 : :
7036 : 834 : gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
7037 : :
7038 : 834 : bool is_cond_op = false;
7039 : 834 : if (!code.is_tree_code ())
7040 : : {
7041 : 7 : code = conditional_internal_fn_code (internal_fn (code));
7042 : 7 : gcc_assert (code != ERROR_MARK);
7043 : : is_cond_op = true;
7044 : : }
7045 : :
7046 : 834 : gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
7047 : :
7048 : 834 : gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
7049 : : TYPE_VECTOR_SUBPARTS (vectype_in)));
7050 : :
7051 : : /* The operands either come from a binary operation or an IFN_COND operation.
7052 : : The former is a gimple assign with binary rhs and the latter is a
7053 : : gimple call with four arguments. */
7054 : 834 : gcc_assert (num_ops == 2 || num_ops == 4);
7055 : :
7056 : 834 : int group_size = 1;
7057 : 834 : stmt_vec_info scalar_dest_def_info;
7058 : 834 : auto_vec<tree> vec_oprnds0, vec_opmask;
7059 : 834 : vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[(is_cond_op ? 2 : 0)
7060 : 834 : + (1 - reduc_index)],
7061 : : &vec_oprnds0);
7062 : 834 : group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7063 : 834 : scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
7064 : : /* For an IFN_COND_OP we also need the vector mask operand. */
7065 : 834 : if (is_cond_op)
7066 : 7 : vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[0], &vec_opmask);
7067 : :
7068 : 834 : gimple *sdef = vect_orig_stmt (scalar_dest_def_info)->stmt;
7069 : 834 : tree scalar_dest = gimple_get_lhs (sdef);
7070 : 834 : tree scalar_type = TREE_TYPE (scalar_dest);
7071 : 834 : tree reduc_var = gimple_phi_result (reduc_def_stmt);
7072 : :
7073 : 834 : int vec_num = vec_oprnds0.length ();
7074 : 834 : tree vec_elem_type = TREE_TYPE (vectype_out);
7075 : 834 : gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
7076 : :
7077 : 834 : tree vector_identity = NULL_TREE;
7078 : 834 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7079 : : {
7080 : 0 : vector_identity = build_zero_cst (vectype_out);
7081 : 0 : if (!HONOR_SIGNED_ZEROS (vectype_out))
7082 : : ;
7083 : : else
7084 : : {
7085 : 0 : gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));
7086 : 0 : vector_identity = const_unop (NEGATE_EXPR, vectype_out,
7087 : : vector_identity);
7088 : : }
7089 : : }
7090 : :
7091 : 834 : tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
7092 : 834 : int i;
7093 : 834 : tree def0;
7094 : 1829 : FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7095 : : {
7096 : 995 : gimple *new_stmt;
7097 : 995 : tree mask = NULL_TREE;
7098 : 995 : tree len = NULL_TREE;
7099 : 995 : tree bias = NULL_TREE;
7100 : 995 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7101 : : {
7102 : 0 : tree loop_mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
7103 : : vec_num, vectype_in, i);
7104 : 0 : if (is_cond_op)
7105 : 0 : mask = prepare_vec_mask (loop_vinfo, TREE_TYPE (loop_mask),
7106 : 0 : loop_mask, vec_opmask[i], gsi);
7107 : : else
7108 : : mask = loop_mask;
7109 : : }
7110 : 995 : else if (is_cond_op)
7111 : 7 : mask = vec_opmask[i];
7112 : 995 : if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
7113 : : {
7114 : 0 : len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
7115 : : i, 1);
7116 : 0 : signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
7117 : 0 : bias = build_int_cst (intQI_type_node, biasval);
7118 : 0 : if (!is_cond_op)
7119 : 0 : mask = build_minus_one_cst (truth_type_for (vectype_in));
7120 : : }
7121 : :
7122 : : /* Handle MINUS by adding the negative. */
7123 : 995 : if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
7124 : : {
7125 : 0 : tree negated = make_ssa_name (vectype_out);
7126 : 0 : new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
7127 : 0 : gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
7128 : 0 : def0 = negated;
7129 : : }
7130 : :
7131 : 0 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
7132 : 995 : && mask && mask_reduc_fn == IFN_LAST)
7133 : 0 : def0 = merge_with_identity (gsi, mask, vectype_out, def0,
7134 : : vector_identity);
7135 : :
7136 : : /* On the first iteration the input is simply the scalar phi
7137 : : result, and for subsequent iterations it is the output of
7138 : : the preceding operation. */
7139 : 995 : if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
7140 : : {
7141 : 0 : if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
7142 : 0 : new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
7143 : : def0, mask, len, bias);
7144 : 0 : else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
7145 : 0 : new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
7146 : : def0, mask);
7147 : : else
7148 : 0 : new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
7149 : : def0);
7150 : : /* For chained SLP reductions the output of the previous reduction
7151 : : operation serves as the input of the next. For the final statement
7152 : : the output cannot be a temporary - we reuse the original
7153 : : scalar destination of the last statement. */
7154 : 0 : if (i != vec_num - 1)
7155 : : {
7156 : 0 : gimple_set_lhs (new_stmt, scalar_dest_var);
7157 : 0 : reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
7158 : 0 : gimple_set_lhs (new_stmt, reduc_var);
7159 : : }
7160 : : }
7161 : : else
7162 : : {
7163 : 995 : reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
7164 : : tree_code (code), reduc_var, def0,
7165 : : mask);
7166 : 995 : new_stmt = SSA_NAME_DEF_STMT (reduc_var);
7167 : : /* Remove the statement, so that we can use the same code paths
7168 : : as for statements that we've just created. */
7169 : 995 : gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
7170 : 995 : gsi_remove (&tmp_gsi, true);
7171 : : }
7172 : :
7173 : 995 : if (i == vec_num - 1)
7174 : : {
7175 : 834 : gimple_set_lhs (new_stmt, scalar_dest);
7176 : 834 : vect_finish_replace_stmt (loop_vinfo,
7177 : : scalar_dest_def_info,
7178 : : new_stmt);
7179 : : }
7180 : : else
7181 : 161 : vect_finish_stmt_generation (loop_vinfo,
7182 : : scalar_dest_def_info,
7183 : : new_stmt, gsi);
7184 : :
7185 : 995 : slp_node->push_vec_def (new_stmt);
7186 : : }
7187 : :
7188 : 834 : return true;
7189 : 834 : }
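When the reduction statement is an IFN_COND_* call (is_cond_op above), the
open-coded path first masks the input with the neutral element and then folds
unconditionally. A scalar model of the overall effect (added commentary, not
GCC code; IFN_COND_ADD over float and 4 lanes are assumed, and signed-zero
subtleties are ignored):

static float
cond_fold_left_example (const bool c[4], const float a[4], float sum)
{
  for (int lane = 0; lane < 4; lane++)
    /* Masked-out lanes contribute the neutral element 0.0f, which matches
       skipping them in the scalar source "if (c[i]) sum += a[i];".  */
    sum += c[lane] ? a[lane] : 0.0f;
  return sum;
}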
7190 : :
7191 : : /* Function is_nonwrapping_integer_induction.
7192 : :
7193 : : Check if STMT_VINFO (which is part of loop LOOP) both increments and
7194 : : does not cause overflow. */
7195 : :
7196 : : static bool
7197 : 377 : is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
7198 : : {
7199 : 377 : gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
7200 : 377 : tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
7201 : 377 : tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
7202 : 377 : tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
7203 : 377 : widest_int ni, max_loop_value, lhs_max;
7204 : 377 : wi::overflow_type overflow = wi::OVF_NONE;
7205 : :
7206 : : /* Make sure the loop is integer based. */
7207 : 377 : if (TREE_CODE (base) != INTEGER_CST
7208 : 112 : || TREE_CODE (step) != INTEGER_CST)
7209 : : return false;
7210 : :
7211 : : /* Check that the max size of the loop will not wrap. */
7212 : :
7213 : 112 : if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
7214 : : return true;
7215 : :
7216 : 8 : if (! max_stmt_executions (loop, &ni))
7217 : : return false;
7218 : :
7219 : 8 : max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
7220 : 8 : &overflow);
7221 : 8 : if (overflow)
7222 : : return false;
7223 : :
7224 : 8 : max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
7225 : 16 : TYPE_SIGN (lhs_type), &overflow);
7226 : 8 : if (overflow)
7227 : : return false;
7228 : :
7229 : 8 : return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
7230 : 8 : <= TYPE_PRECISION (lhs_type));
7231 : 377 : }
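In effect the function checks that base + step * niters still fits in the
precision of the induction type. A simplified scalar model of that final
range check (added commentary, not GCC code; a signed type narrower than
long long is assumed and overflow of the wide computation itself is ignored):

static bool
induction_fits_example (long long base, long long step,
			long long niters, unsigned precision)
{
  long long max_value = base + step * niters;
  long long limit = 1LL << (precision - 1);
  return max_value < limit && max_value >= -limit;
}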
7232 : :
7233 : : /* Check if masking can be supported by inserting a conditional expression.
7234 : : CODE is the code for the operation. COND_FN is the conditional internal
7235 : : function, if it exists. VECTYPE_IN is the type of the vector input. */
7236 : : static bool
7237 : 2305 : use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
7238 : : tree vectype_in)
7239 : : {
7240 : 2305 : if (cond_fn != IFN_LAST
7241 : 2305 : && direct_internal_fn_supported_p (cond_fn, vectype_in,
7242 : : OPTIMIZE_FOR_SPEED))
7243 : : return false;
7244 : :
7245 : 2259 : if (code.is_tree_code ())
7246 : 1984 : switch (tree_code (code))
7247 : : {
7248 : : case DOT_PROD_EXPR:
7249 : : case SAD_EXPR:
7250 : : return true;
7251 : :
7252 : : default:
7253 : : break;
7254 : : }
7255 : : return false;
7256 : : }
7257 : :
7258 : : /* Insert a conditional expression to enable masked vectorization. CODE is the
7259 : : code for the operation. VOP is the array of operands. MASK is the loop
7260 : : mask. GSI is a statement iterator used to place the new conditional
7261 : : expression. */
7262 : : static void
7263 : 4 : build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
7264 : : gimple_stmt_iterator *gsi)
7265 : : {
7266 : 4 : switch (tree_code (code))
7267 : : {
7268 : 4 : case DOT_PROD_EXPR:
7269 : 4 : {
7270 : 4 : tree vectype = TREE_TYPE (vop[1]);
7271 : 4 : tree zero = build_zero_cst (vectype);
7272 : 4 : tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7273 : 4 : gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7274 : : mask, vop[1], zero);
7275 : 4 : gsi_insert_before (gsi, select, GSI_SAME_STMT);
7276 : 4 : vop[1] = masked_op1;
7277 : 4 : break;
7278 : : }
7279 : :
7280 : 0 : case SAD_EXPR:
7281 : 0 : {
7282 : 0 : tree vectype = TREE_TYPE (vop[1]);
7283 : 0 : tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7284 : 0 : gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7285 : : mask, vop[1], vop[0]);
7286 : 0 : gsi_insert_before (gsi, select, GSI_SAME_STMT);
7287 : 0 : vop[1] = masked_op1;
7288 : 0 : break;
7289 : : }
7290 : :
7291 : 0 : default:
7292 : 0 : gcc_unreachable ();
7293 : : }
7294 : 4 : }
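Zeroing the second operand of a DOT_PROD_EXPR under the loop mask has the
same effect as skipping the masked-out lanes, because 0 * x contributes
nothing to the accumulated sum. A scalar model (added commentary, not GCC
code; 4 lanes and int accumulation are assumed):

static int
masked_dot_prod_example (const bool mask[4], const signed char a[4],
			 const signed char b[4], int acc)
{
  for (int lane = 0; lane < 4; lane++)
    acc += (int) a[lane] * (mask[lane] ? (int) b[lane] : 0);
  return acc;
}

For SAD_EXPR the code above instead selects vop[0] for inactive lanes, so the
absolute difference computed there is zero and likewise does not disturb the
accumulator.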
7295 : :
7296 : : /* Given an operation with CODE in a loop reduction path whose reduction PHI
7297 : : is specified by REDUC_INFO, the operation has TYPE as its scalar result
7298 : : type and its input vectype is represented by VECTYPE_IN. The vectype of
7299 : : the vectorized result may differ from VECTYPE_IN, either in base type or
7300 : : in number of lanes, as is the case for a lane-reducing operation. This
7301 : : function checks whether, and how, partial vectorization can be performed
7302 : : on the operation in the context of LOOP_VINFO. */
7303 : :
7304 : : static void
7305 : 8 : vect_reduction_update_partial_vector_usage (loop_vec_info loop_vinfo,
7306 : : stmt_vec_info reduc_info,
7307 : : slp_tree slp_node,
7308 : : code_helper code, tree type,
7309 : : tree vectype_in)
7310 : : {
7311 : 8 : enum vect_reduction_type reduc_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7312 : 8 : internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
7313 : 8 : internal_fn cond_fn = get_conditional_internal_fn (code, type);
7314 : :
7315 : 8 : if (reduc_type != FOLD_LEFT_REDUCTION
7316 : 8 : && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
7317 : 12 : && (cond_fn == IFN_LAST
7318 : 4 : || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7319 : : OPTIMIZE_FOR_SPEED)))
7320 : : {
7321 : 0 : if (dump_enabled_p ())
7322 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7323 : : "can't operate on partial vectors because"
7324 : : " no conditional operation is available.\n");
7325 : 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7326 : : }
7327 : 8 : else if (reduc_type == FOLD_LEFT_REDUCTION
7328 : 8 : && reduc_fn == IFN_LAST
7329 : 8 : && !expand_vec_cond_expr_p (vectype_in, truth_type_for (vectype_in)))
7330 : : {
7331 : 0 : if (dump_enabled_p ())
7332 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7333 : : "can't operate on partial vectors because"
7334 : : " no conditional operation is available.\n");
7335 : 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7336 : : }
7337 : 8 : else if (reduc_type == FOLD_LEFT_REDUCTION
7338 : 0 : && internal_fn_mask_index (reduc_fn) == -1
7339 : 0 : && FLOAT_TYPE_P (vectype_in)
7340 : 8 : && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
7341 : : {
7342 : 0 : if (dump_enabled_p ())
7343 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7344 : : "can't operate on partial vectors because"
7345 : : " signed zeros cannot be preserved.\n");
7346 : 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7347 : : }
7348 : : else
7349 : : {
7350 : 8 : internal_fn mask_reduc_fn
7351 : 8 : = get_masked_reduction_fn (reduc_fn, vectype_in);
7352 : 8 : vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7353 : 8 : vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
7354 : 8 : unsigned nvectors = vect_get_num_copies (loop_vinfo, slp_node,
7355 : : vectype_in);
7356 : :
7357 : 8 : if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
7358 : 0 : vect_record_loop_len (loop_vinfo, lens, nvectors, vectype_in, 1);
7359 : : else
7360 : 8 : vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype_in, NULL);
7361 : : }
7362 : 8 : }
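Partial vectorization means the final vector iteration may cover fewer scalar
iterations than the vector width, so lanes beyond the remaining trip count
must not feed the reduction; that is why a conditional operation (or an
equivalent VEC_COND_EXPR) is required above. A scalar model of one masked
step (added commentary, not GCC code; a width of 4 and a PLUS reduction are
assumed):

static int
partial_vector_step_example (const int *a, int remaining, int acc)
{
  for (int lane = 0; lane < 4; lane++)
    if (lane < remaining)	/* lane is active under the loop mask */
      acc += a[lane];
  return acc;
}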
7363 : :
7364 : : /* Check if STMT_INFO is a lane-reducing operation that can be vectorized in
7365 : : the context of LOOP_VINFO; the vector cost is recorded in COST_VEC, and
7366 : : the analysis is for SLP if SLP_NODE is not NULL.
7367 : :
7368 : : For a lane-reducing operation, the loop reduction path that it lies in
7369 : : may contain a normal operation, or another lane-reducing operation with a
7370 : : different input type size, for example:
7371 : :
7372 : : int sum = 0;
7373 : : for (i)
7374 : : {
7375 : : ...
7376 : : sum += d0[i] * d1[i]; // dot-prod <vector(16) char>
7377 : : sum += w[i]; // widen-sum <vector(16) char>
7378 : : sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
7379 : : sum += n[i]; // normal <vector(4) int>
7380 : : ...
7381 : : }
7382 : :
7383 : : The vectorization factor is essentially determined by the operation whose
7384 : : input vectype has the most lanes ("vector(16) char" in the example), while
7385 : : we need to choose the input vectype with the fewest lanes ("vector(4) int"
7386 : : in the example) to determine the effective number of vector reduction PHIs. */
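As a worked illustration of the comment above (added commentary, not part of
the source): if the vectorization factor is fixed at 16 by the
"vector(16) char" dot-product input, while the reduction PHI uses the
narrowest input vectype "vector(4) int", then the reduction effectively needs
16 / 4 = 4 vector accumulator PHIs, which corresponds to choosing the input
vectype that yields the most ncopies.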
7387 : :
7388 : : bool
7389 : 444288 : vectorizable_lane_reducing (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
7390 : : slp_tree slp_node, stmt_vector_for_cost *cost_vec)
7391 : : {
7392 : 444288 : gimple *stmt = stmt_info->stmt;
7393 : :
7394 : 444288 : if (!lane_reducing_stmt_p (stmt))
7395 : : return false;
7396 : :
7397 : 336 : tree type = TREE_TYPE (gimple_assign_lhs (stmt));
7398 : :
7399 : 336 : if (!INTEGRAL_TYPE_P (type))
7400 : : return false;
7401 : :
7402 : : /* Do not try to vectorize bit-precision reductions. */
7403 : 336 : if (!type_has_mode_precision_p (type))
7404 : : return false;
7405 : :
7406 : 336 : stmt_vec_info reduc_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
7407 : :
7408 : : /* TODO: Support lane-reducing operation that does not directly participate
7409 : : in loop reduction. */
7410 : 336 : if (!reduc_info || STMT_VINFO_REDUC_IDX (stmt_info) < 0)
7411 : : return false;
7412 : :
7413 : : /* Lane-reducing pattern inside any inner loop of LOOP_VINFO is not
7414 : : recognized. */
7415 : 336 : gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_reduction_def);
7416 : 336 : gcc_assert (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION);
7417 : :
7418 : 1344 : for (int i = 0; i < (int) gimple_num_ops (stmt) - 1; i++)
7419 : : {
7420 : 1008 : stmt_vec_info def_stmt_info;
7421 : 1008 : slp_tree slp_op;
7422 : 1008 : tree op;
7423 : 1008 : tree vectype;
7424 : 1008 : enum vect_def_type dt;
7425 : :
7426 : 1008 : if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_node, i, &op,
7427 : : &slp_op, &dt, &vectype, &def_stmt_info))
7428 : : {
7429 : 0 : if (dump_enabled_p ())
7430 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7431 : : "use not simple.\n");
7432 : 0 : return false;
7433 : : }
7434 : :
7435 : 1008 : if (!vectype)
7436 : : {
7437 : 12 : vectype = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op),
7438 : : slp_op);
7439 : 12 : if (!vectype)
7440 : : return false;
7441 : : }
7442 : :
7443 : 1008 : if (!vect_maybe_update_slp_op_vectype (slp_op, vectype))
7444 : : {
7445 : 0 : if (dump_enabled_p ())
7446 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7447 : : "incompatible vector types for invariants\n");
7448 : 0 : return false;
7449 : : }
7450 : :
7451 : 1008 : if (i == STMT_VINFO_REDUC_IDX (stmt_info))
7452 : 336 : continue;
7453 : :
7454 : : /* There should be at most one cycle def in the stmt. */
7455 : 672 : if (VECTORIZABLE_CYCLE_DEF (dt))
7456 : : return false;
7457 : : }
7458 : :
7459 : 336 : tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (stmt_info);
7460 : :
7461 : 336 : gcc_assert (vectype_in);
7462 : :
7463 : : /* Compute number of effective vector statements for costing. */
7464 : 336 : unsigned int ncopies_for_cost = vect_get_num_copies (loop_vinfo, slp_node,
7465 : : vectype_in);
7466 : 336 : gcc_assert (ncopies_for_cost >= 1);
7467 : :
7468 : 336 : if (vect_is_emulated_mixed_dot_prod (stmt_info))
7469 : : {
7470 : : /* We need extra two invariants: one that contains the minimum signed
7471 : : value and one that contains half of its negative. */
7472 : 6 : int prologue_stmts = 2;
7473 : 6 : unsigned cost = record_stmt_cost (cost_vec, prologue_stmts,
7474 : : scalar_to_vec, stmt_info, 0,
7475 : : vect_prologue);
7476 : 6 : if (dump_enabled_p ())
7477 : 0 : dump_printf (MSG_NOTE, "vectorizable_lane_reducing: "
7478 : : "extra prologue_cost = %d .\n", cost);
7479 : :
7480 : : /* Three dot-products and a subtraction. */
7481 : 6 : ncopies_for_cost *= 4;
7482 : : }
7483 : :
7484 : 336 : record_stmt_cost (cost_vec, (int) ncopies_for_cost, vector_stmt, stmt_info,
7485 : : 0, vect_body);
7486 : :
7487 : 336 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7488 : : {
7489 : 4 : enum tree_code code = gimple_assign_rhs_code (stmt);
7490 : 4 : vect_reduction_update_partial_vector_usage (loop_vinfo, reduc_info,
7491 : 4 : slp_node, code, type,
7492 : : vectype_in);
7493 : : }
7494 : :
7495 : : /* Transform via vect_transform_reduction. */
7496 : 336 : STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7497 : 336 : return true;
7498 : : }
7499 : :
7500 : : /* Function vectorizable_reduction.
7501 : :
7502 : : Check if STMT_INFO performs a reduction operation that can be vectorized.
7503 : : If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
7504 : : stmt to replace it, put it in VEC_STMT, and insert it at GSI.
7505 : : Return true if STMT_INFO is vectorizable in this way.
7506 : :
7507 : : This function also handles reduction idioms (patterns) that have been
7508 : : recognized in advance during vect_pattern_recog. In this case, STMT_INFO
7509 : : may be of this form:
7510 : : X = pattern_expr (arg0, arg1, ..., X)
7511 : : and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
7512 : : sequence that had been detected and replaced by the pattern-stmt
7513 : : (STMT_INFO).
7514 : :
7515 : : This function also handles reduction of condition expressions, for example:
7516 : : for (int i = 0; i < N; i++)
7517 : : if (a[i] < value)
7518 : : last = a[i];
7519 : : This is handled by vectorising the loop and creating an additional vector
7520 : : containing the loop indexes for which "a[i] < value" was true. In the
7521 : : function epilogue this is reduced to a single max value and then used to
7522 : : index into the vector of results.
7523 : :
7524 : : In some cases of reduction patterns, the type of the reduction variable X is
7525 : : different than the type of the other arguments of STMT_INFO.
7526 : : In such cases, the vectype that is used when transforming STMT_INFO into
7527 : : a vector stmt is different than the vectype that is used to determine the
7528 : : vectorization factor, because it consists of a different number of elements
7529 : : than the actual number of elements that are being operated upon in parallel.
7530 : :
7531 : : For example, consider an accumulation of shorts into an int accumulator.
7532 : : On some targets it's possible to vectorize this pattern operating on 8
7533 : : shorts at a time (hence, the vectype for purposes of determining the
7534 : : vectorization factor should be V8HI); on the other hand, the vectype that
7535 : : is used to create the vector form is actually V4SI (the type of the result).
7536 : :
7537 : : Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
7538 : : indicates what is the actual level of parallelism (V8HI in the example), so
7539 : : that the right vectorization factor would be derived. This vectype
7540 : : corresponds to the type of arguments to the reduction stmt, and should *NOT*
7541 : : be used to create the vectorized stmt. The right vectype for the vectorized
7542 : : stmt is obtained from the type of the result X:
7543 : : get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7544 : :
7545 : : This means that, contrary to "regular" reductions (or "regular" stmts in
7546 : : general), the following equation:
7547 : : STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7548 : : does *NOT* necessarily hold for reduction patterns. */
7549 : :
7550 : : bool
7551 : 443952 : vectorizable_reduction (loop_vec_info loop_vinfo,
7552 : : stmt_vec_info stmt_info, slp_tree slp_node,
7553 : : slp_instance slp_node_instance,
7554 : : stmt_vector_for_cost *cost_vec)
7555 : : {
7556 : 443952 : tree vectype_in = NULL_TREE;
7557 : 443952 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7558 : 443952 : enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
7559 : 443952 : stmt_vec_info cond_stmt_vinfo = NULL;
7560 : 443952 : int i;
7561 : 443952 : int ncopies;
7562 : 443952 : bool single_defuse_cycle = false;
7563 : 443952 : bool nested_cycle = false;
7564 : 443952 : bool double_reduc = false;
7565 : 443952 : tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
7566 : 443952 : tree cond_reduc_val = NULL_TREE;
7567 : :
7568 : : /* Make sure it was already recognized as a reduction computation. */
7569 : 443952 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
7570 : : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
7571 : 443952 : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
7572 : : return false;
7573 : :
7574 : : /* The stmt we store reduction analysis meta on. */
7575 : 55387 : stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7576 : 55387 : reduc_info->is_reduc_info = true;
7577 : :
7578 : 55387 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7579 : : {
7580 : 1793 : if (is_a <gphi *> (stmt_info->stmt))
7581 : : {
7582 : : /* We eventually need to set a vector type on invariant
7583 : : arguments. */
7584 : : unsigned j;
7585 : : slp_tree child;
7586 : 5379 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7587 : 3586 : if (!vect_maybe_update_slp_op_vectype
7588 : 3586 : (child, SLP_TREE_VECTYPE (slp_node)))
7589 : : {
7590 : 0 : if (dump_enabled_p ())
7591 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7592 : : "incompatible vector types for "
7593 : : "invariants\n");
7594 : 0 : return false;
7595 : : }
7596 : : /* Analysis for double-reduction is done on the outer
7597 : : loop PHI, nested cycles have no further restrictions. */
7598 : 1793 : STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
7599 : : }
7600 : : else
7601 : 0 : STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7602 : 1793 : return true;
7603 : : }
7604 : :
7605 : 53594 : stmt_vec_info orig_stmt_of_analysis = stmt_info;
7606 : 53594 : stmt_vec_info phi_info = stmt_info;
7607 : 53594 : if (!is_a <gphi *> (stmt_info->stmt))
7608 : : {
7609 : 6827 : STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7610 : 6827 : return true;
7611 : : }
7612 : 46767 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7613 : : {
7614 : 382 : if (gimple_bb (stmt_info->stmt) != loop->header)
7615 : : {
7616 : : /* For SLP we arrive here for both the inner loop LC PHI and
7617 : : the outer loop PHI. The latter is what we want to analyze
7618 : : the reduction with. The LC PHI is handled by
7619 : : vectorizable_lc_phi. */
7620 : 109 : return gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) == 2;
7621 : : }
7622 : 273 : use_operand_p use_p;
7623 : 273 : gimple *use_stmt;
7624 : 273 : bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
7625 : : &use_p, &use_stmt);
7626 : 273 : gcc_assert (res);
7627 : 273 : phi_info = loop_vinfo->lookup_stmt (use_stmt);
7628 : : }
7629 : :
7630 : 46658 : slp_node_instance->reduc_phis = slp_node;
7631 : : /* ??? We're leaving slp_node to point to the PHIs, we only
7632 : : need it to get at the number of vector stmts which wasn't
7633 : : yet initialized for the instance root. */
7634 : :
7635 : : /* PHIs should not participate in patterns. */
7636 : 46658 : gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7637 : 46658 : gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7638 : :
7639 : : /* Verify that following REDUC_IDX from the latch def leads us back to the PHI
7640 : : and compute the reduction chain length. Discover the real
7641 : : reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
7642 : 46658 : tree reduc_def
7643 : 46658 : = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
7644 : : loop_latch_edge
7645 : : (gimple_bb (reduc_def_phi)->loop_father));
7646 : 46658 : unsigned reduc_chain_length = 0;
7647 : 46658 : bool only_slp_reduc_chain = true;
7648 : 46658 : stmt_info = NULL;
7649 : 46658 : slp_tree slp_for_stmt_info = slp_node_instance->root;
7650 : : /* For double-reductions we start SLP analysis at the inner loop LC PHI
7651 : : which is the def of the outer loop live stmt. */
7652 : 46658 : if (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def)
7653 : 273 : slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
7654 : 101221 : while (reduc_def != PHI_RESULT (reduc_def_phi))
7655 : : {
7656 : 54587 : stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
7657 : 54587 : stmt_vec_info vdef = vect_stmt_to_vectorize (def);
7658 : 54587 : int reduc_idx = STMT_VINFO_REDUC_IDX (vdef);
7659 : :
7660 : 54587 : if (reduc_idx == -1)
7661 : : {
7662 : 0 : if (dump_enabled_p ())
7663 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7664 : : "reduction chain broken by patterns.\n");
7665 : 24 : return false;
7666 : : }
7667 : 54587 : if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
7668 : 53066 : only_slp_reduc_chain = false;
7669 : : /* For epilogue generation live members of the chain need
7670 : : to point back to the PHI via their original stmt for
7671 : : info_for_reduction to work. For SLP we need to look at
7672 : : all lanes here - even though we only will vectorize from
7673 : : the SLP node with live lane zero the other live lanes also
7674 : : need to be identified as part of a reduction to be able
7675 : : to skip code generation for them. */
7676 : 54587 : if (slp_for_stmt_info)
7677 : : {
7678 : 233869 : for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
7679 : 70108 : if (STMT_VINFO_LIVE_P (s))
7680 : 54768 : STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
7681 : : }
7682 : 0 : else if (STMT_VINFO_LIVE_P (vdef))
7683 : 0 : STMT_VINFO_REDUC_DEF (def) = phi_info;
7684 : 54587 : gimple_match_op op;
7685 : 54587 : if (!gimple_extract_op (vdef->stmt, &op))
7686 : : {
7687 : 0 : if (dump_enabled_p ())
7688 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7689 : : "reduction chain includes unsupported"
7690 : : " statement type.\n");
7691 : 0 : return false;
7692 : : }
7693 : 54587 : if (CONVERT_EXPR_CODE_P (op.code))
7694 : : {
7695 : 3358 : if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
7696 : : {
7697 : 24 : if (dump_enabled_p ())
7698 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7699 : : "conversion in the reduction chain.\n");
7700 : 24 : return false;
7701 : : }
7702 : : }
7703 : : else
7704 : : {
7705 : : /* First non-conversion stmt. */
7706 : 51229 : if (!stmt_info)
7707 : 46658 : stmt_info = vdef;
7708 : :
7709 : 51229 : if (lane_reducing_op_p (op.code))
7710 : : {
7711 : 545 : enum vect_def_type dt;
7712 : 545 : tree vectype_op;
7713 : :
7714 : : /* The last operand of lane-reducing operation is for
7715 : : reduction. */
7716 : 545 : gcc_assert (reduc_idx > 0 && reduc_idx == (int) op.num_ops - 1);
7717 : :
7718 : 545 : if (!vect_is_simple_use (op.ops[0], loop_vinfo, &dt, &vectype_op))
7719 : 0 : return false;
7720 : :
7721 : 545 : tree type_op = TREE_TYPE (op.ops[0]);
7722 : :
7723 : 545 : if (!vectype_op)
7724 : : {
7725 : 8 : vectype_op = get_vectype_for_scalar_type (loop_vinfo,
7726 : : type_op);
7727 : 8 : if (!vectype_op)
7728 : : return false;
7729 : : }
7730 : :
7731 : : /* For lane-reducing operation vectorizable analysis needs the
7732 : : reduction PHI information. */
7733 : 545 : STMT_VINFO_REDUC_DEF (def) = phi_info;
7734 : :
7735 : : /* Each lane-reducing operation has its own input vectype, while
7736 : : reduction PHI will record the input vectype with the least
7737 : : lanes. */
7738 : 545 : STMT_VINFO_REDUC_VECTYPE_IN (vdef) = vectype_op;
7739 : :
7740 : : /* To accommodate lane-reducing operations of mixed input
7741 : : vectypes, choose input vectype with the least lanes for the
7742 : : reduction PHI statement, which would result in the most
7743 : : ncopies for vectorized reduction results. */
7744 : 545 : if (!vectype_in
7745 : 545 : || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7746 : 464 : < GET_MODE_SIZE (SCALAR_TYPE_MODE (type_op))))
7747 : 313 : vectype_in = vectype_op;
7748 : : }
7749 : : else
7750 : 50684 : vectype_in = STMT_VINFO_VECTYPE (phi_info);
7751 : : }
7752 : :
7753 : 54563 : reduc_def = op.ops[reduc_idx];
7754 : 54563 : reduc_chain_length++;
7755 : 54563 : if (!stmt_info)
7756 : 1671 : slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
7757 : : }
7758 : : /* PHIs should not participate in patterns. */
7759 : 46634 : gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7760 : :
7761 : 46634 : if (nested_in_vect_loop_p (loop, stmt_info))
7762 : : {
7763 : 46634 : loop = loop->inner;
7764 : 46634 : nested_cycle = true;
7765 : : }
7766 : :
7767 : : /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
7768 : : element. */
7769 : 46634 : if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7770 : : {
7771 : 272 : gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
7772 : : stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7773 : : }
7774 : 46634 : if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7775 : 272 : gcc_assert (REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
7776 : :
7777 : : /* 1. Is vectorizable reduction? */
7778 : : /* Not supportable if the reduction variable is used in the loop, unless
7779 : : it's a reduction chain. */
7780 : 46634 : if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
7781 : 46634 : && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7782 : : return false;
7783 : :
7784 : : /* Reductions that are not used even in an enclosing outer-loop,
7785 : : are expected to be "live" (used out of the loop). */
7786 : 46634 : if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
7787 : 0 : && !STMT_VINFO_LIVE_P (stmt_info))
7788 : : return false;
7789 : :
7790 : : /* 2. Has this been recognized as a reduction pattern?
7791 : :
7792 : : Check if STMT represents a pattern that has been recognized
7793 : : in earlier analysis stages. For stmts that represent a pattern,
7794 : : the STMT_VINFO_RELATED_STMT field records the last stmt in
7795 : : the original sequence that constitutes the pattern. */
7796 : :
7797 : 46634 : stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7798 : 46634 : if (orig_stmt_info)
7799 : : {
7800 : 2961 : gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
7801 : 2961 : gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
7802 : : }
7803 : :
7804 : : /* 3. Check the operands of the operation. The first operands are defined
7805 : : inside the loop body. The last operand is the reduction variable,
7806 : : which is defined by the loop-header-phi. */
7807 : :
7808 : 46634 : tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7809 : 46634 : STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
7810 : 46634 : STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
7811 : :
7812 : 46634 : gimple_match_op op;
7813 : 46634 : if (!gimple_extract_op (stmt_info->stmt, &op))
7814 : 0 : gcc_unreachable ();
7815 : 46634 : bool lane_reducing = lane_reducing_op_p (op.code);
7816 : :
7817 : 46634 : if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7818 : 12916 : && !SCALAR_FLOAT_TYPE_P (op.type))
7819 : : return false;
7820 : :
7821 : : /* Do not try to vectorize bit-precision reductions. */
7822 : 46634 : if (!type_has_mode_precision_p (op.type))
7823 : : return false;
7824 : :
7825 : : /* Lane-reducing ops can also never be used in an SLP reduction group
7826 : : since we'll mix lanes belonging to different reductions. But it's
7827 : : OK to use them in a reduction chain or when the reduction group
7828 : : has just one element. */
7829 : 45083 : if (lane_reducing
7830 : 313 : && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7831 : 45373 : && SLP_TREE_LANES (slp_node) > 1)
7832 : : {
7833 : 0 : if (dump_enabled_p ())
7834 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7835 : : "lane-reducing reduction in reduction group.\n");
7836 : 0 : return false;
7837 : : }
7838 : :
7839 : : /* All uses but the last are expected to be defined in the loop.
7840 : : The last use is the reduction variable. In case of nested cycle this
7841 : : assumption is not true: we use reduc_index to record the index of the
7842 : : reduction variable. */
7843 : 45083 : slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7844 : 45083 : tree *vectype_op = XALLOCAVEC (tree, op.num_ops);
7845 : : /* We need to skip an extra operand for COND_EXPRs with embedded
7846 : : comparison. */
7847 : 45083 : unsigned opno_adjust = 0;
7848 : 45083 : if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
7849 : 45083 : opno_adjust = 1;
7850 : 142770 : for (i = 0; i < (int) op.num_ops; i++)
7851 : : {
7852 : : /* The condition of COND_EXPR is checked in vectorizable_condition(). */
7853 : 97738 : if (i == 0 && op.code == COND_EXPR)
7854 : 49037 : continue;
7855 : :
7856 : 97037 : stmt_vec_info def_stmt_info;
7857 : 97037 : enum vect_def_type dt;
7858 : 97037 : if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
7859 : 97037 : i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
7860 : 97037 : &vectype_op[i], &def_stmt_info))
7861 : : {
7862 : 0 : if (dump_enabled_p ())
7863 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7864 : : "use not simple.\n");
7865 : 51 : return false;
7866 : : }
7867 : :
7868 : : /* Skip reduction operands, and for an IFN_COND_OP we might hit the
7869 : : reduction operand twice (once as definition, once as else). */
7870 : 97037 : if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
7871 : 48336 : continue;
7872 : :
7873 : : /* There should be only one cycle def in the stmt, the one
7874 : : leading to reduc_def. */
7875 : 48701 : if (VECTORIZABLE_CYCLE_DEF (dt))
7876 : : return false;
7877 : :
7878 : 48650 : if (!vectype_op[i])
7879 : 3817 : vectype_op[i]
7880 : 3817 : = get_vectype_for_scalar_type (loop_vinfo,
7881 : 3817 : TREE_TYPE (op.ops[i]), slp_op[i]);
7882 : :
7883 : : /* Record how the non-reduction-def value of COND_EXPR is defined.
7884 : : ??? For a chain of multiple CONDs we'd have to match them up all. */
7885 : 48650 : if (op.code == COND_EXPR && reduc_chain_length == 1)
7886 : : {
7887 : 672 : if (dt == vect_constant_def)
7888 : : {
7889 : 49 : cond_reduc_dt = dt;
7890 : 49 : cond_reduc_val = op.ops[i];
7891 : : }
7892 : 623 : else if (dt == vect_induction_def
7893 : 377 : && def_stmt_info
7894 : 1000 : && is_nonwrapping_integer_induction (def_stmt_info, loop))
7895 : : {
7896 : 112 : cond_reduc_dt = dt;
7897 : 112 : cond_stmt_vinfo = def_stmt_info;
7898 : : }
7899 : : }
7900 : : }
7901 : :
7902 : 45032 : enum vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (phi_info);
7903 : 45032 : STMT_VINFO_REDUC_TYPE (reduc_info) = reduction_type;
7904 : : /* If we have a condition reduction, see if we can simplify it further. */
7905 : 45032 : if (reduction_type == COND_REDUCTION)
7906 : : {
7907 : 671 : if (SLP_TREE_LANES (slp_node) != 1)
7908 : : return false;
7909 : :
7910 : : /* When the condition uses the reduction value in the condition, fail. */
7911 : 671 : if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
7912 : : {
7913 : 0 : if (dump_enabled_p ())
7914 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7915 : : "condition depends on previous iteration\n");
7916 : 0 : return false;
7917 : : }
7918 : :
7919 : 671 : if (reduc_chain_length == 1
7920 : 671 : && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in,
7921 : : OPTIMIZE_FOR_SPEED)
7922 : 642 : || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
7923 : : vectype_in,
7924 : : OPTIMIZE_FOR_SPEED)))
7925 : : {
7926 : 0 : if (dump_enabled_p ())
7927 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7928 : : "optimizing condition reduction with"
7929 : : " FOLD_EXTRACT_LAST.\n");
7930 : 0 : STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7931 : : }
7932 : 671 : else if (cond_reduc_dt == vect_induction_def)
7933 : : {
7934 : 99 : tree base
7935 : : = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7936 : 99 : tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7937 : :
7938 : 99 : gcc_assert (TREE_CODE (base) == INTEGER_CST
7939 : : && TREE_CODE (step) == INTEGER_CST);
7940 : 99 : cond_reduc_val = NULL_TREE;
7941 : 99 : enum tree_code cond_reduc_op_code = ERROR_MARK;
7942 : 99 : tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7943 : 99 : if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7944 : : ;
7945 : : /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
7946 : : above base; punt if base is the minimum value of the type for
7947 : : MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
7948 : 87 : else if (tree_int_cst_sgn (step) == -1)
7949 : : {
7950 : 20 : cond_reduc_op_code = MIN_EXPR;
7951 : 20 : if (tree_int_cst_sgn (base) == -1)
7952 : 0 : cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7953 : 20 : else if (tree_int_cst_lt (base,
7954 : 20 : TYPE_MAX_VALUE (TREE_TYPE (base))))
7955 : 20 : cond_reduc_val
7956 : 20 : = int_const_binop (PLUS_EXPR, base, integer_one_node);
7957 : : }
7958 : : else
7959 : : {
7960 : 67 : cond_reduc_op_code = MAX_EXPR;
7961 : 67 : if (tree_int_cst_sgn (base) == 1)
7962 : 0 : cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7963 : 67 : else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7964 : : base))
7965 : 67 : cond_reduc_val
7966 : 67 : = int_const_binop (MINUS_EXPR, base, integer_one_node);
7967 : : }
7968 : 87 : if (cond_reduc_val)
7969 : : {
7970 : 87 : if (dump_enabled_p ())
7971 : 64 : dump_printf_loc (MSG_NOTE, vect_location,
7972 : : "condition expression based on "
7973 : : "integer induction.\n");
7974 : 87 : STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
7975 : 87 : STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
7976 : 87 : = cond_reduc_val;
7977 : 87 : STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7978 : : }
7979 : : }
7980 : 572 : else if (cond_reduc_dt == vect_constant_def)
7981 : : {
7982 : 44 : enum vect_def_type cond_initial_dt;
7983 : 44 : tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7984 : 44 : vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7985 : 44 : if (cond_initial_dt == vect_constant_def
7986 : 61 : && types_compatible_p (TREE_TYPE (cond_initial_val),
7987 : 17 : TREE_TYPE (cond_reduc_val)))
7988 : : {
7989 : 17 : tree e = fold_binary (LE_EXPR, boolean_type_node,
7990 : : cond_initial_val, cond_reduc_val);
7991 : 17 : if (e && (integer_onep (e) || integer_zerop (e)))
7992 : : {
7993 : 17 : if (dump_enabled_p ())
7994 : 12 : dump_printf_loc (MSG_NOTE, vect_location,
7995 : : "condition expression based on "
7996 : : "compile time constant.\n");
7997 : : /* Record reduction code at analysis stage. */
7998 : 17 : STMT_VINFO_REDUC_CODE (reduc_info)
7999 : 17 : = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
8000 : 17 : STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
8001 : : }
8002 : : }
8003 : : }
8004 : : }
8005 : :
8006 : 45032 : if (STMT_VINFO_LIVE_P (phi_info))
8007 : : return false;
8008 : :
8009 : 45032 : ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8010 : :
8011 : 45032 : gcc_assert (ncopies >= 1);
8012 : :
8013 : 45032 : poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
8014 : :
8015 : 45032 : if (nested_cycle)
8016 : : {
8017 : 241 : gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
8018 : : == vect_double_reduction_def);
8019 : : double_reduc = true;
8020 : : }
8021 : :
8022 : : /* 4.2. Check support for the epilog operation.
8023 : :
8024 : : If STMT represents a reduction pattern, then the type of the
8025 : : reduction variable may be different than the type of the rest
8026 : : of the arguments. For example, consider the case of accumulation
8027 : : of shorts into an int accumulator; The original code:
8028 : : S1: int_a = (int) short_a;
8029 : : orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
8030 : :
8031 : : was replaced with:
8032 : : STMT: int_acc = widen_sum <short_a, int_acc>
8033 : :
8034 : : This means that:
8035 : : 1. The tree-code that is used to create the vector operation in the
8036 : : epilog code (that reduces the partial results) is not the
8037 : : tree-code of STMT, but is rather the tree-code of the original
8038 : : stmt from the pattern that STMT is replacing. I.e, in the example
8039 : : above we want to use 'widen_sum' in the loop, but 'plus' in the
8040 : : epilog.
8041 : : 2. The type (mode) we use to check available target support
8042 : : for the vector operation to be created in the *epilog*, is
8043 : : determined by the type of the reduction variable (in the example
8044 : : above we'd check this: optab_handler (plus_optab, vect_int_mode])).
8045 : : However the type (mode) we use to check available target support
8046 : : for the vector operation to be created *inside the loop*, is
8047 : : determined by the type of the other arguments to STMT (in the
8048 : : example we'd check this: optab_handler (widen_sum_optab,
8049 : : vect_short_mode)).
8050 : :
8051 : : This is contrary to "regular" reductions, in which the types of all
8052 : : the arguments are the same as the type of the reduction variable.
8053 : : For "regular" reductions we can therefore use the same vector type
8054 : : (and also the same tree-code) when generating the epilog code and
8055 : : when generating the code inside the loop. */
8056 : :
8057 : 45032 : code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
8058 : :
8059 : : /* If-conversion might have created a conditional operation like
8060 : : IFN_COND_ADD already. Use the internal code for the following checks. */
8061 : 45032 : if (orig_code.is_internal_fn ())
8062 : : {
8063 : 2635 : tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
8064 : 2635 : orig_code = new_code != ERROR_MARK ? new_code : orig_code;
8065 : : }
8066 : :
8067 : 45032 : STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
8068 : :
8069 : 45032 : reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
8070 : 45032 : if (reduction_type == TREE_CODE_REDUCTION)
8071 : : {
8072 : : /* Check whether it's ok to change the order of the computation.
8073 : : Generally, when vectorizing a reduction we change the order of the
8074 : : computation. This may change the behavior of the program in some
8075 : : cases, so we need to check that this is ok. One exception is when
8076 : : vectorizing an outer-loop: the inner-loop is executed sequentially,
8077 : : and therefore vectorizing reductions in the inner-loop during
8078 : : outer-loop vectorization is safe. Likewise when we are vectorizing
8079 : : a series of reductions using SLP and the VF is one the reductions
8080 : : are performed in scalar order. */
8081 : 42475 : if (!REDUC_GROUP_FIRST_ELEMENT (stmt_info)
8082 : 42475 : && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
8083 : : ;
8084 : 42361 : else if (needs_fold_left_reduction_p (op.type, orig_code))
8085 : : {
8086 : : /* When vectorizing a reduction chain w/o SLP the reduction PHI
8087 : :          is not directly used in stmt. */
8088 : 3110 : if (!only_slp_reduc_chain
8089 : 3110 : && reduc_chain_length != 1)
8090 : : {
8091 : 53 : if (dump_enabled_p ())
8092 : 6 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8093 : : "in-order reduction chain without SLP.\n");
8094 : 53 : return false;
8095 : : }
8096 : : /* Code generation doesn't support function calls other
8097 : : than .COND_*. */
8098 : 3057 : if (!op.code.is_tree_code ()
8099 : 3127 : && !(op.code.is_internal_fn ()
8100 : 35 : && conditional_internal_fn_code (internal_fn (op.code))
8101 : : != ERROR_MARK))
8102 : : {
8103 : 10 : if (dump_enabled_p ())
8104 : 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8105 : : "in-order reduction chain operation not "
8106 : : "supported.\n");
8107 : 10 : return false;
8108 : : }
8109 : 3047 : STMT_VINFO_REDUC_TYPE (reduc_info)
8110 : 3047 : = reduction_type = FOLD_LEFT_REDUCTION;
8111 : : }
8112 : 39251 : else if (!commutative_binary_op_p (orig_code, op.type)
8113 : 39251 : || !associative_binary_op_p (orig_code, op.type))
8114 : : {
8115 : 138 : if (dump_enabled_p ())
8116 : 12 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8117 : : "reduction: not commutative/associative\n");
8118 : 138 : return false;
8119 : : }
8120 : : }
8121 : :
8122 : 3047 : if ((reduction_type == COND_REDUCTION
8123 : : || reduction_type == INTEGER_INDUC_COND_REDUCTION
8124 : : || reduction_type == CONST_COND_REDUCTION
8125 : 41784 : || reduction_type == EXTRACT_LAST_REDUCTION)
8126 : : && 1
8127 : 689 : && ncopies > 1)
8128 : : {
8129 : 292 : if (dump_enabled_p ())
8130 : 84 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8131 : : "multiple types in condition reduction.\n");
8132 : 292 : return false;
8133 : : }
8134 : :
8135 : 44539 : internal_fn reduc_fn = IFN_LAST;
8136 : 44539 : if (reduction_type == TREE_CODE_REDUCTION
8137 : 44539 : || reduction_type == FOLD_LEFT_REDUCTION
8138 : : || reduction_type == INTEGER_INDUC_COND_REDUCTION
8139 : 397 : || reduction_type == CONST_COND_REDUCTION)
8140 : : {
8141 : 39341 : if (reduction_type == FOLD_LEFT_REDUCTION
8142 : 48381 : ? fold_left_reduction_fn (orig_code, &reduc_fn)
8143 : 39341 : : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
8144 : : {
8145 : 43580 : if (reduc_fn != IFN_LAST
8146 : 43580 : && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
8147 : : OPTIMIZE_FOR_SPEED))
8148 : : {
8149 : 10837 : if (dump_enabled_p ())
8150 : 754 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8151 : : "reduc op not supported by target.\n");
8152 : :
8153 : 10837 : reduc_fn = IFN_LAST;
8154 : : }
8155 : : }
8156 : : else
8157 : : {
8158 : 676 : if (!nested_cycle || double_reduc)
8159 : : {
8160 : 676 : if (dump_enabled_p ())
8161 : 48 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8162 : : "no reduc code for scalar code.\n");
8163 : :
8164 : 676 : return false;
8165 : : }
8166 : : }
8167 : : }
8168 : 283 : else if (reduction_type == COND_REDUCTION)
8169 : : {
8170 : 283 : int scalar_precision
8171 : 283 : = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
8172 : 283 : cr_index_scalar_type = make_unsigned_type (scalar_precision);
8173 : 283 : cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
8174 : : vectype_out);
8175 : :
8176 : 283 : if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
8177 : : OPTIMIZE_FOR_SPEED))
8178 : 7 : reduc_fn = IFN_REDUC_MAX;
8179 : : }
8180 : 43863 : STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
8181 : :
8182 : 43863 : if (reduction_type != EXTRACT_LAST_REDUCTION
8183 : : && (!nested_cycle || double_reduc)
8184 : : && reduc_fn == IFN_LAST
8185 : : && !nunits_out.is_constant ())
8186 : : {
8187 : : if (dump_enabled_p ())
8188 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8189 : : "missing target support for reduction on"
8190 : : " variable-length vectors.\n");
8191 : : return false;
8192 : : }
8193 : :
8194 : : /* For SLP reductions, see if there is a neutral value we can use. */
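     : :   /* (Editor's note) The neutral value is e.g. 0 for PLUS/MINUS/IOR/XOR,
     : :      1 for MULT, all-ones for AND, and for MIN/MAX the initial value
     : :      itself is used.  */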
8195 : 43863 : tree neutral_op = NULL_TREE;
8196 : 43863 : tree initial_value = NULL_TREE;
8197 : 43863 : if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
8198 : 272 : initial_value = vect_phi_initial_value (reduc_def_phi);
8199 : 43863 : neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8200 : : orig_code, initial_value);
8201 : :
8202 : 43863 : if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
8203 : : {
8204 : : /* We can't support in-order reductions of code such as this:
8205 : :
8206 : : for (int i = 0; i < n1; ++i)
8207 : : for (int j = 0; j < n2; ++j)
8208 : : l += a[j];
8209 : :
8210 : : since GCC effectively transforms the loop when vectorizing:
8211 : :
8212 : : for (int i = 0; i < n1 / VF; ++i)
8213 : : for (int j = 0; j < n2; ++j)
8214 : : for (int k = 0; k < VF; ++k)
8215 : : l += a[j];
8216 : :
8217 : : which is a reassociation of the original operation. */
8218 : 56 : if (dump_enabled_p ())
8219 : 20 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8220 : : "in-order double reduction not supported.\n");
8221 : :
8222 : 56 : return false;
8223 : : }
8224 : :
8225 : 43807 : if (reduction_type == FOLD_LEFT_REDUCTION
8226 : 4183 : && SLP_TREE_LANES (slp_node) > 1
8227 : 43909 : && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8228 : : {
8229 : : /* We cannot use in-order reductions in this case because there is
8230 : : an implicit reassociation of the operations involved. */
8231 : 42 : if (dump_enabled_p ())
8232 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8233 : : "in-order unchained SLP reductions not supported.\n");
8234 : 42 : return false;
8235 : : }
8236 : :
8237 : : /* For double reductions, and for SLP reductions with a neutral value,
8238 : : we construct a variable-length initial vector by loading a vector
8239 : : full of the neutral value and then shift-and-inserting the start
8240 : : values into the low-numbered elements. */
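     : :   /* For instance (editor's sketch): for a plus reduction with start value s
     : :      and neutral value 0, the code built later is roughly
     : :        init_v = { 0, 0, ..., 0 };             // splat of the neutral value
     : :        init_v = .VEC_SHL_INSERT (init_v, s);  // -> { s, 0, ..., 0 }
     : :      which works for any (possibly runtime-variable) number of lanes.  */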
8241 : 43765 : if ((double_reduc || neutral_op)
8242 : : && !nunits_out.is_constant ()
8243 : : && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
8244 : : vectype_out, OPTIMIZE_FOR_SPEED))
8245 : : {
8246 : : if (dump_enabled_p ())
8247 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8248 : : "reduction on variable-length vectors requires"
8249 : : " target support for a vector-shift-and-insert"
8250 : : " operation.\n");
8251 : : return false;
8252 : : }
8253 : :
8254 : : /* Check extra constraints for variable-length unchained SLP reductions. */
8255 : 43765 : if (!REDUC_GROUP_FIRST_ELEMENT (stmt_info)
8256 : : && !nunits_out.is_constant ())
8257 : : {
8258 : : /* We checked above that we could build the initial vector when
8259 : : there's a neutral element value. Check here for the case in
8260 : : which each SLP statement has its own initial value and in which
8261 : : that value needs to be repeated for every instance of the
8262 : : statement within the initial vector. */
8263 : : unsigned int group_size = SLP_TREE_LANES (slp_node);
8264 : : if (!neutral_op
8265 : : && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
8266 : : TREE_TYPE (vectype_out)))
8267 : : {
8268 : : if (dump_enabled_p ())
8269 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8270 : : "unsupported form of SLP reduction for"
8271 : : " variable-length vectors: cannot build"
8272 : : " initial vector.\n");
8273 : : return false;
8274 : : }
8275 : : /* The epilogue code relies on the number of elements being a multiple
8276 : : of the group size. The duplicate-and-interleave approach to setting
8277 : : up the initial vector does too. */
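     : :       /* E.g. (editor's note): with group_size == 3 and a variable-length
     : :          vector whose lane count is only known to be a multiple of 4,
     : :          multiple_p cannot be proven and the reduction is rejected here.  */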
8278 : : if (!multiple_p (nunits_out, group_size))
8279 : : {
8280 : : if (dump_enabled_p ())
8281 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8282 : : "unsupported form of SLP reduction for"
8283 : : " variable-length vectors: the vector size"
8284 : : " is not a multiple of the number of results.\n");
8285 : : return false;
8286 : : }
8287 : : }
8288 : :
8289 : 43765 : if (reduction_type == COND_REDUCTION)
8290 : : {
8291 : 283 : widest_int ni;
8292 : :
8293 : 283 : if (! max_loop_iterations (loop, &ni))
8294 : : {
8295 : 0 : if (dump_enabled_p ())
8296 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
8297 : : "loop count not known, cannot create cond "
8298 : : "reduction.\n");
8299 : 0 : return false;
8300 : : }
8301 : : /* Convert backedges to iterations. */
8302 : 283 : ni += 1;
8303 : :
8304 : :       /* The additional index will have the same type as the condition.  Check
8305 : :          that the loop iteration count fits into this type less one (the zero
8306 : :          slot is reserved for the case where there are no matches). */
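     : :       /* E.g. (editor's note): if the index type ends up being an 8-bit
     : :          unsigned type, max_index is 255, so only loops with at most 254
     : :          iterations can use this scheme; larger counts are rejected below.  */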
8307 : 283 : tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
8308 : 283 : if (wi::geu_p (ni, wi::to_widest (max_index)))
8309 : : {
8310 : 90 : if (dump_enabled_p ())
8311 : 54 : dump_printf_loc (MSG_NOTE, vect_location,
8312 : : "loop size is greater than data size.\n");
8313 : 90 : return false;
8314 : : }
8315 : 283 : }
8316 : :
8317 : : /* In case the vectorization factor (VF) is bigger than the number
8318 : : of elements that we can fit in a vectype (nunits), we have to generate
8319 : :      more than one vector stmt - i.e., we need to "unroll" the
8320 : : vector stmt by a factor VF/nunits. For more details see documentation
8321 : : in vectorizable_operation. */
8322 : :
8323 : : /* If the reduction is used in an outer loop we need to generate
8324 : : VF intermediate results, like so (e.g. for ncopies=2):
8325 : : r0 = phi (init, r0)
8326 : : r1 = phi (init, r1)
8327 : : r0 = x0 + r0;
8328 : : r1 = x1 + r1;
8329 : : (i.e. we generate VF results in 2 registers).
8330 : : In this case we have a separate def-use cycle for each copy, and therefore
8331 : : for each copy we get the vector def for the reduction variable from the
8332 : : respective phi node created for this copy.
8333 : :
8334 : : Otherwise (the reduction is unused in the loop nest), we can combine
8335 : : together intermediate results, like so (e.g. for ncopies=2):
8336 : : r = phi (init, r)
8337 : : r = x0 + r;
8338 : : r = x1 + r;
8339 : : (i.e. we generate VF/2 results in a single register).
8340 : : In this case for each copy we get the vector def for the reduction variable
8341 : : from the vectorized reduction operation generated in the previous iteration.
8342 : :
8343 : : This only works when we see both the reduction PHI and its only consumer
8344 : : in vectorizable_reduction and there are no intermediate stmts
8345 : : participating. When unrolling we want each unrolled iteration to have its
8346 : : own reduction accumulator since one of the main goals of unrolling a
8347 : : reduction is to reduce the aggregate loop-carried latency. */
8348 : 43675 : if (ncopies > 1
8349 : 6528 : && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
8350 : 6340 : && SLP_TREE_LANES (slp_node) == 1
8351 : 6270 : && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
8352 : 6233 : && reduc_chain_length == 1
8353 : 49726 : && loop_vinfo->suggested_unroll_factor == 1)
8354 : 43675 : single_defuse_cycle = true;
8355 : :
8356 : 43675 : if (single_defuse_cycle && !lane_reducing)
8357 : : {
8358 : 5763 : gcc_assert (op.code != COND_EXPR);
8359 : :
8360 : : /* 4. check support for the operation in the loop
8361 : :
8362 : : This isn't necessary for the lane reduction codes, since they
8363 : : can only be produced by pattern matching, and it's up to the
8364 : : pattern matcher to test for support. The main reason for
8365 : : specifically skipping this step is to avoid rechecking whether
8366 : : mixed-sign dot-products can be implemented using signed
8367 : : dot-products. */
8368 : 5763 : machine_mode vec_mode = TYPE_MODE (vectype_in);
8369 : 5763 : if (!directly_supported_p (op.code, vectype_in, optab_vector))
8370 : : {
8371 : 2432 : if (dump_enabled_p ())
8372 : 18 : dump_printf (MSG_NOTE, "op not supported by target.\n");
8373 : 4928 : if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
8374 : 2432 : || !vect_can_vectorize_without_simd_p (op.code))
8375 : : single_defuse_cycle = false;
8376 : : else
8377 : 1281 : if (dump_enabled_p ())
8378 : 8 : dump_printf (MSG_NOTE, "proceeding using word mode.\n");
8379 : : }
8380 : :
8381 : 5763 : if (vect_emulated_vector_p (vectype_in)
8382 : 5763 : && !vect_can_vectorize_without_simd_p (op.code))
8383 : : {
8384 : 0 : if (dump_enabled_p ())
8385 : 0 : dump_printf (MSG_NOTE, "using word mode not possible.\n");
8386 : 0 : return false;
8387 : : }
8388 : : }
8389 : 43675 : if (dump_enabled_p () && single_defuse_cycle)
8390 : 633 : dump_printf_loc (MSG_NOTE, vect_location,
8391 : : "using single def-use cycle for reduction by reducing "
8392 : : "multiple vectors to one in the loop body\n");
8393 : 43675 : STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
8394 : :
8395 : :   /* For a lane-reducing operation, the processing below related to the single
8396 : :      def-use cycle will be done in its own vectorizable function.  Note also
8397 : :      that such an operation must not be involved in a fold-left
8398 : :      reduction. */
8399 : 43675 : single_defuse_cycle &= !lane_reducing;
8400 : :
8401 : 43675 : if (single_defuse_cycle || reduction_type == FOLD_LEFT_REDUCTION)
8402 : 27407 : for (i = 0; i < (int) op.num_ops; i++)
8403 : 18706 : if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
8404 : : {
8405 : 0 : if (dump_enabled_p ())
8406 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8407 : : "incompatible vector types for invariants\n");
8408 : 0 : return false;
8409 : : }
8410 : :
8411 : 43675 : vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
8412 : : reduction_type, ncopies, cost_vec);
8413 : : /* Cost the reduction op inside the loop if transformed via
8414 : : vect_transform_reduction for non-lane-reducing operation. Otherwise
8415 : : this is costed by the separate vectorizable_* routines. */
8416 : 43675 : if (single_defuse_cycle)
8417 : 4612 : record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body);
8418 : :
8419 : 43675 : if (dump_enabled_p ()
8420 : 43675 : && reduction_type == FOLD_LEFT_REDUCTION)
8421 : 200 : dump_printf_loc (MSG_NOTE, vect_location,
8422 : : "using an in-order (fold-left) reduction.\n");
8423 : 43675 : STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
8424 : :
8425 : : /* All but single defuse-cycle optimized and fold-left reductions go
8426 : : through their own vectorizable_* routines. */
8427 : 43675 : if (!single_defuse_cycle && reduction_type != FOLD_LEFT_REDUCTION)
8428 : : {
8429 : 34974 : stmt_vec_info tem
8430 : 34974 : = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
8431 : 34974 : if (REDUC_GROUP_FIRST_ELEMENT (tem))
8432 : : {
8433 : 212 : gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
8434 : : tem = REDUC_GROUP_FIRST_ELEMENT (tem);
8435 : : }
8436 : 34974 : STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
8437 : 34974 : STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
8438 : : }
8439 : 8701 : else if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8440 : 4 : vect_reduction_update_partial_vector_usage (loop_vinfo, reduc_info,
8441 : : slp_node, op.code, op.type,
8442 : : vectype_in);
8443 : : return true;
8444 : : }
8445 : :
8446 : : /* STMT_INFO is a dot-product reduction whose multiplication operands
8447 : : have different signs. Emit a sequence to emulate the operation
8448 : : using a series of signed DOT_PROD_EXPRs and return the last
8449 : : statement generated. VEC_DEST is the result of the vector operation
8450 : : and VOP lists its inputs. */
8451 : :
8452 : : static gassign *
8453 : 2 : vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8454 : : gimple_stmt_iterator *gsi, tree vec_dest,
8455 : : tree vop[3])
8456 : : {
8457 : 2 : tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
8458 : 2 : tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
8459 : 2 : tree narrow_elttype = TREE_TYPE (narrow_vectype);
8460 : 2 : gimple *new_stmt;
8461 : :
8462 : :   /* Make VOP[0] the unsigned operand and VOP[1] the signed operand. */
8463 : 2 : if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
8464 : 0 : std::swap (vop[0], vop[1]);
8465 : :
8466 : : /* Convert all inputs to signed types. */
8467 : 8 : for (int i = 0; i < 3; ++i)
8468 : 6 : if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
8469 : : {
8470 : 2 : tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
8471 : 2 : new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
8472 : 2 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8473 : 2 : vop[i] = tmp;
8474 : : }
8475 : :
8476 : : /* In the comments below we assume 8-bit inputs for simplicity,
8477 : : but the approach works for any full integer type. */
8478 : :
8479 : : /* Create a vector of -128. */
8480 : 2 : tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
8481 : 2 : tree min_narrow = build_vector_from_val (narrow_vectype,
8482 : : min_narrow_elttype);
8483 : :
8484 : : /* Create a vector of 64. */
8485 : 2 : auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
8486 : 2 : tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
8487 : 2 : half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
8488 : :
8489 : : /* Emit: SUB_RES = VOP[0] - 128. */
8490 : 2 : tree sub_res = make_ssa_name (narrow_vectype);
8491 : 2 : new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
8492 : 2 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8493 : :
8494 : : /* Emit:
8495 : :
8496 : : STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
8497 : : STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
8498 : :        STAGE3 = DOT_PROD_EXPR <SUB_RES, VOP[1], STAGE2>;
8499 : :
8500 : : on the basis that x * y == (x - 128) * y + 64 * y + 64 * y
8501 : : Doing the two 64 * y steps first allows more time to compute x. */
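     : :   /* A quick numeric check of that identity (editor's note): for x = 200
     : :      (an unsigned char lane) and y = 5,
     : :        x * y                        = 1000
     : :        (x - 128) * y + 64*y + 64*y  = 72*5 + 320 + 320 = 1000,
     : :      and 72, 64 and every x - 128 lie in the signed char range, which is
     : :      what makes the signed DOT_PROD_EXPRs below usable.  */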
8502 : 2 : tree stage1 = make_ssa_name (wide_vectype);
8503 : 2 : new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
8504 : : vop[1], half_narrow, vop[2]);
8505 : 2 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8506 : :
8507 : 2 : tree stage2 = make_ssa_name (wide_vectype);
8508 : 2 : new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
8509 : : vop[1], half_narrow, stage1);
8510 : 2 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8511 : :
8512 : 2 : tree stage3 = make_ssa_name (wide_vectype);
8513 : 2 : new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
8514 : : sub_res, vop[1], stage2);
8515 : 2 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8516 : :
8517 : : /* Convert STAGE3 to the reduction type. */
8518 : 2 : return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
8519 : 2 : }
8520 : :
8521 : : /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
8522 : : value. */
8523 : :
8524 : : bool
8525 : 2297 : vect_transform_reduction (loop_vec_info loop_vinfo,
8526 : : stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
8527 : : slp_tree slp_node)
8528 : : {
8529 : 2297 : tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8530 : 2297 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8531 : 2297 : unsigned vec_num;
8532 : :
8533 : 2297 : stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8534 : 2297 : gcc_assert (reduc_info->is_reduc_info);
8535 : :
8536 : 2297 : if (nested_in_vect_loop_p (loop, stmt_info))
8537 : : {
8538 : 0 : loop = loop->inner;
8539 : 0 : gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
8540 : : }
8541 : :
8542 : 2297 : gimple_match_op op;
8543 : 2297 : if (!gimple_extract_op (stmt_info->stmt, &op))
8544 : 0 : gcc_unreachable ();
8545 : :
8546 : : /* All uses but the last are expected to be defined in the loop.
8547 : : The last use is the reduction variable. In case of nested cycle this
8548 : : assumption is not true: we use reduc_index to record the index of the
8549 : : reduction variable. */
8550 : 2297 : stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
8551 : 2297 : gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
8552 : 2297 : int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
8553 : 2297 : tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (stmt_info);
8554 : :
8555 : 2297 : if (!vectype_in)
8556 : 2060 : vectype_in = STMT_VINFO_VECTYPE (stmt_info);
8557 : :
8558 : 2297 : vec_num = vect_get_num_copies (loop_vinfo, slp_node, vectype_in);
8559 : :
8560 : 2297 : code_helper code = canonicalize_code (op.code, op.type);
8561 : 2297 : internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
8562 : :
8563 : 2297 : vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8564 : 2297 : vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8565 : 2297 : bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
8566 : :
8567 : : /* Transform. */
8568 : 2297 : tree new_temp = NULL_TREE;
8569 : 16079 : auto_vec<tree> vec_oprnds[3];
8570 : :
8571 : 2297 : if (dump_enabled_p ())
8572 : 685 : dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
8573 : :
8574 : : /* A binary COND_OP reduction must have the same definition and else
8575 : : value. */
8576 : 2572 : bool cond_fn_p = code.is_internal_fn ()
8577 : 275 : && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
8578 : 275 : if (cond_fn_p)
8579 : : {
8580 : 275 : gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
8581 : : || code == IFN_COND_MUL || code == IFN_COND_AND
8582 : : || code == IFN_COND_IOR || code == IFN_COND_XOR
8583 : : || code == IFN_COND_MIN || code == IFN_COND_MAX);
8584 : 275 : gcc_assert (op.num_ops == 4
8585 : : && (op.ops[reduc_index]
8586 : : == op.ops[internal_fn_else_index ((internal_fn) code)]));
8587 : : }
8588 : :
8589 : 2297 : bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8590 : :
8591 : 2297 : vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
8592 : 2297 : if (reduction_type == FOLD_LEFT_REDUCTION)
8593 : : {
8594 : 834 : internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
8595 : 834 : gcc_assert (code.is_tree_code () || cond_fn_p);
8596 : 834 : return vectorize_fold_left_reduction
8597 : 834 : (loop_vinfo, stmt_info, gsi, slp_node, reduc_def_phi,
8598 : 834 : code, reduc_fn, op.num_ops, vectype_in,
8599 : 834 : reduc_index, masks, lens);
8600 : : }
8601 : :
8602 : 1463 : bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
8603 : 1463 : bool lane_reducing = lane_reducing_op_p (code);
8604 : 1226 : gcc_assert (single_defuse_cycle || lane_reducing);
8605 : :
8606 : 1463 : if (lane_reducing)
8607 : : {
8608 : : /* The last operand of lane-reducing op is for reduction. */
8609 : 237 : gcc_assert (reduc_index == (int) op.num_ops - 1);
8610 : : }
8611 : :
8612 : : /* Create the destination vector */
8613 : 1463 : tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
8614 : 1463 : tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
8615 : :
8616 : : /* Get NCOPIES vector definitions for all operands except the reduction
8617 : : definition. */
8618 : 1463 : if (!cond_fn_p)
8619 : : {
8620 : 1195 : gcc_assert (reduc_index >= 0 && reduc_index <= 2);
8621 : 1973 : vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, 1,
8622 : 1195 : single_defuse_cycle && reduc_index == 0
8623 : : ? NULL_TREE : op.ops[0], &vec_oprnds[0],
8624 : 1195 : single_defuse_cycle && reduc_index == 1
8625 : : ? NULL_TREE : op.ops[1], &vec_oprnds[1],
8626 : 1195 : op.num_ops == 3
8627 : 237 : && !(single_defuse_cycle && reduc_index == 2)
8628 : : ? op.ops[2] : NULL_TREE, &vec_oprnds[2]);
8629 : : }
8630 : : else
8631 : : {
8632 : : /* For a conditional operation pass the truth type as mask
8633 : : vectype. */
8634 : 268 : gcc_assert (single_defuse_cycle
8635 : : && (reduc_index == 1 || reduc_index == 2));
8636 : 268 : vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, 1, op.ops[0],
8637 : : truth_type_for (vectype_in), &vec_oprnds[0],
8638 : : reduc_index == 1 ? NULL_TREE : op.ops[1],
8639 : : NULL_TREE, &vec_oprnds[1],
8640 : : reduc_index == 2 ? NULL_TREE : op.ops[2],
8641 : : NULL_TREE, &vec_oprnds[2]);
8642 : : }
8643 : :
8644 : : /* For single def-use cycles get one copy of the vectorized reduction
8645 : : definition. */
8646 : 1463 : if (single_defuse_cycle)
8647 : : {
8648 : 1415 : vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, 1,
8649 : : reduc_index == 0 ? op.ops[0] : NULL_TREE,
8650 : : &vec_oprnds[0],
8651 : : reduc_index == 1 ? op.ops[1] : NULL_TREE,
8652 : : &vec_oprnds[1],
8653 : : reduc_index == 2 ? op.ops[2] : NULL_TREE,
8654 : : &vec_oprnds[2]);
8655 : : }
8656 : 48 : else if (lane_reducing)
8657 : : {
8658 : :       /* For a normal reduction, consistency between the vectorized def/use is
8659 : :          ensured naturally when mapping from the scalar statement.  But if a
8660 : :          lane-reducing op is involved in the reduction, things become more
8661 : :          complicated: the op's result and its operand for accumulation are
8662 : :          limited to fewer lanes than the other operands, which causes a
8663 : :          def/use mismatch on adjacent statements around the op unless some
8664 : :          kind of specific adjustment is made.  One approach is to refit the
8665 : :          lane-reducing op by introducing trivial pass-through copies to close
8666 : :          any def/use gap, so that it behaves like a normal op.
8667 : :          Vector reduction PHIs are always generated to the full extent, no
8668 : :          matter whether a lane-reducing op exists or not.  If some copies or
8669 : :          PHIs are actually superfluous, they are cleaned up by passes after
8670 : :          vectorization.  An example for single-lane SLP, with lane-reducing
8671 : :          ops of mixed input vectypes in a reduction chain, is given below.
8672 : :          The same handling is applicable to multiple-lane SLP as well.
8673 : :
8674 : : int sum = 1;
8675 : : for (i)
8676 : : {
8677 : : sum += d0[i] * d1[i]; // dot-prod <vector(16) char>
8678 : : sum += w[i]; // widen-sum <vector(16) char>
8679 : : sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
8680 : : sum += n[i]; // normal <vector(4) int>
8681 : : }
8682 : :
8683 : :          The vector size is 128 bits and the vectorization factor is 16.  The
8684 : :          reduction statements would be transformed as:
8685 : :
8686 : : vector<4> int sum_v0 = { 0, 0, 0, 1 };
8687 : : vector<4> int sum_v1 = { 0, 0, 0, 0 };
8688 : : vector<4> int sum_v2 = { 0, 0, 0, 0 };
8689 : : vector<4> int sum_v3 = { 0, 0, 0, 0 };
8690 : :
8691 : : for (i / 16)
8692 : : {
8693 : : sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
8694 : : sum_v1 = sum_v1; // copy
8695 : : sum_v2 = sum_v2; // copy
8696 : : sum_v3 = sum_v3; // copy
8697 : :
8698 : : sum_v0 = sum_v0; // copy
8699 : : sum_v1 = WIDEN_SUM (w_v1[i: 0 ~ 15], sum_v1);
8700 : : sum_v2 = sum_v2; // copy
8701 : : sum_v3 = sum_v3; // copy
8702 : :
8703 : : sum_v0 = sum_v0; // copy
8704 : : sum_v1 = SAD (s0_v1[i: 0 ~ 7 ], s1_v1[i: 0 ~ 7 ], sum_v1);
8705 : : sum_v2 = SAD (s0_v2[i: 8 ~ 15], s1_v2[i: 8 ~ 15], sum_v2);
8706 : : sum_v3 = sum_v3; // copy
8707 : :
8708 : : sum_v0 += n_v0[i: 0 ~ 3 ];
8709 : : sum_v1 += n_v1[i: 4 ~ 7 ];
8710 : : sum_v2 += n_v2[i: 8 ~ 11];
8711 : : sum_v3 += n_v3[i: 12 ~ 15];
8712 : : }
8713 : :
8714 : :          Moreover, to obtain higher instruction parallelism in the final
8715 : :          vectorized loop, the effective vector lane-reducing ops are
8716 : :          distributed evenly among all def-use cycles.  In the above example,
8717 : :          DOT_PROD, WIDEN_SUM and the SADs are generated into separate cycles,
8718 : :          so the instruction dependencies among them can be eliminated. */
8719 : 48 : unsigned effec_ncopies = vec_oprnds[0].length ();
8720 : 48 : unsigned total_ncopies = vec_oprnds[reduc_index].length ();
8721 : :
8722 : 48 : gcc_assert (effec_ncopies <= total_ncopies);
8723 : :
8724 : 48 : if (effec_ncopies < total_ncopies)
8725 : : {
8726 : 144 : for (unsigned i = 0; i < op.num_ops - 1; i++)
8727 : : {
8728 : 192 : gcc_assert (vec_oprnds[i].length () == effec_ncopies);
8729 : 96 : vec_oprnds[i].safe_grow_cleared (total_ncopies);
8730 : : }
8731 : : }
8732 : :
8733 : 48 : tree reduc_vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8734 : 48 : gcc_assert (reduc_vectype_in);
8735 : :
8736 : 48 : unsigned effec_reduc_ncopies
8737 : 48 : = vect_get_num_copies (loop_vinfo, slp_node, reduc_vectype_in);
8738 : :
8739 : 48 : gcc_assert (effec_ncopies <= effec_reduc_ncopies);
8740 : :
8741 : 48 : if (effec_ncopies < effec_reduc_ncopies)
8742 : : {
8743 : : /* Find suitable def-use cycles to generate vectorized statements
8744 : : into, and reorder operands based on the selection. */
8745 : 25 : unsigned curr_pos = reduc_info->reduc_result_pos;
8746 : 25 : unsigned next_pos = (curr_pos + effec_ncopies) % effec_reduc_ncopies;
8747 : :
8748 : 25 : gcc_assert (curr_pos < effec_reduc_ncopies);
8749 : 25 : reduc_info->reduc_result_pos = next_pos;
8750 : :
8751 : 25 : if (curr_pos)
8752 : : {
8753 : 14 : unsigned count = effec_reduc_ncopies - effec_ncopies;
8754 : 14 : unsigned start = curr_pos - count;
8755 : :
8756 : 14 : if ((int) start < 0)
8757 : : {
8758 : 11 : count = curr_pos;
8759 : 11 : start = 0;
8760 : : }
8761 : :
8762 : 42 : for (unsigned i = 0; i < op.num_ops - 1; i++)
8763 : : {
8764 : 68 : for (unsigned j = effec_ncopies; j > start; j--)
8765 : : {
8766 : 40 : unsigned k = j - 1;
8767 : 40 : std::swap (vec_oprnds[i][k], vec_oprnds[i][k + count]);
8768 : 40 : gcc_assert (!vec_oprnds[i][k]);
8769 : : }
8770 : : }
8771 : : }
8772 : : }
8773 : : }
8774 : :
8775 : 1463 : bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (stmt_info);
8776 : 2433 : unsigned num = vec_oprnds[reduc_index == 0 ? 1 : 0].length ();
8777 : 1463 : unsigned mask_index = 0;
8778 : :
8779 : 6001 : for (unsigned i = 0; i < num; ++i)
8780 : : {
8781 : 4538 : gimple *new_stmt;
8782 : 4538 : tree vop[3] = { vec_oprnds[0][i], vec_oprnds[1][i], NULL_TREE };
8783 : 4538 : if (!vop[0] || !vop[1])
8784 : : {
8785 : 168 : tree reduc_vop = vec_oprnds[reduc_index][i];
8786 : :
8787 : :           /* If we could not generate an effective vector statement for the
8788 : :              current portion of the reduction operand, insert a trivial copy to
8789 : :              simply hand the operand over to other dependent statements. */
8790 : 168 : gcc_assert (reduc_vop);
8791 : :
8792 : 168 : if (TREE_CODE (reduc_vop) == SSA_NAME
8793 : 168 : && !SSA_NAME_IS_DEFAULT_DEF (reduc_vop))
8794 : 168 : new_stmt = SSA_NAME_DEF_STMT (reduc_vop);
8795 : : else
8796 : : {
8797 : 0 : new_temp = make_ssa_name (vec_dest);
8798 : 0 : new_stmt = gimple_build_assign (new_temp, reduc_vop);
8799 : 0 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt,
8800 : : gsi);
8801 : : }
8802 : : }
8803 : 4370 : else if (masked_loop_p && !mask_by_cond_expr)
8804 : : {
8805 : : /* No conditional ifns have been defined for lane-reducing op
8806 : : yet. */
8807 : 8 : gcc_assert (!lane_reducing);
8808 : :
8809 : : /* Make sure that the reduction accumulator is vop[0]. */
8810 : 8 : if (reduc_index == 1)
8811 : : {
8812 : 8 : gcc_assert (commutative_binary_op_p (code, op.type));
8813 : 8 : std::swap (vop[0], vop[1]);
8814 : : }
8815 : 8 : tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8816 : : vec_num, vectype_in,
8817 : : mask_index++);
8818 : 8 : gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
8819 : : vop[0], vop[1], vop[0]);
8820 : 8 : new_temp = make_ssa_name (vec_dest, call);
8821 : 8 : gimple_call_set_lhs (call, new_temp);
8822 : 8 : gimple_call_set_nothrow (call, true);
8823 : 8 : vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
8824 : 8 : new_stmt = call;
8825 : : }
8826 : : else
8827 : : {
8828 : 4362 : if (op.num_ops >= 3)
8829 : 988 : vop[2] = vec_oprnds[2][i];
8830 : :
8831 : 4362 : if (masked_loop_p && mask_by_cond_expr)
8832 : : {
8833 : 4 : tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8834 : : vec_num, vectype_in,
8835 : : mask_index++);
8836 : 4 : build_vect_cond_expr (code, vop, mask, gsi);
8837 : : }
8838 : :
8839 : 4362 : if (emulated_mixed_dot_prod)
8840 : 2 : new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
8841 : : vec_dest, vop);
8842 : :
8843 : 5074 : else if (code.is_internal_fn () && !cond_fn_p)
8844 : 0 : new_stmt = gimple_build_call_internal (internal_fn (code),
8845 : : op.num_ops,
8846 : : vop[0], vop[1], vop[2]);
8847 : 5074 : else if (code.is_internal_fn () && cond_fn_p)
8848 : 714 : new_stmt = gimple_build_call_internal (internal_fn (code),
8849 : : op.num_ops,
8850 : : vop[0], vop[1], vop[2],
8851 : : vop[reduc_index]);
8852 : : else
8853 : 3646 : new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
8854 : : vop[0], vop[1], vop[2]);
8855 : 4362 : new_temp = make_ssa_name (vec_dest, new_stmt);
8856 : 4362 : gimple_set_lhs (new_stmt, new_temp);
8857 : 4362 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8858 : : }
8859 : :
8860 : 4538 : if (single_defuse_cycle && i < num - 1)
8861 : 2883 : vec_oprnds[reduc_index].safe_push (gimple_get_lhs (new_stmt));
8862 : : else
8863 : 1655 : slp_node->push_vec_def (new_stmt);
8864 : : }
8865 : :
8866 : : return true;
8867 : 9188 : }
8868 : :
8869 : : /* Transform phase of a cycle PHI. */
8870 : :
8871 : : bool
8872 : 22376 : vect_transform_cycle_phi (loop_vec_info loop_vinfo,
8873 : : stmt_vec_info stmt_info,
8874 : : slp_tree slp_node, slp_instance slp_node_instance)
8875 : : {
8876 : 22376 : tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8877 : 22376 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8878 : 22376 : int i;
8879 : 22376 : bool nested_cycle = false;
8880 : 22376 : int vec_num;
8881 : :
8882 : 22475 : if (nested_in_vect_loop_p (loop, stmt_info))
8883 : : {
8884 : : loop = loop->inner;
8885 : : nested_cycle = true;
8886 : : }
8887 : :
8888 : 22376 : stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
8889 : 22376 : reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
8890 : 22376 : stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8891 : 22376 : gcc_assert (reduc_info->is_reduc_info);
8892 : :
8893 : 22376 : if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
8894 : 22376 : || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
8895 : : /* Leave the scalar phi in place. */
8896 : : return true;
8897 : :
8898 : 21542 : vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8899 : :
8900 : : /* Check whether we should use a single PHI node and accumulate
8901 : : vectors to one before the backedge. */
8902 : 21542 : if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
8903 : 1415 : vec_num = 1;
8904 : :
8905 : : /* Create the destination vector */
8906 : 21542 : gphi *phi = as_a <gphi *> (stmt_info->stmt);
8907 : 21542 : tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
8908 : : vectype_out);
8909 : :
8910 : : /* Get the loop-entry arguments. */
8911 : 21542 : tree vec_initial_def = NULL_TREE;
8912 : 21542 : auto_vec<tree> vec_initial_defs;
8913 : 21542 : vec_initial_defs.reserve (vec_num);
8914 : :   /* Optimize: if, for REDUC_MAX, initial_def is smaller than the base value
8915 : :      and we can't use zero for induc_val, use initial_def instead.  Similarly
8916 : :      for REDUC_MIN when initial_def is larger than the base. */
8917 : 21542 : if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
8918 : : {
8919 : 66 : gcc_assert (SLP_TREE_LANES (slp_node) == 1);
8920 : 66 : tree initial_def = vect_phi_initial_value (phi);
8921 : 66 : reduc_info->reduc_initial_values.safe_push (initial_def);
8922 : 66 : tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
8923 : 66 : if (TREE_CODE (initial_def) == INTEGER_CST
8924 : 64 : && !integer_zerop (induc_val)
8925 : 130 : && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
8926 : 44 : && tree_int_cst_lt (initial_def, induc_val))
8927 : 61 : || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
8928 : 20 : && tree_int_cst_lt (induc_val, initial_def))))
8929 : : {
8930 : 3 : induc_val = initial_def;
8931 : :           /* Communicate to epilogue generation that we used the
8932 : :              initial_def. */
8933 : 3 : STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
8934 : : }
8935 : 66 : vec_initial_defs.quick_push
8936 : 66 : (build_vector_from_val (vectype_out, induc_val));
8937 : : }
8938 : 21476 : else if (nested_cycle)
8939 : : {
8940 : 418 : unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
8941 : 418 : vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
8942 : : &vec_initial_defs);
8943 : : }
8944 : : else
8945 : : {
8946 : 21058 : gcc_assert (slp_node == slp_node_instance->reduc_phis);
8947 : 21058 : vec<tree> &initial_values = reduc_info->reduc_initial_values;
8948 : 21058 : vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
8949 : :
8950 : 21058 : unsigned int num_phis = stmts.length ();
8951 : 21058 : if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
8952 : 139 : num_phis = 1;
8953 : 21058 : initial_values.reserve (num_phis);
8954 : 42419 : for (unsigned int i = 0; i < num_phis; ++i)
8955 : : {
8956 : 21361 : gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
8957 : 21361 : initial_values.quick_push (vect_phi_initial_value (this_phi));
8958 : : }
8959 : 21058 : if (vec_num == 1)
8960 : 20885 : vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8961 : 21058 : if (!initial_values.is_empty ())
8962 : : {
8963 : 20877 : tree initial_value
8964 : 41579 : = (num_phis == 1 ? initial_values[0] : NULL_TREE);
8965 : 20877 : code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8966 : 20877 : tree neutral_op
8967 : 20877 : = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8968 : : code, initial_value);
8969 : : /* Try to simplify the vector initialization by applying an
8970 : : adjustment after the reduction has been performed. This
8971 : : can also break a critical path but on the other hand
8972 : :          requires keeping the initial value live across the loop. */
8973 : 20877 : if (neutral_op
8974 : 20790 : && initial_values.length () == 1
8975 : 20629 : && !reduc_info->reused_accumulator
8976 : 16758 : && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8977 : 37569 : && !operand_equal_p (neutral_op, initial_values[0]))
8978 : : {
8979 : 12033 : STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
8980 : 12033 : = initial_values[0];
8981 : 12033 : initial_values[0] = neutral_op;
8982 : : }
8983 : 41754 : get_initial_defs_for_reduction (loop_vinfo, reduc_info,
8984 : : &vec_initial_defs, vec_num,
8985 : : stmts.length (), neutral_op);
8986 : : }
8987 : : }
8988 : :
8989 : 21542 : if (vec_initial_def)
8990 : : {
8991 : 0 : vec_initial_defs.create (1);
8992 : 0 : vec_initial_defs.quick_push (vec_initial_def);
8993 : : }
8994 : :
8995 : 21542 : if (auto *accumulator = reduc_info->reused_accumulator)
8996 : : {
8997 : 4066 : tree def = accumulator->reduc_input;
8998 : 4066 : if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8999 : : {
9000 : 4064 : unsigned int nreduc;
9001 : 8128 : bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
9002 : 4064 : (TREE_TYPE (def)),
9003 : 4064 : TYPE_VECTOR_SUBPARTS (vectype_out),
9004 : : &nreduc);
9005 : 0 : gcc_assert (res);
9006 : 4064 : gimple_seq stmts = NULL;
9007 : : /* Reduce the single vector to a smaller one. */
9008 : 4064 : if (nreduc != 1)
9009 : : {
9010 : : /* Perform the reduction in the appropriate type. */
9011 : 4064 : tree rvectype = vectype_out;
9012 : 4064 : if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
9013 : 4064 : TREE_TYPE (TREE_TYPE (def))))
9014 : 221 : rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
9015 : : TYPE_VECTOR_SUBPARTS
9016 : 442 : (vectype_out));
9017 : 4064 : def = vect_create_partial_epilog (def, rvectype,
9018 : : STMT_VINFO_REDUC_CODE
9019 : : (reduc_info),
9020 : : &stmts);
9021 : : }
9022 : : /* The epilogue loop might use a different vector mode, like
9023 : : VNx2DI vs. V2DI. */
9024 : 4064 : if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
9025 : : {
9026 : 0 : tree reduc_type = build_vector_type_for_mode
9027 : 0 : (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
9028 : 0 : def = gimple_convert (&stmts, reduc_type, def);
9029 : : }
9030 : : /* Adjust the input so we pick up the partially reduced value
9031 : : for the skip edge in vect_create_epilog_for_reduction. */
9032 : 4064 : accumulator->reduc_input = def;
9033 : : /* And the reduction could be carried out using a different sign. */
9034 : 4064 : if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
9035 : 221 : def = gimple_convert (&stmts, vectype_out, def);
9036 : 4064 : edge e;
9037 : 4064 : if ((e = loop_vinfo->main_loop_edge)
9038 : 4064 : || (e = loop_vinfo->skip_this_loop_edge))
9039 : : {
9040 : :               /* While we'd like to insert on the edge, this will split
9041 : :                  blocks and disturb bookkeeping; we also will eventually
9042 : :                  need this on the skip edge.  Rely on sinking to fix up
9043 : :                  the optimal placement and insert in the pred. */
9044 : 3883 : gimple_stmt_iterator gsi = gsi_last_bb (e->src);
9045 : : /* Insert before a cond that eventually skips the
9046 : : epilogue. */
9047 : 3883 : if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
9048 : 3870 : gsi_prev (&gsi);
9049 : 3883 : gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
9050 : : }
9051 : : else
9052 : 181 : gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
9053 : : stmts);
9054 : : }
9055 : 4066 : if (loop_vinfo->main_loop_edge)
9056 : 3885 : vec_initial_defs[0]
9057 : 3885 : = vect_get_main_loop_result (loop_vinfo, def,
9058 : 3885 : vec_initial_defs[0]);
9059 : : else
9060 : 181 : vec_initial_defs.safe_push (def);
9061 : : }
9062 : :
9063 : : /* Generate the reduction PHIs upfront. */
9064 : 43748 : for (i = 0; i < vec_num; i++)
9065 : : {
9066 : 22206 : tree vec_init_def = vec_initial_defs[i];
9067 : : /* Create the reduction-phi that defines the reduction
9068 : : operand. */
9069 : 22206 : gphi *new_phi = create_phi_node (vec_dest, loop->header);
9070 : 22206 : add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
9071 : : UNKNOWN_LOCATION);
9072 : :
9073 : : /* The loop-latch arg is set in epilogue processing. */
9074 : :
9075 : 22206 : slp_node->push_vec_def (new_phi);
9076 : : }
9077 : :
9078 : 21542 : return true;
9079 : 21542 : }
9080 : :
9081 : : /* Vectorizes LC PHIs. */
9082 : :
9083 : : bool
9084 : 168501 : vectorizable_lc_phi (loop_vec_info loop_vinfo,
9085 : : stmt_vec_info stmt_info,
9086 : : slp_tree slp_node)
9087 : : {
9088 : 168501 : if (!loop_vinfo
9089 : 168501 : || !is_a <gphi *> (stmt_info->stmt)
9090 : 202489 : || gimple_phi_num_args (stmt_info->stmt) != 1)
9091 : : return false;
9092 : :
9093 : 837 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
9094 : 109 : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
9095 : : return false;
9096 : :
9097 : : /* Deal with copies from externs or constants that disguise as
9098 : :   /* Deal with copies from externs or constants that are disguised as
9099 : 837 : if (!vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
9100 : : SLP_TREE_VECTYPE (slp_node)))
9101 : : {
9102 : 0 : if (dump_enabled_p ())
9103 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9104 : : "incompatible vector types for invariants\n");
9105 : 0 : return false;
9106 : : }
9107 : 837 : STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
9108 : 837 : return true;
9109 : : }
9110 : :
9111 : : bool
9112 : 348 : vect_transform_lc_phi (loop_vec_info loop_vinfo,
9113 : : stmt_vec_info stmt_info,
9114 : : slp_tree slp_node)
9115 : : {
9116 : :
9117 : 348 : tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9118 : 348 : tree scalar_dest = gimple_phi_result (stmt_info->stmt);
9119 : 348 : basic_block bb = gimple_bb (stmt_info->stmt);
9120 : 348 : edge e = single_pred_edge (bb);
9121 : 348 : tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
9122 : 348 : auto_vec<tree> vec_oprnds;
9123 : 696 : vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, 1,
9124 : 348 : gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
9125 : 760 : for (unsigned i = 0; i < vec_oprnds.length (); i++)
9126 : : {
9127 : : /* Create the vectorized LC PHI node. */
9128 : 412 : gphi *new_phi = create_phi_node (vec_dest, bb);
9129 : 412 : add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
9130 : 412 : slp_node->push_vec_def (new_phi);
9131 : : }
9132 : :
9133 : 348 : return true;
9134 : 348 : }
9135 : :
9136 : : /* Vectorizes PHIs. */
9137 : :
9138 : : bool
9139 : 165481 : vectorizable_phi (vec_info *,
9140 : : stmt_vec_info stmt_info, gimple **vec_stmt,
9141 : : slp_tree slp_node, stmt_vector_for_cost *cost_vec)
9142 : : {
9143 : 165481 : if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
9144 : : return false;
9145 : :
9146 : 81206 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
9147 : : return false;
9148 : :
9149 : 81206 : tree vectype = SLP_TREE_VECTYPE (slp_node);
9150 : :
9151 : 81206 : if (!vec_stmt) /* transformation not required. */
9152 : : {
9153 : : slp_tree child;
9154 : : unsigned i;
9155 : 212526 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
9156 : 144456 : if (!child)
9157 : : {
9158 : 0 : if (dump_enabled_p ())
9159 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9160 : : "PHI node with unvectorized backedge def\n");
9161 : 0 : return false;
9162 : : }
9163 : 144456 : else if (!vect_maybe_update_slp_op_vectype (child, vectype))
9164 : : {
9165 : 25 : if (dump_enabled_p ())
9166 : 2 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9167 : : "incompatible vector types for invariants\n");
9168 : 25 : return false;
9169 : : }
9170 : 144431 : else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
9171 : 144431 : && !useless_type_conversion_p (vectype,
9172 : : SLP_TREE_VECTYPE (child)))
9173 : : {
9174 : :           /* With bools we can have mask and non-mask precision vectors
9175 : :              or different non-mask precisions.  While pattern recognition is
9176 : :              supposed to guarantee consistency here, bugs in it can cause
9177 : :              mismatches (PR103489 and PR103800 for example).
9178 : :              Deal with them here instead of ICEing later. */
9179 : 16 : if (dump_enabled_p ())
9180 : 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9181 : : "incompatible vector type setup from "
9182 : : "bool pattern detection\n");
9183 : 16 : return false;
9184 : : }
9185 : :
9186 : : /* For single-argument PHIs assume coalescing which means zero cost
9187 : : for the scalar and the vector PHIs. This avoids artificially
9188 : : favoring the vector path (but may pessimize it in some cases). */
9189 : 68070 : if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
9190 : 57020 : record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
9191 : : vector_stmt, stmt_info, vectype, 0, vect_body);
9192 : 68070 : STMT_VINFO_TYPE (stmt_info) = phi_info_type;
9193 : 68070 : return true;
9194 : : }
9195 : :
9196 : 13095 : tree scalar_dest = gimple_phi_result (stmt_info->stmt);
9197 : 13095 : basic_block bb = gimple_bb (stmt_info->stmt);
9198 : 13095 : tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
9199 : 13095 : auto_vec<gphi *> new_phis;
9200 : 44503 : for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
9201 : : {
9202 : 31408 : slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
9203 : :
9204 : : /* Skip not yet vectorized defs. */
9205 : 31790 : if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
9206 : 31408 : && SLP_TREE_VEC_DEFS (child).is_empty ())
9207 : 382 : continue;
9208 : :
9209 : 31026 : auto_vec<tree> vec_oprnds;
9210 : 31026 : vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
9211 : 31026 : if (!new_phis.exists ())
9212 : : {
9213 : 13095 : new_phis.create (vec_oprnds.length ());
9214 : 27776 : for (unsigned j = 0; j < vec_oprnds.length (); j++)
9215 : : {
9216 : : /* Create the vectorized LC PHI node. */
9217 : 14681 : new_phis.quick_push (create_phi_node (vec_dest, bb));
9218 : 14681 : slp_node->push_vec_def (new_phis[j]);
9219 : : }
9220 : : }
9221 : 31026 : edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
9222 : 67195 : for (unsigned j = 0; j < vec_oprnds.length (); j++)
9223 : 36169 : add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
9224 : 31026 : }
9225 : : /* We should have at least one already vectorized child. */
9226 : 13095 : gcc_assert (new_phis.exists ());
9227 : :
9228 : 13095 : return true;
9229 : 13095 : }
9230 : :
9231 : : /* Vectorizes first order recurrences. An overview of the transformation
9232 : : is described below. Suppose we have the following loop.
9233 : :
9234 : : int t = 0;
9235 : : for (int i = 0; i < n; ++i)
9236 : : {
9237 : : b[i] = a[i] - t;
9238 : : t = a[i];
9239 : : }
9240 : :
9241 : : There is a first-order recurrence on 'a'. For this loop, the scalar IR
9242 : : looks (simplified) like:
9243 : :
9244 : : scalar.preheader:
9245 : : init = 0;
9246 : :
9247 : : scalar.body:
9248 : : i = PHI <0(scalar.preheader), i+1(scalar.body)>
9249 : :      _2 = PHI <init(scalar.preheader), _1(scalar.body)>
9250 : : _1 = a[i]
9251 : : b[i] = _1 - _2
9252 : : if (i < n) goto scalar.body
9253 : :
9254 : :    In this example, _2 is a recurrence because its value depends on the
9255 : : previous iteration. We vectorize this as (VF = 4)
9256 : :
9257 : : vector.preheader:
9258 : : vect_init = vect_cst(..., ..., ..., 0)
9259 : :
9260 : : vector.body
9261 : : i = PHI <0(vector.preheader), i+4(vector.body)>
9262 : : vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
9263 : : vect_2 = a[i, i+1, i+2, i+3];
9264 : : vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
9265 : : b[i, i+1, i+2, i+3] = vect_2 - vect_3
9266 : : if (..) goto vector.body
9267 : :
9268 : : In this function, vectorizable_recurr, we code generate both the
9269 : : vector PHI node and the permute since those together compute the
9270 : : vectorized value of the scalar PHI. We do not yet have the
9271 : : backedge value to fill in there nor into the vec_perm. Those
9272 : : are filled in vect_schedule_scc.
9273 : :
9274 : : TODO: Since the scalar loop does not have a use of the recurrence
9275 : :    outside of the loop, the natural way to implement peeling via
9276 : : vectorizing the live value doesn't work. For now peeling of loops
9277 : : with a recurrence is not implemented. For SLP the supported cases
9278 : : are restricted to those requiring a single vector recurrence PHI. */
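     : : /* A concrete trace of the example above (editor's note), with VF = 4,
     : :    n = 4 and a = { 10, 20, 30, 40 }:
     : :
     : :      scalar: t takes the values 0, 10, 20, 30, so b = { 10, 10, 10, 10 };
     : :      vector: vect_init = { _, _, _, 0 }, vect_2 = { 10, 20, 30, 40 },
     : :              vect_3 = vec_perm (vect_init, vect_2, { 3, 4, 5, 6 })
     : :                     = { 0, 10, 20, 30 },
     : :              b[0..3] = vect_2 - vect_3 = { 10, 10, 10, 10 }.  */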
9279 : :
9280 : : bool
9281 : 167696 : vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9282 : : gimple **vec_stmt, slp_tree slp_node,
9283 : : stmt_vector_for_cost *cost_vec)
9284 : : {
9285 : 167696 : if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
9286 : : return false;
9287 : :
9288 : 33183 : gphi *phi = as_a<gphi *> (stmt_info->stmt);
9289 : :
9290 : : /* So far we only support first-order recurrence auto-vectorization. */
9291 : 33183 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
9292 : : return false;
9293 : :
9294 : 262 : tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9295 : 262 : unsigned ncopies;
9296 : 262 : if (slp_node)
9297 : 262 : ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9298 : : else
9299 : 0 : ncopies = vect_get_num_copies (loop_vinfo, vectype);
9300 : 262 : poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9301 : 262 : unsigned dist = slp_node ? SLP_TREE_LANES (slp_node) : 1;
9302 : : /* We need to be able to make progress with a single vector. */
9303 : 262 : if (maybe_gt (dist * 2, nunits))
9304 : : {
9305 : 0 : if (dump_enabled_p ())
9306 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9307 : : "first order recurrence exceeds half of "
9308 : : "a vector\n");
9309 : 0 : return false;
9310 : : }
9311 : :
9312 : : /* First-order recurrence autovectorization needs to handle permutation
9313 : : with indices = [nunits-1, nunits, nunits+1, ...]. */
9314 : 262 : vec_perm_builder sel (nunits, 1, 3);
9315 : 1048 : for (int i = 0; i < 3; ++i)
9316 : 786 : sel.quick_push (nunits - dist + i);
9317 : 262 : vec_perm_indices indices (sel, 2, nunits);
9318 : :
9319 : 262 : if (!vec_stmt) /* transformation not required. */
9320 : : {
9321 : 230 : if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
9322 : : indices))
9323 : : return false;
9324 : :
9325 : 206 : if (slp_node)
9326 : : {
9327 : : /* We eventually need to set a vector type on invariant
9328 : : arguments. */
9329 : : unsigned j;
9330 : : slp_tree child;
9331 : 618 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9332 : 412 : if (!vect_maybe_update_slp_op_vectype
9333 : 412 : (child, SLP_TREE_VECTYPE (slp_node)))
9334 : : {
9335 : 0 : if (dump_enabled_p ())
9336 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9337 : : "incompatible vector types for "
9338 : : "invariants\n");
9339 : 0 : return false;
9340 : : }
9341 : : }
9342 : :
9343 : : /* Verify we have set up compatible types. */
9344 : 206 : edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
9345 : 206 : tree latch_vectype = NULL_TREE;
9346 : 206 : if (slp_node)
9347 : : {
9348 : 206 : slp_tree latch_def = SLP_TREE_CHILDREN (slp_node)[le->dest_idx];
9349 : 206 : latch_vectype = SLP_TREE_VECTYPE (latch_def);
9350 : : }
9351 : : else
9352 : : {
9353 : 0 : tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, le);
9354 : 0 : if (TREE_CODE (latch_def) == SSA_NAME)
9355 : : {
9356 : 0 : stmt_vec_info latch_def_info = loop_vinfo->lookup_def (latch_def);
9357 : 0 : latch_def_info = vect_stmt_to_vectorize (latch_def_info);
9358 : 0 : latch_vectype = STMT_VINFO_VECTYPE (latch_def_info);
9359 : : }
9360 : : }
9361 : 206 : if (!types_compatible_p (latch_vectype, vectype))
9362 : : return false;
9363 : :
9364 : : /* The recurrence costs the initialization vector and one permute
9365 : : for each copy. With SLP the prologue value is explicitly
9366 : : represented and costed separately. */
9367 : 198 : unsigned prologue_cost = 0;
9368 : 198 : if (!slp_node)
9369 : 0 : prologue_cost = record_stmt_cost (cost_vec, 1, scalar_to_vec,
9370 : : stmt_info, 0, vect_prologue);
9371 : 198 : unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9372 : : stmt_info, 0, vect_body);
9373 : 198 : if (dump_enabled_p ())
9374 : 42 : dump_printf_loc (MSG_NOTE, vect_location,
9375 : : "vectorizable_recurr: inside_cost = %d, "
9376 : : "prologue_cost = %d .\n", inside_cost,
9377 : : prologue_cost);
9378 : :
9379 : 198 : STMT_VINFO_TYPE (stmt_info) = recurr_info_type;
9380 : 198 : return true;
9381 : : }
9382 : :
9383 : 32 : edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
9384 : 32 : basic_block bb = gimple_bb (phi);
9385 : 32 : tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
9386 : 32 : if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader)))
9387 : : {
9388 : 2 : gimple_seq stmts = NULL;
9389 : 2 : preheader = gimple_convert (&stmts, TREE_TYPE (vectype), preheader);
9390 : 2 : gsi_insert_seq_on_edge_immediate (pe, stmts);
9391 : : }
9392 : 32 : tree vec_init = build_vector_from_val (vectype, preheader);
9393 : 32 : vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
9394 : :
9395 : : /* Create the vectorized first-order PHI node. */
9396 : 32 : tree vec_dest = vect_get_new_vect_var (vectype,
9397 : : vect_simple_var, "vec_recur_");
9398 : 32 : gphi *new_phi = create_phi_node (vec_dest, bb);
9399 : 32 : add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
9400 : :
9401 : :   /* Insert the shuffles for the first-order recurrence autovectorization.
9402 : : result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>. */
9403 : 32 : tree perm = vect_gen_perm_mask_checked (vectype, indices);
9404 : :
9405 : : /* Insert the required permute after the latch definition. The
9406 : : second and later operands are tentative and will be updated when we have
9407 : : vectorized the latch definition. */
9408 : 32 : edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
9409 : 32 : gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
9410 : 32 : gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
9411 : 32 : gsi_next (&gsi2);
9412 : :
9413 : 84 : for (unsigned i = 0; i < ncopies; ++i)
9414 : : {
9415 : 52 : vec_dest = make_ssa_name (vectype);
9416 : 52 : gassign *vperm
9417 : 84 : = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
9418 : 32 : i == 0 ? gimple_phi_result (new_phi) : NULL,
9419 : : NULL, perm);
9420 : 52 : vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
9421 : :
9422 : 52 : if (slp_node)
9423 : 52 : slp_node->push_vec_def (vperm);
9424 : : else
9425 : 0 : STMT_VINFO_VEC_STMTS (stmt_info).safe_push (vperm);
9426 : : }
9427 : :
9428 : 32 : if (!slp_node)
9429 : 0 : *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
9430 : : return true;
9431 : 262 : }
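: :
: : /* As a rough illustration (not taken from the sources), the first-order
: : recurrence handled above corresponds to a scalar loop such as
: :
: : int t = init;
: : for (i = 0; i < N; i++) { b[i] = a[i] + t; t = a[i]; }
: :
: : Assuming a hypothetical 4-lane vector, the PHI created above carries the
: : previous vector of 'a' values (a splat of 'init' on loop entry), and the
: : vector of recurrence values for lanes i..i+3 is obtained as
: : VEC_PERM <vec_recur, vect_a, {3, 4, 5, 6}>, i.e. the last lane of the
: : previous vector followed by the first nunits-1 lanes of the current one. */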
9432 : :
9433 : : /* Return true if VECTYPE represents a vector that requires lowering
9434 : : by the vector lowering pass. */
9435 : :
9436 : : bool
9437 : 631634 : vect_emulated_vector_p (tree vectype)
9438 : : {
9439 : 1263268 : return (!VECTOR_MODE_P (TYPE_MODE (vectype))
9440 : 634246 : && (!VECTOR_BOOLEAN_TYPE_P (vectype)
9441 : 2594 : || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
9442 : : }
9443 : :
9444 : : /* Return true if we can emulate CODE on an integer mode representation
9445 : : of a vector. */
9446 : :
9447 : : bool
9448 : 21707 : vect_can_vectorize_without_simd_p (tree_code code)
9449 : : {
9450 : 21707 : switch (code)
9451 : : {
9452 : : case PLUS_EXPR:
9453 : : case MINUS_EXPR:
9454 : : case NEGATE_EXPR:
9455 : : case BIT_AND_EXPR:
9456 : : case BIT_IOR_EXPR:
9457 : : case BIT_XOR_EXPR:
9458 : : case BIT_NOT_EXPR:
9459 : : return true;
9460 : :
9461 : 12943 : default:
9462 : 12943 : return false;
9463 : : }
9464 : : }
9465 : :
9466 : : /* Likewise, but taking a code_helper. */
9467 : :
9468 : : bool
9469 : 1565 : vect_can_vectorize_without_simd_p (code_helper code)
9470 : : {
9471 : 1565 : return (code.is_tree_code ()
9472 : 1565 : && vect_can_vectorize_without_simd_p (tree_code (code)));
9473 : : }
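: :
: : /* For illustration only (not part of the vectorizer itself): on a target
: : without real vector modes the codes accepted above can still be applied
: : to a vector held in an integer mode, because the bitwise operations act
: : lane-wise for free, e.g. for a hypothetical 4 x uint8 vector kept in a
: : uint32_t word:
: :
: : uint32_t vand (uint32_t a, uint32_t b) { return a & b; }
: :
: : PLUS/MINUS/NEGATE are accepted as well, and any remaining emulated
: : operations are expanded by the generic vector lowering pass, as noted
: : in vect_emulated_vector_p above. */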
9474 : :
9475 : : /* Create the vector of initial values for a vectorized nonlinear iv. */
9476 : : static tree
9477 : 835 : vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9478 : : tree step_expr, poly_uint64 nunits,
9479 : : tree vectype,
9480 : : enum vect_induction_op_type induction_type)
9481 : : {
9482 : 835 : unsigned HOST_WIDE_INT const_nunits;
9483 : 835 : tree vec_shift, vec_init, new_name;
9484 : 835 : unsigned i;
9485 : 835 : tree itype = TREE_TYPE (vectype);
9486 : :
9487 : : /* iv_loop is the loop to be vectorized. Create the vector of initial values,
9488 : : e.g. vec_init = [X, X*S, X*S^2, X*S^3] for a mult induction (S = step_expr, X = init_expr). */
9489 : 835 : new_name = gimple_convert (stmts, itype, init_expr);
9490 : 835 : switch (induction_type)
9491 : : {
9492 : 18 : case vect_step_op_shr:
9493 : 18 : case vect_step_op_shl:
9494 : : /* Build the initial value by shifting INIT by the series [0, S, 2*S, ...]. */
9495 : 18 : vec_init = gimple_build_vector_from_val (stmts,
9496 : : vectype,
9497 : : new_name);
9498 : 18 : vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
9499 : : build_zero_cst (itype), step_expr);
9500 : 18 : vec_init = gimple_build (stmts,
9501 : : (induction_type == vect_step_op_shr
9502 : : ? RSHIFT_EXPR : LSHIFT_EXPR),
9503 : : vectype, vec_init, vec_shift);
9504 : 18 : break;
9505 : :
9506 : 741 : case vect_step_op_neg:
9507 : 741 : {
9508 : 741 : vec_init = gimple_build_vector_from_val (stmts,
9509 : : vectype,
9510 : : new_name);
9511 : 741 : tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
9512 : : vectype, vec_init);
9513 : : /* The encoding has 2 interleaved stepped patterns. */
9514 : 741 : vec_perm_builder sel (nunits, 2, 3);
9515 : 741 : sel.quick_grow (6);
9516 : 3705 : for (i = 0; i < 3; i++)
9517 : : {
9518 : 2223 : sel[2 * i] = i;
9519 : 2223 : sel[2 * i + 1] = i + nunits;
9520 : : }
9521 : 741 : vec_perm_indices indices (sel, 2, nunits);
9522 : : /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
9523 : : fail when vec_init is a constant vector. In that situation the
9524 : : vec_perm is not really needed. */
9525 : 741 : tree perm_mask_even
9526 : 741 : = vect_gen_perm_mask_any (vectype, indices);
9527 : 741 : vec_init = gimple_build (stmts, VEC_PERM_EXPR,
9528 : : vectype,
9529 : : vec_init, vec_neg,
9530 : : perm_mask_even);
9531 : 741 : }
9532 : 741 : break;
9533 : :
9534 : 76 : case vect_step_op_mul:
9535 : 76 : {
9536 : : /* Use an unsigned multiplication to avoid undefined behavior from signed integer overflow. */
9537 : 76 : gcc_assert (nunits.is_constant (&const_nunits));
9538 : 76 : tree utype = unsigned_type_for (itype);
9539 : 76 : tree uvectype = build_vector_type (utype,
9540 : 76 : TYPE_VECTOR_SUBPARTS (vectype));
9541 : 76 : new_name = gimple_convert (stmts, utype, new_name);
9542 : 76 : vec_init = gimple_build_vector_from_val (stmts,
9543 : : uvectype,
9544 : : new_name);
9545 : 76 : tree_vector_builder elts (uvectype, const_nunits, 1);
9546 : 76 : tree elt_step = build_one_cst (utype);
9547 : :
9548 : 76 : elts.quick_push (elt_step);
9549 : 660 : for (i = 1; i < const_nunits; i++)
9550 : : {
9551 : : /* Create: elt_step_i = elt_step_{i-1} * step_expr. */
9552 : 508 : elt_step = gimple_build (stmts, MULT_EXPR,
9553 : : utype, elt_step, step_expr);
9554 : 508 : elts.quick_push (elt_step);
9555 : : }
9556 : : /* Create the vector [1, S, S^2, ..., S^(nunits-1)] and multiply
9557 : : the splatted initial value by it. */
9558 : 76 : tree vec_mul = gimple_build_vector (stmts, &elts);
9559 : 76 : vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
9560 : : vec_init, vec_mul);
9561 : 76 : vec_init = gimple_convert (stmts, vectype, vec_init);
9562 : 76 : }
9563 : 76 : break;
9564 : :
9565 : 0 : default:
9566 : 0 : gcc_unreachable ();
9567 : : }
9568 : :
9569 : 835 : return vec_init;
9570 : : }
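: :
: : /* Worked example (illustrative, assuming nunits == 4): for initial value X
: : and step S the routine above produces
: : neg: [X, -X, X, -X] via a splat, a negation and an interleaving permute
: : mul: [X, X*S, X*S^2, X*S^3] computed in the unsigned type
: : shr: [X, X>>S, X>>2S, X>>3S] using the VEC_SERIES shift amounts [0, S, 2S, 3S]
: : shl: [X, X<<S, X<<2S, X<<3S] likewise. */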
9571 : :
9572 : : /* Peel INIT_EXPR by SKIP_NITERS iterations according to INDUCTION_TYPE. */
9573 : : tree
9574 : 84 : vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9575 : : tree skip_niters, tree step_expr,
9576 : : enum vect_induction_op_type induction_type)
9577 : : {
9578 : 84 : gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
9579 : 84 : tree type = TREE_TYPE (init_expr);
9580 : 84 : unsigned prec = TYPE_PRECISION (type);
9581 : 84 : switch (induction_type)
9582 : : {
9583 : 0 : case vect_step_op_neg:
9584 : 0 : if (TREE_INT_CST_LOW (skip_niters) % 2)
9585 : 0 : init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
9586 : : /* else no change. */
9587 : : break;
9588 : :
9589 : 12 : case vect_step_op_shr:
9590 : 12 : case vect_step_op_shl:
9591 : 12 : skip_niters = gimple_convert (stmts, type, skip_niters);
9592 : 12 : step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
9593 : : /* When the shift amount >= precision, we need to avoid undefined behavior.
9594 : : In the original loop there is no undefined behavior and, per the semantics,
9595 : : init_expr should become 0 for lshr and ashl, and >>= (prec - 1) for ashr. */
9596 : 12 : if (!tree_fits_uhwi_p (step_expr)
9597 : 12 : || tree_to_uhwi (step_expr) >= prec)
9598 : : {
9599 : 6 : if (induction_type == vect_step_op_shl
9600 : 6 : || TYPE_UNSIGNED (type))
9601 : 4 : init_expr = build_zero_cst (type);
9602 : : else
9603 : 2 : init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
9604 : : init_expr,
9605 : 4 : wide_int_to_tree (type, prec - 1));
9606 : : }
9607 : : else
9608 : 8 : init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
9609 : : ? RSHIFT_EXPR : LSHIFT_EXPR),
9610 : : type, init_expr, step_expr);
9611 : : break;
9612 : :
9613 : 72 : case vect_step_op_mul:
9614 : 72 : {
9615 : 72 : tree utype = unsigned_type_for (type);
9616 : 72 : init_expr = gimple_convert (stmts, utype, init_expr);
9617 : 72 : wide_int skipn = wi::to_wide (skip_niters);
9618 : 72 : wide_int begin = wi::to_wide (step_expr);
9619 : 72 : auto_mpz base, exp, mod, res;
9620 : 72 : wi::to_mpz (begin, base, TYPE_SIGN (type));
9621 : 72 : wi::to_mpz (skipn, exp, UNSIGNED);
9622 : 72 : mpz_ui_pow_ui (mod, 2, TYPE_PRECISION (type));
9623 : 72 : mpz_powm (res, base, exp, mod);
9624 : 72 : begin = wi::from_mpz (utype, res, true);
9625 : 72 : tree mult_expr = wide_int_to_tree (utype, begin);
9626 : 72 : init_expr = gimple_build (stmts, MULT_EXPR, utype,
9627 : : init_expr, mult_expr);
9628 : 72 : init_expr = gimple_convert (stmts, type, init_expr);
9629 : 72 : }
9630 : 72 : break;
9631 : :
9632 : 0 : default:
9633 : 0 : gcc_unreachable ();
9634 : : }
9635 : :
9636 : 84 : return init_expr;
9637 : : }
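: :
: : /* Illustrative example (numbers chosen for exposition only): peeling a mul
: : induction with init = 3, step = 5 and skip_niters = 2 on an 8-bit type
: : gives init' = 3 * 5^2 mod 2^8 = 75, the value the scalar iv has after two
: : iterations. For the shift cases the peeled init is init >> (step * skip)
: : or init << (step * skip), with the clamping described above once the total
: : shift amount reaches the type precision. */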
9638 : :
9639 : : /* Create vector step for vectorized iv. */
9640 : : static tree
9641 : 1072 : vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
9642 : : poly_uint64 vf,
9643 : : enum vect_induction_op_type induction_type)
9644 : : {
9645 : 1072 : tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
9646 : 1072 : tree new_name = NULL;
9647 : : /* Step should be pow (step, vf) for mult induction. */
9648 : 1072 : if (induction_type == vect_step_op_mul)
9649 : : {
9650 : 76 : gcc_assert (vf.is_constant ());
9651 : 76 : wide_int begin = wi::to_wide (step_expr);
9652 : :
9653 : 584 : for (unsigned i = 0; i != vf.to_constant () - 1; i++)
9654 : 508 : begin = wi::mul (begin, wi::to_wide (step_expr));
9655 : :
9656 : 76 : new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
9657 : 76 : }
9658 : 996 : else if (induction_type == vect_step_op_neg)
9659 : : /* Do nothing. */
9660 : : ;
9661 : : else
9662 : 18 : new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
9663 : : expr, step_expr);
9664 : 1072 : return new_name;
9665 : : }
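: :
: : /* For example (illustration only), with vf == 4 and scalar step S: a mul
: : induction gets the per-vector-iteration step S^4, a shift induction gets
: : 4*S, and a neg induction needs no step at all, since with a power-of-two
: : number of lanes the vector [X, -X, X, -X, ...] already repeats identically
: : in every vector iteration. */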
9666 : :
: : /* Splat NEW_NAME into a vector of type VECTYPE and materialize it as a
: : loop-invariant step vector; no step vector is needed for a neg induction. */
9667 : : static tree
9668 : 1072 : vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
9669 : : stmt_vec_info stmt_info,
9670 : : tree new_name, tree vectype,
9671 : : enum vect_induction_op_type induction_type)
9672 : : {
9673 : : /* No step is needed for neg induction. */
9674 : 1072 : if (induction_type == vect_step_op_neg)
9675 : : return NULL;
9676 : :
9677 : 94 : tree t = unshare_expr (new_name);
9678 : 94 : gcc_assert (CONSTANT_CLASS_P (new_name)
9679 : : || TREE_CODE (new_name) == SSA_NAME);
9680 : 94 : tree new_vec = build_vector_from_val (vectype, t);
9681 : 94 : tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
9682 : : new_vec, vectype, NULL);
9683 : 94 : return vec_step;
9684 : : }
9685 : :
9686 : : /* Update the vectorized iv INDUC_DEF by VEC_STEP according to INDUCTION_TYPE. */
9687 : : static tree
9688 : 1254 : vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
9689 : : tree induc_def, tree vec_step,
9690 : : enum vect_induction_op_type induction_type)
9691 : : {
9692 : 1254 : tree vec_def = induc_def;
9693 : 1254 : switch (induction_type)
9694 : : {
9695 : 76 : case vect_step_op_mul:
9696 : 76 : {
9697 : : /* Use an unsigned multiplication to avoid undefined behavior from signed integer overflow. */
9698 : 76 : tree uvectype
9699 : 76 : = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)),
9700 : 76 : TYPE_VECTOR_SUBPARTS (vectype));
9701 : 76 : vec_def = gimple_convert (stmts, uvectype, vec_def);
9702 : 76 : vec_step = gimple_convert (stmts, uvectype, vec_step);
9703 : 76 : vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
9704 : : vec_def, vec_step);
9705 : 76 : vec_def = gimple_convert (stmts, vectype, vec_def);
9706 : : }
9707 : 76 : break;
9708 : :
9709 : 12 : case vect_step_op_shr:
9710 : 12 : vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
9711 : : vec_def, vec_step);
9712 : 12 : break;
9713 : :
9714 : 6 : case vect_step_op_shl:
9715 : 6 : vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
9716 : : vec_def, vec_step);
9717 : 6 : break;
9718 : : case vect_step_op_neg:
9719 : : vec_def = induc_def;
9720 : : /* Do nothing. */
9721 : : break;
9722 : 0 : default:
9723 : 0 : gcc_unreachable ();
9724 : : }
9725 : :
9726 : 1254 : return vec_def;
9727 : :
9728 : : }
9729 : :
9730 : : /* Function vectorizable_nonlinear_induction
9731 : :
9732 : : Check if STMT_INFO performs a nonlinear induction computation that can be
9733 : : vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
9734 : : a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
9735 : : basic block.
9736 : : Return true if STMT_INFO is vectorizable in this way. */
9737 : :
9738 : : static bool
9739 : 9983 : vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
9740 : : stmt_vec_info stmt_info,
9741 : : gimple **vec_stmt, slp_tree slp_node,
9742 : : stmt_vector_for_cost *cost_vec)
9743 : : {
9744 : 9983 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9745 : 9983 : unsigned ncopies;
9746 : 9983 : bool nested_in_vect_loop = false;
9747 : 9983 : class loop *iv_loop;
9748 : 9983 : tree vec_def;
9749 : 9983 : edge pe = loop_preheader_edge (loop);
9750 : 9983 : basic_block new_bb;
9751 : 9983 : tree vec_init, vec_step;
9752 : 9983 : tree new_name;
9753 : 9983 : gimple *new_stmt;
9754 : 9983 : gphi *induction_phi;
9755 : 9983 : tree induc_def, vec_dest;
9756 : 9983 : tree init_expr, step_expr;
9757 : 9983 : tree niters_skip;
9758 : 9983 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9759 : 9983 : unsigned i;
9760 : 9983 : gimple_stmt_iterator si;
9761 : :
9762 : 9983 : gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9763 : :
9764 : 9983 : tree vectype = SLP_TREE_VECTYPE (slp_node);
9765 : 9983 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9766 : 9983 : enum vect_induction_op_type induction_type
9767 : : = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9768 : :
9769 : 9983 : gcc_assert (induction_type > vect_step_op_add);
9770 : :
9771 : 9983 : ncopies = vect_get_num_copies (loop_vinfo, slp_node, vectype);
9772 : 9983 : gcc_assert (ncopies >= 1);
9773 : :
9774 : : /* FORNOW. Only handle a nonlinear induction in the loop being vectorized, not in a nested loop. */
9775 : 9983 : if (nested_in_vect_loop_p (loop, stmt_info))
9776 : : {
9777 : 0 : if (dump_enabled_p ())
9778 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9779 : : "nonlinear induction in nested loop.\n");
9780 : 0 : return false;
9781 : : }
9782 : :
9783 : 9983 : iv_loop = loop;
9784 : 9983 : gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9785 : :
9786 : : /* TODO: Support multi-lane SLP for nonlinear ivs. There should be a separate
9787 : : vector iv update for each iv and a permutation to generate the wanted
9788 : : vector iv. */
9789 : 9983 : if (SLP_TREE_LANES (slp_node) > 1)
9790 : : {
9791 : 0 : if (dump_enabled_p ())
9792 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9793 : : "SLP induction not supported for nonlinear"
9794 : : " induction.\n");
9795 : 0 : return false;
9796 : : }
9797 : :
9798 : 9983 : if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
9799 : : {
9800 : 0 : if (dump_enabled_p ())
9801 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9802 : : "floating point nonlinear induction vectorization"
9803 : : " not supported.\n");
9804 : 0 : return false;
9805 : : }
9806 : :
9807 : 9983 : step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9808 : 9983 : init_expr = vect_phi_initial_value (phi);
9809 : 9983 : gcc_assert (step_expr != NULL_TREE && init_expr != NULL
9810 : : && TREE_CODE (step_expr) == INTEGER_CST);
9811 : : /* step_expr should have the same type as init_expr; e.g. for uint64 a >> 1
9812 : : the step is int, but the vector shift is done on vector<uint64>. */
9813 : 9983 : step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
9814 : :
9815 : 9983 : if (TREE_CODE (init_expr) == INTEGER_CST)
9816 : 2494 : init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
9817 : 7489 : else if (!tree_nop_conversion_p (TREE_TYPE (vectype), TREE_TYPE (init_expr)))
9818 : : {
9819 : : /* INIT_EXPR could be a bit-field; bail out in that case. */
9820 : 4 : if (dump_enabled_p ())
9821 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9822 : : "nonlinear induction vectorization failed:"
9823 : : " component type of vectype is not a nop conversion"
9824 : : " from type of init_expr.\n");
9825 : 4 : return false;
9826 : : }
9827 : :
9828 : 9979 : switch (induction_type)
9829 : : {
9830 : 2205 : case vect_step_op_neg:
9831 : 2205 : if (maybe_eq (TYPE_VECTOR_SUBPARTS (vectype), 1u))
9832 : : return false;
9833 : 2205 : if (TREE_CODE (init_expr) != INTEGER_CST
9834 : 192 : && TREE_CODE (init_expr) != REAL_CST)
9835 : : {
9836 : : /* Check for backend support of NEGATE_EXPR and vec_perm. */
9837 : 192 : if (!directly_supported_p (NEGATE_EXPR, vectype))
9838 : 0 : return false;
9839 : :
9840 : : /* The encoding has 2 interleaved stepped patterns. */
9841 : 192 : vec_perm_builder sel (nunits, 2, 3);
9842 : 192 : machine_mode mode = TYPE_MODE (vectype);
9843 : 192 : sel.quick_grow (6);
9844 : 960 : for (i = 0; i < 3; i++)
9845 : : {
9846 : 576 : sel[i * 2] = i;
9847 : 576 : sel[i * 2 + 1] = i + nunits;
9848 : : }
9849 : 192 : vec_perm_indices indices (sel, 2, nunits);
9850 : 192 : if (!can_vec_perm_const_p (mode, mode, indices))
9851 : 0 : return false;
9852 : 192 : }
9853 : : break;
9854 : :
9855 : 666 : case vect_step_op_mul:
9856 : 666 : {
9857 : : /* Check for backend support of MULT_EXPR. */
9858 : 666 : if (!directly_supported_p (MULT_EXPR, vectype))
9859 : : return false;
9860 : :
9861 : : /* ?? It is unclear how to construct the vector step for variable-length
9862 : : vectors: [ 1, step, pow (step, 2), pow (step, 3), ... ]. */
9863 : : if (!vf.is_constant ())
9864 : : return false;
9865 : : }
9866 : : break;
9867 : :
9868 : 7012 : case vect_step_op_shr:
9869 : : /* Check for backend support of RSHIFT_EXPR. */
9870 : 7012 : if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
9871 : : return false;
9872 : :
9873 : : /* Don't shift by more than the type precision, to avoid undefined behavior. */
9874 : 26 : if (!tree_fits_uhwi_p (step_expr)
9875 : 26 : || maybe_ge (nunits * tree_to_uhwi (step_expr),
9876 : : TYPE_PRECISION (TREE_TYPE (init_expr))))
9877 : : return false;
9878 : : break;
9879 : :
9880 : 96 : case vect_step_op_shl:
9881 : : /* Check for backend support of LSHIFT_EXPR. */
9882 : 96 : if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
9883 : : return false;
9884 : :
9885 : : /* Don't shift by more than the type precision, to avoid undefined behavior. */
9886 : 12 : if (!tree_fits_uhwi_p (step_expr)
9887 : 12 : || maybe_ge (nunits * tree_to_uhwi (step_expr),
9888 : : TYPE_PRECISION (TREE_TYPE (init_expr))))
9889 : : return false;
9890 : :
9891 : : break;
9892 : :
9893 : 0 : default:
9894 : 0 : gcc_unreachable ();
9895 : : }
9896 : :
9897 : 2748 : if (!vec_stmt) /* transformation not required. */
9898 : : {
9899 : 1913 : unsigned inside_cost = 0, prologue_cost = 0;
9900 : : /* Loop body cost for vec_loop: one vector_stmt per copy of the
9901 : : induction update. */
9902 : 1913 : inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9903 : : stmt_info, 0, vect_body);
9904 : :
9905 : : /* A neg induction needs no code in the loop body, so it doesn't
9906 : : have any inside_cost. */
9907 : 1913 : if (induction_type == vect_step_op_neg)
9908 : 1464 : inside_cost = 0;
9909 : :
9910 : : /* prologue cost for vec_init and vec_step. */
9911 : 1913 : prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9912 : : stmt_info, 0, vect_prologue);
9913 : :
9914 : 1913 : if (dump_enabled_p ())
9915 : 62 : dump_printf_loc (MSG_NOTE, vect_location,
9916 : : "vect_model_induction_cost: inside_cost = %d, "
9917 : : "prologue_cost = %d. \n", inside_cost,
9918 : : prologue_cost);
9919 : :
9920 : 1913 : STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9921 : 1913 : DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9922 : 1913 : return true;
9923 : : }
9924 : :
9925 : : /* Transform. */
9926 : :
9927 : : /* Compute a vector variable, initialized with the first VF values of
9928 : : the induction variable. E.g., for a mult iv with IV_PHI='X' and
9929 : : step S, for a vector of 4 units, we want to compute:
9930 : : [X, X*S, X*S^2, X*S^3]. */
9931 : :
9932 : 835 : if (dump_enabled_p ())
9933 : 34 : dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9934 : :
9935 : 835 : pe = loop_preheader_edge (iv_loop);
9936 : : /* Find the first insertion point in the BB. */
9937 : 835 : basic_block bb = gimple_bb (phi);
9938 : 835 : si = gsi_after_labels (bb);
9939 : :
9940 : 835 : gimple_seq stmts = NULL;
9941 : :
9942 : 835 : niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9943 : : /* If we are using the loop mask to "peel" for alignment then we need
9944 : : to adjust the start value here. */
9945 : 835 : if (niters_skip != NULL_TREE)
9946 : 0 : init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
9947 : : step_expr, induction_type);
9948 : :
9949 : 835 : vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
9950 : : step_expr, nunits, vectype,
9951 : : induction_type);
9952 : 835 : if (stmts)
9953 : : {
9954 : 164 : new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9955 : 164 : gcc_assert (!new_bb);
9956 : : }
9957 : :
9958 : 835 : stmts = NULL;
9959 : 835 : new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9960 : : vf, induction_type);
9961 : 835 : if (stmts)
9962 : : {
9963 : 0 : new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9964 : 0 : gcc_assert (!new_bb);
9965 : : }
9966 : :
9967 : 835 : vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9968 : : new_name, vectype,
9969 : : induction_type);
9970 : : /* Create the following def-use cycle:
9971 : : loop prolog:
9972 : : vec_init = ...
9973 : : vec_step = ...
9974 : : loop:
9975 : : vec_iv = PHI <vec_init, vec_loop>
9976 : : ...
9977 : : STMT
9978 : : ...
9979 : : vec_loop = vec_iv + vec_step; */
9980 : :
9981 : : /* Create the induction-phi that defines the induction-operand. */
9982 : 835 : vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9983 : 835 : induction_phi = create_phi_node (vec_dest, iv_loop->header);
9984 : 835 : induc_def = PHI_RESULT (induction_phi);
9985 : :
9986 : : /* Create the iv update inside the loop. */
9987 : 835 : stmts = NULL;
9988 : 835 : vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9989 : : induc_def, vec_step,
9990 : : induction_type);
9991 : :
9992 : 835 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9993 : 835 : new_stmt = SSA_NAME_DEF_STMT (vec_def);
9994 : :
9995 : : /* Set the arguments of the phi node: */
9996 : 835 : add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9997 : 835 : add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9998 : : UNKNOWN_LOCATION);
9999 : :
10000 : 835 : slp_node->push_vec_def (induction_phi);
10001 : :
10002 : : /* If the vectorization factor (VF) is bigger than the number
10003 : : of elements that we can fit in a vectype (nunits), we have to generate
10004 : : more than one vector stmt, i.e. we need to "unroll" the
10005 : : vector stmt by a factor VF/nunits. For more details see the documentation
10006 : : in vectorizable_operation. */
10007 : :
10008 : 835 : if (ncopies > 1)
10009 : : {
10010 : 237 : stmts = NULL;
10011 : : /* FORNOW. This restriction should be relaxed. */
10012 : 237 : gcc_assert (!nested_in_vect_loop);
10013 : :
10014 : 237 : new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
10015 : : nunits, induction_type);
10016 : :
10017 : 237 : vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
10018 : : new_name, vectype,
10019 : : induction_type);
10020 : 237 : vec_def = induc_def;
10021 : 893 : for (i = 1; i < ncopies; i++)
10022 : : {
10023 : : /* vec_i = vec_prev + vec_step. */
10024 : 419 : stmts = NULL;
10025 : 419 : vec_def = vect_update_nonlinear_iv (&stmts, vectype,
10026 : : vec_def, vec_step,
10027 : : induction_type);
10028 : 419 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10029 : 419 : new_stmt = SSA_NAME_DEF_STMT (vec_def);
10030 : 419 : slp_node->push_vec_def (new_stmt);
10031 : : }
10032 : : }
10033 : :
10034 : 835 : if (dump_enabled_p ())
10035 : 68 : dump_printf_loc (MSG_NOTE, vect_location,
10036 : : "transform induction: created def-use cycle: %G%G",
10037 : 34 : (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
10038 : :
10039 : : return true;
10040 : : }
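: :
: : /* Putting the pieces together, a rough sketch (not generated verbatim) of
: : the code produced for the scalar iv 'x *= 5' with nunits == 4:
: :
: : loop prolog:
: : vec_init = [x, x*5, x*25, x*125]
: : vec_step = { 625, 625, 625, 625 }
: : loop:
: : vec_iv = PHI <vec_init, vec_loop>
: : ...
: : vec_loop = vec_iv * vec_step (done in the unsigned type). */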
10041 : :
10042 : : /* Function vectorizable_induction
10043 : :
10044 : : Check if STMT_INFO performs an induction computation that can be vectorized.
10045 : : If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
10046 : : phi to replace it, put it in VEC_STMT, and add it to the same basic block.
10047 : : Return true if STMT_INFO is vectorizable in this way. */
10048 : :
10049 : : bool
10050 : 407638 : vectorizable_induction (loop_vec_info loop_vinfo,
10051 : : stmt_vec_info stmt_info,
10052 : : gimple **vec_stmt, slp_tree slp_node,
10053 : : stmt_vector_for_cost *cost_vec)
10054 : : {
10055 : 407638 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10056 : 407638 : bool nested_in_vect_loop = false;
10057 : 407638 : class loop *iv_loop;
10058 : 407638 : tree vec_def;
10059 : 407638 : edge pe = loop_preheader_edge (loop);
10060 : 407638 : basic_block new_bb;
10061 : 407638 : tree vec_init = NULL_TREE, vec_step, t;
10062 : 407638 : tree new_name;
10063 : 407638 : gphi *induction_phi;
10064 : 407638 : tree induc_def, vec_dest;
10065 : 407638 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
10066 : 407638 : unsigned i;
10067 : 407638 : tree expr;
10068 : 407638 : tree index_vectype = NULL_TREE;
10069 : 407638 : gimple_stmt_iterator si;
10070 : 407638 : enum vect_induction_op_type induction_type
10071 : : = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
10072 : :
10073 : 434391 : gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
10074 : 155451 : if (!phi)
10075 : : return false;
10076 : :
10077 : 155451 : if (!STMT_VINFO_RELEVANT_P (stmt_info))
10078 : : return false;
10079 : :
10080 : : /* Make sure it was recognized as induction computation. */
10081 : 155451 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
10082 : : return false;
10083 : :
10084 : : /* Handle nonlinear induction in a separate place. */
10085 : 151401 : if (induction_type != vect_step_op_add)
10086 : 9983 : return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
10087 : 9983 : vec_stmt, slp_node, cost_vec);
10088 : :
10089 : 141418 : tree vectype = SLP_TREE_VECTYPE (slp_node);
10090 : 141418 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10091 : :
10092 : : /* FORNOW. These restrictions should be relaxed. */
10093 : 141418 : if (nested_in_vect_loop_p (loop, stmt_info))
10094 : : {
10095 : 666 : imm_use_iterator imm_iter;
10096 : 666 : use_operand_p use_p;
10097 : 666 : gimple *exit_phi;
10098 : 666 : edge latch_e;
10099 : 666 : tree loop_arg;
10100 : :
10101 : 666 : exit_phi = NULL;
10102 : 666 : latch_e = loop_latch_edge (loop->inner);
10103 : 666 : loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
10104 : 1374 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
10105 : : {
10106 : 718 : gimple *use_stmt = USE_STMT (use_p);
10107 : 718 : if (is_gimple_debug (use_stmt))
10108 : 36 : continue;
10109 : :
10110 : 682 : if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
10111 : : {
10112 : : exit_phi = use_stmt;
10113 : : break;
10114 : : }
10115 : : }
10116 : 666 : if (exit_phi)
10117 : : {
10118 : 10 : stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
10119 : 10 : if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
10120 : 6 : && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
10121 : : {
10122 : 4 : if (dump_enabled_p ())
10123 : 4 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10124 : : "inner-loop induction only used outside "
10125 : : "of the outer vectorized loop.\n");
10126 : 4 : return false;
10127 : : }
10128 : : }
10129 : :
10130 : 662 : nested_in_vect_loop = true;
10131 : 662 : iv_loop = loop->inner;
10132 : : }
10133 : : else
10134 : : iv_loop = loop;
10135 : 141414 : gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
10136 : :
10137 : 141414 : if (!nunits.is_constant () && SLP_TREE_LANES (slp_node) != 1)
10138 : : {
10139 : : /* The current SLP code creates the step value element-by-element. */
10140 : : if (dump_enabled_p ())
10141 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10142 : : "SLP induction not supported for variable-length"
10143 : : " vectors.\n");
10144 : : return false;
10145 : : }
10146 : :
10147 : 141414 : if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
10148 : : {
10149 : 12 : if (dump_enabled_p ())
10150 : 12 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10151 : : "floating point induction vectorization disabled\n");
10152 : 12 : return false;
10153 : : }
10154 : :
10155 : 141402 : tree step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
10156 : 141402 : gcc_assert (step_expr != NULL_TREE);
10157 : 282758 : if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
10158 : 282666 : && !type_has_mode_precision_p (TREE_TYPE (step_expr)))
10159 : : {
10160 : 12 : if (dump_enabled_p ())
10161 : 12 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10162 : : "bit-precision induction vectorization not "
10163 : : "supported.\n");
10164 : 12 : return false;
10165 : : }
10166 : 141390 : tree stept = TREE_TYPE (step_expr);
10167 : 141390 : tree step_vectype = get_same_sized_vectype (stept, vectype);
10168 : :
10169 : : /* Check for target support of the vectorized arithmetic used here. */
10170 : 141390 : if (!target_supports_op_p (step_vectype, PLUS_EXPR, optab_default)
10171 : 141390 : || !target_supports_op_p (step_vectype, MINUS_EXPR, optab_default))
10172 : 22675 : return false;
10173 : 118715 : if (!nunits.is_constant ())
10174 : : {
10175 : : if (!target_supports_op_p (step_vectype, MULT_EXPR, optab_default))
10176 : : return false;
10177 : : /* FLOAT_EXPR when computing VEC_INIT for float inductions. */
10178 : : if (SCALAR_FLOAT_TYPE_P (stept))
10179 : : {
10180 : : tree index_type = build_nonstandard_integer_type
10181 : : (GET_MODE_BITSIZE (SCALAR_TYPE_MODE (stept)), 1);
10182 : :
10183 : : index_vectype = build_vector_type (index_type, nunits);
10184 : : if (!can_float_p (TYPE_MODE (step_vectype),
10185 : : TYPE_MODE (index_vectype), 1))
10186 : : return false;
10187 : : }
10188 : : }
10189 : :
10190 : 118715 : if (!vec_stmt) /* transformation not required. */
10191 : : {
10192 : 310707 : unsigned inside_cost = 0, prologue_cost = 0;
10193 : : /* We eventually need to set a vector type on invariant
10194 : : arguments. */
10195 : : unsigned j;
10196 : : slp_tree child;
10197 : 310707 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
10198 : 207138 : if (!vect_maybe_update_slp_op_vectype
10199 : 207138 : (child, SLP_TREE_VECTYPE (slp_node)))
10200 : : {
10201 : 0 : if (dump_enabled_p ())
10202 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10203 : : "incompatible vector types for "
10204 : : "invariants\n");
10205 : 0 : return false;
10206 : : }
10207 : : /* loop cost for vec_loop. */
10208 : 207138 : inside_cost = record_stmt_cost (cost_vec,
10209 : 103569 : SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
10210 : : vector_stmt, stmt_info, 0, vect_body);
10211 : : /* prologue cost for vec_init (if not nested) and step. */
10212 : 103569 : prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
10213 : : scalar_to_vec,
10214 : : stmt_info, 0, vect_prologue);
10215 : 103569 : if (dump_enabled_p ())
10216 : 4099 : dump_printf_loc (MSG_NOTE, vect_location,
10217 : : "vect_model_induction_cost: inside_cost = %d, "
10218 : : "prologue_cost = %d .\n", inside_cost,
10219 : : prologue_cost);
10220 : :
10221 : 103569 : STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
10222 : 103569 : DUMP_VECT_SCOPE ("vectorizable_induction");
10223 : 103569 : return true;
10224 : : }
10225 : :
10226 : : /* Transform. */
10227 : :
10228 : : /* Compute a vector variable, initialized with the first VF values of
10229 : : the induction variable. E.g., for an iv with IV_PHI='X' and
10230 : : evolution S, for a vector of 4 units, we want to compute:
10231 : : [X, X + S, X + 2*S, X + 3*S]. */
10232 : :
10233 : 15146 : if (dump_enabled_p ())
10234 : 2766 : dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
10235 : :
10236 : 15146 : pe = loop_preheader_edge (iv_loop);
10237 : : /* Find the first insertion point in the BB. */
10238 : 15146 : basic_block bb = gimple_bb (phi);
10239 : 15146 : si = gsi_after_labels (bb);
10240 : :
10241 : : /* For SLP induction we have to generate several IVs; for example,
10242 : : with group size 3 we need
10243 : : [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
10244 : : [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
10245 : 15146 : gimple_stmt_iterator incr_si;
10246 : 15146 : bool insert_after;
10247 : 15146 : standard_iv_increment_position (iv_loop, &incr_si, &insert_after);
10248 : :
10249 : : /* The initial values are vectorized, but any lanes > group_size
10250 : : need adjustment. */
10251 : 15146 : slp_tree init_node
10252 : 15146 : = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
10253 : :
10254 : : /* Gather steps. Since we do not vectorize inductions as
10255 : : cycles we have to reconstruct the step from SCEV data. */
10256 : 15146 : unsigned group_size = SLP_TREE_LANES (slp_node);
10257 : 15146 : tree *steps = XALLOCAVEC (tree, group_size);
10258 : 15146 : tree *inits = XALLOCAVEC (tree, group_size);
10259 : 15146 : stmt_vec_info phi_info;
10260 : 31476 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
10261 : : {
10262 : 16330 : steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
10263 : 16330 : if (!init_node)
10264 : 16216 : inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
10265 : : pe->dest_idx);
10266 : : }
10267 : :
10268 : : /* Now generate the IVs. */
10269 : 15146 : unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10270 : 30292 : gcc_assert (multiple_p (nunits * nvects, group_size));
10271 : 15146 : unsigned nivs;
10272 : 15146 : unsigned HOST_WIDE_INT const_nunits;
10273 : 15146 : if (nested_in_vect_loop)
10274 : : nivs = nvects;
10275 : 15046 : else if (nunits.is_constant (&const_nunits))
10276 : : {
10277 : : /* Compute the number of distinct IVs we need. First reduce
10278 : : group_size if it is a multiple of const_nunits so we get
10279 : : one IV for a group_size of 4 but const_nunits 2. */
10280 : 15046 : unsigned group_sizep = group_size;
10281 : 15046 : if (group_sizep % const_nunits == 0)
10282 : 109 : group_sizep = group_sizep / const_nunits;
10283 : 15046 : nivs = least_common_multiple (group_sizep, const_nunits) / const_nunits;
10284 : : }
10285 : : else
10286 : : {
10287 : : gcc_assert (SLP_TREE_LANES (slp_node) == 1);
10288 : : nivs = 1;
10289 : : }
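: :
: : /* Worked example of the computation above (illustrative): with
: : group_size == 3 and const_nunits == 4, nivs = lcm (3, 4) / 4 = 3 distinct
: : IV vectors are needed, matching the three vectors shown in the comment
: : further up; with group_size == 4 and const_nunits == 2 the group size is
: : first reduced to 2, giving nivs = 1. */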
10290 : 15146 : gimple_seq init_stmts = NULL;
10291 : 15146 : tree lupdate_mul = NULL_TREE;
10292 : 100 : if (!nested_in_vect_loop)
10293 : : {
10294 : 15046 : if (nunits.is_constant (&const_nunits))
10295 : : {
10296 : : /* The number of iterations covered in one vector iteration. */
10297 : 15046 : unsigned lup_mul = (nvects * const_nunits) / group_size;
10298 : 15046 : lupdate_mul
10299 : 15046 : = build_vector_from_val (step_vectype,
10300 : 15046 : SCALAR_FLOAT_TYPE_P (stept)
10301 : 26 : ? build_real_from_wide (stept, lup_mul,
10302 : : UNSIGNED)
10303 : 30066 : : build_int_cstu (stept, lup_mul));
10304 : : }
10305 : : else
10306 : : {
10307 : : if (SCALAR_FLOAT_TYPE_P (stept))
10308 : : {
10309 : : tree tem = build_int_cst (integer_type_node, vf);
10310 : : lupdate_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept, tem);
10311 : : }
10312 : : else
10313 : : lupdate_mul = build_int_cst (stept, vf);
10314 : : lupdate_mul = gimple_build_vector_from_val (&init_stmts, step_vectype,
10315 : : lupdate_mul);
10316 : : }
10317 : : }
10318 : 15146 : tree peel_mul = NULL_TREE;
10319 : 15146 : if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
10320 : : {
10321 : 0 : if (SCALAR_FLOAT_TYPE_P (stept))
10322 : 0 : peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
10323 : : LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10324 : : else
10325 : 0 : peel_mul = gimple_convert (&init_stmts, stept,
10326 : : LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10327 : 0 : peel_mul = gimple_build_vector_from_val (&init_stmts,
10328 : : step_vectype, peel_mul);
10329 : :
10330 : : /* If early break then we have to create a new PHI which we can use as
10331 : : an offset to adjust the induction reduction in early exits.
10332 : :
10333 : : This is because when peeling for alignment using masking, the first
10334 : : few elements of the vector can be inactive. As such, if we find the
10335 : : entry in the first iteration we have to adjust the starting point of
10336 : : the scalar code.
10337 : :
10338 : : We do this by creating a new scalar PHI that keeps track of whether
10339 : : we are the first iteration of the loop (with the additional masking)
10340 : : or whether we have taken a loop iteration already.
10341 : :
10342 : : The generated sequence:
10343 : :
10344 : : pre-header:
10345 : : bb1:
10346 : : i_1 = <number of leading inactive elements>
10347 : :
10348 : : header:
10349 : : bb2:
10350 : : i_2 = PHI <i_1(bb1), 0(latch)>
10351 : : …
10352 : :
10353 : : early-exit:
10354 : : bb3:
10355 : : i_3 = iv_step * i_2 + PHI<vector-iv>
10356 : :
10357 : : The first part of the adjustment, creating i_1 and i_2, is done here,
10358 : : and the last part, creating i_3, is done in
10359 : : vectorizable_live_operations when the induction extraction is
10360 : : materialized. */
10361 : 0 : if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
10362 : 0 : && !LOOP_VINFO_MASK_NITERS_PFA_OFFSET (loop_vinfo))
10363 : : {
10364 : 0 : auto skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
10365 : 0 : tree ty_skip_niters = TREE_TYPE (skip_niters);
10366 : 0 : tree break_lhs_phi = vect_get_new_vect_var (ty_skip_niters,
10367 : : vect_scalar_var,
10368 : : "pfa_iv_offset");
10369 : 0 : gphi *nphi = create_phi_node (break_lhs_phi, bb);
10370 : 0 : add_phi_arg (nphi, skip_niters, pe, UNKNOWN_LOCATION);
10371 : 0 : add_phi_arg (nphi, build_zero_cst (ty_skip_niters),
10372 : : loop_latch_edge (iv_loop), UNKNOWN_LOCATION);
10373 : :
10374 : 0 : LOOP_VINFO_MASK_NITERS_PFA_OFFSET (loop_vinfo) = PHI_RESULT (nphi);
10375 : : }
10376 : : }
10377 : 15146 : tree step_mul = NULL_TREE;
10378 : 15146 : unsigned ivn;
10379 : 15146 : auto_vec<tree> vec_steps;
10380 : 30790 : for (ivn = 0; ivn < nivs; ++ivn)
10381 : : {
10382 : 15644 : gimple_seq stmts = NULL;
10383 : 15644 : bool invariant = true;
10384 : 15644 : if (nunits.is_constant (&const_nunits))
10385 : : {
10386 : 15644 : tree_vector_builder step_elts (step_vectype, const_nunits, 1);
10387 : 15644 : tree_vector_builder init_elts (vectype, const_nunits, 1);
10388 : 15644 : tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
10389 : 98048 : for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
10390 : : {
10391 : : /* The scalar steps of the IVs. */
10392 : 82404 : tree elt = steps[(ivn*const_nunits + eltn) % group_size];
10393 : 82404 : elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
10394 : 82404 : step_elts.quick_push (elt);
10395 : 82404 : if (!init_node)
10396 : : {
10397 : : /* The scalar inits of the IVs if not vectorized. */
10398 : 81726 : elt = inits[(ivn*const_nunits + eltn) % group_size];
10399 : 81726 : if (!useless_type_conversion_p (TREE_TYPE (vectype),
10400 : 81726 : TREE_TYPE (elt)))
10401 : 214 : elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
10402 : 214 : TREE_TYPE (vectype), elt);
10403 : 81726 : init_elts.quick_push (elt);
10404 : : }
10405 : : /* The number of steps to add to the initial values. */
10406 : 82404 : unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
10407 : 164808 : mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
10408 : 164712 : ? build_real_from_wide (stept, mul_elt,
10409 : : UNSIGNED)
10410 : 164712 : : build_int_cstu (stept, mul_elt));
10411 : : }
10412 : 15644 : vec_step = gimple_build_vector (&init_stmts, &step_elts);
10413 : 15644 : step_mul = gimple_build_vector (&init_stmts, &mul_elts);
10414 : 15644 : if (!init_node)
10415 : 15528 : vec_init = gimple_build_vector (&init_stmts, &init_elts);
10416 : 15644 : }
10417 : : else
10418 : : {
10419 : : if (init_node)
10420 : : ;
10421 : : else if (INTEGRAL_TYPE_P (TREE_TYPE (steps[0])))
10422 : : {
10423 : : new_name = gimple_convert (&init_stmts, stept, inits[0]);
10424 : : /* Build the initial value directly as a VEC_SERIES_EXPR. */
10425 : : vec_init = gimple_build (&init_stmts, VEC_SERIES_EXPR,
10426 : : step_vectype, new_name, steps[0]);
10427 : : if (!useless_type_conversion_p (vectype, step_vectype))
10428 : : vec_init = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
10429 : : vectype, vec_init);
10430 : : }
10431 : : else
10432 : : {
10433 : : /* Build:
10434 : : [base, base, base, ...]
10435 : : + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
10436 : : gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (steps[0])));
10437 : : gcc_assert (flag_associative_math);
10438 : : gcc_assert (index_vectype != NULL_TREE);
10439 : :
10440 : : tree index = build_index_vector (index_vectype, 0, 1);
10441 : : new_name = gimple_convert (&init_stmts, TREE_TYPE (steps[0]),
10442 : : inits[0]);
10443 : : tree base_vec = gimple_build_vector_from_val (&init_stmts,
10444 : : step_vectype,
10445 : : new_name);
10446 : : tree step_vec = gimple_build_vector_from_val (&init_stmts,
10447 : : step_vectype,
10448 : : steps[0]);
10449 : : vec_init = gimple_build (&init_stmts, FLOAT_EXPR,
10450 : : step_vectype, index);
10451 : : vec_init = gimple_build (&init_stmts, MULT_EXPR,
10452 : : step_vectype, vec_init, step_vec);
10453 : : vec_init = gimple_build (&init_stmts, PLUS_EXPR,
10454 : : step_vectype, vec_init, base_vec);
10455 : : if (!useless_type_conversion_p (vectype, step_vectype))
10456 : : vec_init = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
10457 : : vectype, vec_init);
10458 : : }
10459 : : /* Generate the loop-invariant step vector:
10460 : : vec_step = [S, S, S, S]. */
10461 : : t = unshare_expr (steps[0]);
10462 : : gcc_assert (CONSTANT_CLASS_P (t)
10463 : : || TREE_CODE (t) == SSA_NAME);
10464 : : vec_step = gimple_build_vector_from_val (&init_stmts,
10465 : : step_vectype, t);
10466 : : }
10467 : 15644 : vec_steps.safe_push (vec_step);
10468 : 15644 : if (peel_mul)
10469 : : {
10470 : 0 : if (!step_mul)
10471 : : step_mul = peel_mul;
10472 : : else
10473 : 0 : step_mul = gimple_build (&init_stmts,
10474 : : MINUS_EXPR, step_vectype,
10475 : : step_mul, peel_mul);
10476 : : }
10477 : :
10478 : : /* Create the induction-phi that defines the induction-operand. */
10479 : 15644 : vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
10480 : : "vec_iv_");
10481 : 15644 : induction_phi = create_phi_node (vec_dest, iv_loop->header);
10482 : 15644 : induc_def = PHI_RESULT (induction_phi);
10483 : :
10484 : : /* Create the iv update inside the loop */
10485 : 15644 : tree up = vec_step;
10486 : 15644 : if (lupdate_mul)
10487 : : {
10488 : 15528 : if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
10489 : : {
10490 : : /* When we're using the loop_len produced by SELECT_VL, the
10491 : : non-final iterations do not always process VF
10492 : : elements. So instead of updating the induction variable with
10493 : :
10494 : : _21 = vect_vec_iv_.6_22 + { VF, ... };
10495 : :
10496 : : we should generate:
10497 : :
10498 : : _35 = .SELECT_VL (ivtmp_33, VF);
10499 : : vect_cst__22 = [vec_duplicate_expr] _35;
10500 : : _21 = vect_vec_iv_.6_22 + vect_cst__22; */
10501 : 0 : vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
10502 : 0 : tree len = vect_get_loop_len (loop_vinfo, NULL, lens, 1,
10503 : : vectype, 0, 0);
10504 : 0 : if (SCALAR_FLOAT_TYPE_P (stept))
10505 : 0 : expr = gimple_build (&stmts, FLOAT_EXPR, stept, len);
10506 : : else
10507 : 0 : expr = gimple_convert (&stmts, stept, len);
10508 : 0 : lupdate_mul = gimple_build_vector_from_val (&stmts, step_vectype,
10509 : : expr);
10510 : 0 : up = gimple_build (&stmts, MULT_EXPR,
10511 : : step_vectype, vec_step, lupdate_mul);
10512 : : }
10513 : : else
10514 : 15528 : up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10515 : : vec_step, lupdate_mul);
10516 : : }
10517 : 15644 : vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10518 : 15644 : vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, up);
10519 : 15644 : vec_def = gimple_convert (&stmts, vectype, vec_def);
10520 : 15644 : insert_iv_increment (&incr_si, insert_after, stmts);
10521 : 15644 : add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10522 : : UNKNOWN_LOCATION);
10523 : :
10524 : 15644 : if (init_node)
10525 : 116 : vec_init = vect_get_slp_vect_def (init_node, ivn);
10526 : 15644 : if (!nested_in_vect_loop
10527 : 15644 : && step_mul
10528 : 15644 : && !integer_zerop (step_mul))
10529 : : {
10530 : 15118 : gcc_assert (invariant);
10531 : 15118 : vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
10532 : 15118 : up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10533 : : vec_step, step_mul);
10534 : 15118 : vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
10535 : : vec_def, up);
10536 : 15118 : vec_init = gimple_convert (&init_stmts, vectype, vec_def);
10537 : : }
10538 : :
10539 : : /* Set the arguments of the phi node: */
10540 : 15644 : add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10541 : :
10542 : 15644 : slp_node->push_vec_def (induction_phi);
10543 : : }
10544 : 15146 : if (!nested_in_vect_loop)
10545 : : {
10546 : : /* Fill up to the number of vectors we need for the whole group. */
10547 : 15046 : if (nunits.is_constant (&const_nunits))
10548 : 15046 : nivs = least_common_multiple (group_size, const_nunits) / const_nunits;
10549 : : else
10550 : : nivs = 1;
10551 : 15046 : vec_steps.reserve (nivs-ivn);
10552 : 30119 : for (; ivn < nivs; ++ivn)
10553 : : {
10554 : 27 : slp_node->push_vec_def (SLP_TREE_VEC_DEFS (slp_node)[0]);
10555 : 27 : vec_steps.quick_push (vec_steps[0]);
10556 : : }
10557 : : }
10558 : :
10559 : : /* Re-use IVs when we can. We are generating further vector
10560 : : stmts by adding VF' * stride to the IVs generated above. */
10561 : 15146 : if (ivn < nvects)
10562 : : {
10563 : 3484 : if (nunits.is_constant (&const_nunits))
10564 : : {
10565 : 3484 : unsigned vfp = (least_common_multiple (group_size, const_nunits)
10566 : 3484 : / group_size);
10567 : 3484 : lupdate_mul
10568 : 3484 : = build_vector_from_val (step_vectype,
10569 : 3484 : SCALAR_FLOAT_TYPE_P (stept)
10570 : 7 : ? build_real_from_wide (stept,
10571 : 7 : vfp, UNSIGNED)
10572 : 6961 : : build_int_cstu (stept, vfp));
10573 : : }
10574 : : else
10575 : : {
10576 : : if (SCALAR_FLOAT_TYPE_P (stept))
10577 : : {
10578 : : tree tem = build_int_cst (integer_type_node, nunits);
10579 : : lupdate_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept, tem);
10580 : : }
10581 : : else
10582 : : lupdate_mul = build_int_cst (stept, nunits);
10583 : : lupdate_mul = gimple_build_vector_from_val (&init_stmts, step_vectype,
10584 : : lupdate_mul);
10585 : : }
10586 : 11214 : for (; ivn < nvects; ++ivn)
10587 : : {
10588 : 7730 : gimple *iv
10589 : 7730 : = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node)[ivn - nivs]);
10590 : 7730 : tree def = gimple_get_lhs (iv);
10591 : 7730 : if (ivn < 2*nivs)
10592 : 3530 : vec_steps[ivn - nivs]
10593 : 3530 : = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10594 : 3530 : vec_steps[ivn - nivs], lupdate_mul);
10595 : 7730 : gimple_seq stmts = NULL;
10596 : 7730 : def = gimple_convert (&stmts, step_vectype, def);
10597 : 23190 : def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10598 : 7730 : def, vec_steps[ivn % nivs]);
10599 : 7730 : def = gimple_convert (&stmts, vectype, def);
10600 : 7730 : if (gimple_code (iv) == GIMPLE_PHI)
10601 : 3530 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10602 : : else
10603 : : {
10604 : 4200 : gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
10605 : 4200 : gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
10606 : : }
10607 : 7730 : slp_node->push_vec_def (def);
10608 : : }
10609 : : }
10610 : :
10611 : 15146 : new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
10612 : 15146 : gcc_assert (!new_bb);
10613 : :
10614 : 15146 : return true;
10615 : 15146 : }
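: :
: : /* As a simple illustration (assumptions: a single-lane SLP node, VF == 4,
: : no peeling or SELECT_VL), for the scalar iv 'i = i + 1' the code generated
: : by the above is essentially
: :
: : vec_iv = PHI <[i0, i0+1, i0+2, i0+3](preheader), vec_next(latch)>
: : vec_next = vec_iv + { 4, 4, 4, 4 };
: :
: : with further copies, when needed, produced by adding multiples of the
: : vectorized step to the IVs already generated. */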
10616 : :
10617 : : /* Function vectorizable_live_operation_1.
10618 : :
10619 : : Helper function for vectorizable_live_operation. */
10620 : :
10621 : : static tree
10622 : 5170 : vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
10623 : : stmt_vec_info stmt_info, basic_block exit_bb,
10624 : : tree vectype, int ncopies, slp_tree slp_node,
10625 : : tree bitsize, tree bitstart, tree vec_lhs,
10626 : : tree lhs_type, gimple_stmt_iterator *exit_gsi)
10627 : : {
10628 : 5170 : gcc_assert (single_pred_p (exit_bb) || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10629 : :
10630 : 5170 : tree vec_lhs_phi = copy_ssa_name (vec_lhs);
10631 : 5170 : gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
10632 : 10733 : for (unsigned i = 0; i < gimple_phi_num_args (phi); i++)
10633 : 5563 : SET_PHI_ARG_DEF (phi, i, vec_lhs);
10634 : :
10635 : 5170 : gimple_seq stmts = NULL;
10636 : 5170 : tree new_tree;
10637 : :
10638 : : /* If bitstart is 0 then we can use a BIT_FIELD_REF */
10639 : 5170 : if (integer_zerop (bitstart))
10640 : : {
10641 : 2710 : tree scalar_res = gimple_build (&stmts, BIT_FIELD_REF, TREE_TYPE (vectype),
10642 : : vec_lhs_phi, bitsize, bitstart);
10643 : :
10644 : : /* Convert the extracted vector element to the scalar type. */
10645 : 2710 : new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10646 : : }
10647 : 2460 : else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10648 : : {
10649 : : /* Emit:
10650 : :
10651 : : SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
10652 : :
10653 : : where VEC_LHS is the vectorized live-out result and LEN is
10654 : : the loop length for the final iteration. */
10655 : 0 : gcc_assert (ncopies == 1
10656 : : && (!slp_node || SLP_TREE_LANES (slp_node) == 1));
10657 : 0 : gimple_seq tem = NULL;
10658 : 0 : gimple_stmt_iterator gsi = gsi_last (tem);
10659 : 0 : tree len = vect_get_loop_len (loop_vinfo, &gsi,
10660 : : &LOOP_VINFO_LENS (loop_vinfo),
10661 : : 1, vectype, 0, 1);
10662 : 0 : gimple_seq_add_seq (&stmts, tem);
10663 : :
10664 : : /* BIAS - 1. */
10665 : 0 : signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
10666 : 0 : tree bias_minus_one
10667 : 0 : = int_const_binop (MINUS_EXPR,
10668 : 0 : build_int_cst (TREE_TYPE (len), biasval),
10669 : 0 : build_one_cst (TREE_TYPE (len)));
10670 : :
10671 : : /* LAST_INDEX = LEN + (BIAS - 1). */
10672 : 0 : tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
10673 : : len, bias_minus_one);
10674 : :
10675 : : /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>. */
10676 : 0 : tree scalar_res
10677 : 0 : = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
10678 : : vec_lhs_phi, last_index);
10679 : :
10680 : : /* Convert the extracted vector element to the scalar type. */
10681 : 0 : new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10682 : : }
10683 : 2460 : else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
10684 : : {
10685 : : /* Emit:
10686 : :
10687 : : SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
10688 : :
10689 : : where VEC_LHS is the vectorized live-out result and MASK is
10690 : : the loop mask for the final iteration. */
10691 : 0 : gcc_assert (!slp_node || SLP_TREE_LANES (slp_node) == 1);
10692 : 0 : tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
10693 : 0 : gimple_seq tem = NULL;
10694 : 0 : gimple_stmt_iterator gsi = gsi_last (tem);
10695 : 0 : tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
10696 : : &LOOP_VINFO_MASKS (loop_vinfo),
10697 : : 1, vectype, 0);
10698 : 0 : tree scalar_res;
10699 : 0 : gimple_seq_add_seq (&stmts, tem);
10700 : :
10701 : 0 : scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
10702 : : mask, vec_lhs_phi);
10703 : :
10704 : : /* Convert the extracted vector element to the scalar type. */
10705 : 0 : new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10706 : : }
10707 : : else
10708 : : {
10709 : 2460 : tree bftype = TREE_TYPE (vectype);
10710 : 2460 : if (VECTOR_BOOLEAN_TYPE_P (vectype))
10711 : 85 : bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10712 : 2460 : new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
10713 : 2460 : new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10714 : : &stmts, true, NULL_TREE);
10715 : : }
10716 : :
10717 : 5170 : *exit_gsi = gsi_after_labels (exit_bb);
10718 : 5170 : if (stmts)
10719 : 5170 : gsi_insert_seq_before (exit_gsi, stmts, GSI_SAME_STMT);
10720 : :
10721 : 5170 : return new_tree;
10722 : : }
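: :
: : /* For illustration (values assumed, not from the sources): extracting the
: : live-out lane of a 4 x int vector with bitsize 32 and bitstart 96 yields
: : BIT_FIELD_REF <vec_lhs_phi, 32, 96>, i.e. the element at bit offset 96
: : (the fourth 32-bit lane); with partial vectors the lane is instead
: : selected dynamically via EXTRACT_LAST or VEC_EXTRACT at index
: : LEN + BIAS - 1 as shown above. */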
10723 : :
10724 : : /* Function vectorizable_live_operation.
10725 : :
10726 : : STMT_INFO computes a value that is used outside the loop. Check if
10727 : : it can be supported. */
10728 : :
10729 : : bool
10730 : 290873 : vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
10731 : : slp_tree slp_node, slp_instance slp_node_instance,
10732 : : int slp_index, bool vec_stmt_p,
10733 : : stmt_vector_for_cost *cost_vec)
10734 : : {
10735 : 290873 : loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
10736 : 290873 : imm_use_iterator imm_iter;
10737 : 290873 : tree lhs, lhs_type, bitsize;
10738 : 290873 : tree vectype = (slp_node
10739 : 290873 : ? SLP_TREE_VECTYPE (slp_node)
10740 : : : STMT_VINFO_VECTYPE (stmt_info));
10741 : 290873 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10742 : 290873 : int ncopies;
10743 : 290873 : gimple *use_stmt;
10744 : 290873 : use_operand_p use_p;
10745 : 290873 : auto_vec<tree> vec_oprnds;
10746 : 290873 : int vec_entry = 0;
10747 : 290873 : poly_uint64 vec_index = 0;
10748 : :
10749 : 290873 : gcc_assert (STMT_VINFO_LIVE_P (stmt_info)
10750 : : || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10751 : :
10752 : : /* If a stmt of a reduction is live, vectorize it via
10753 : : vect_create_epilog_for_reduction. vectorizable_reduction assessed
10754 : : validity so just trigger the transform here. */
10755 : 292688 : if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
10756 : : {
10757 : 51566 : if (!vec_stmt_p)
10758 : : return true;
10759 : : /* For SLP reductions we vectorize the epilogue for all involved stmts
10760 : : together. */
10761 : 22313 : if (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info) && slp_index != 0)
10762 : : return true;
10763 : 22007 : stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
10764 : 22007 : gcc_assert (reduc_info->is_reduc_info);
10765 : 22007 : if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
10766 : 22007 : || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
10767 : : return true;
10768 : :
10769 : 21173 : if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
10770 : 21173 : || !LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
10771 : 21169 : vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
10772 : : slp_node_instance,
10773 : : LOOP_VINFO_IV_EXIT (loop_vinfo));
10774 : :
10775 : : /* For an early-break loop we only have to materialize the reduction on the
10776 : : merge block, but we have to find an alternate exit first. */
10777 : 21173 : if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
10778 : : {
10779 : 23 : slp_tree phis_node = slp_node ? slp_node_instance->reduc_phis : NULL;
10780 : 69 : for (auto exit : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
10781 : 23 : if (exit != LOOP_VINFO_IV_EXIT (loop_vinfo))
10782 : : {
10783 : 23 : vect_create_epilog_for_reduction (loop_vinfo, reduc_info,
10784 : : phis_node, slp_node_instance,
10785 : : exit);
10786 : 23 : break;
10787 : 23 : }
10788 : 23 : if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
10789 : 4 : vect_create_epilog_for_reduction (loop_vinfo, reduc_info,
10790 : : phis_node, slp_node_instance,
10791 : : LOOP_VINFO_IV_EXIT (loop_vinfo));
10792 : : }
10793 : :
10794 : 21173 : return true;
10795 : : }
10796 : :
10797 : : /* If STMT is not relevant and it is a simple assignment and its inputs are
10798 : : invariant then it can remain in place, unvectorized. The original last
10799 : : scalar value that it computes will be used. */
10800 : 239307 : if (!STMT_VINFO_RELEVANT_P (stmt_info))
10801 : : {
10802 : 0 : gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
10803 : 0 : if (dump_enabled_p ())
10804 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
10805 : : "statement is simple and uses invariant. Leaving in "
10806 : : "place.\n");
10807 : 0 : return true;
10808 : : }
10809 : :
10810 : 239307 : if (slp_node)
10811 : : ncopies = 1;
10812 : : else
10813 : 0 : ncopies = vect_get_num_copies (loop_vinfo, vectype);
10814 : :
10815 : 0 : if (slp_node)
10816 : : {
10817 : 239307 : gcc_assert (slp_index >= 0);
10818 : :
10819 : : /* Get the last occurrence of the scalar index from the concatenation of
10820 : : all the slp vectors. Calculate which slp vector it is and the index
10821 : : within. */
10822 : 239307 : int num_scalar = SLP_TREE_LANES (slp_node);
10823 : 239307 : int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10824 : 239307 : poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
10825 : :
10826 : : /* Calculate which vector contains the result, and which lane of
10827 : : that vector we need. */
10828 : 239307 : if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
10829 : : {
10830 : : if (dump_enabled_p ())
10831 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10832 : : "Cannot determine which vector holds the"
10833 : : " final result.\n");
10834 : : return false;
10835 : : }
10836 : : }
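: :
: : /* Worked example (illustrative): with num_scalar == 2 lanes, nunits == 4
: : and num_vec == 2 vector stmts, lane slp_index == 1 last occurs at
: : pos = 2*4 - 2 + 1 = 7, giving vec_entry == 1 and vec_index == 3, i.e. the
: : last lane of the second vector. */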
10837 : :
10838 : 239307 : if (!vec_stmt_p)
10839 : : {
10840 : : /* No transformation required. */
10841 : 196419 : if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
10842 : : {
10843 : 1 : if (slp_node && SLP_TREE_LANES (slp_node) != 1)
10844 : : {
10845 : 0 : if (dump_enabled_p ())
10846 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10847 : : "can't operate on partial vectors "
10848 : : "because an SLP statement is live after "
10849 : : "the loop.\n");
10850 : 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10851 : : }
10852 : 1 : else if (ncopies > 1
10853 : 1 : || (slp_node && SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) > 1))
10854 : : {
10855 : 0 : if (dump_enabled_p ())
10856 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10857 : : "can't operate on partial vectors "
10858 : : "because ncopies is greater than 1.\n");
10859 : 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10860 : : }
10861 : : else
10862 : : {
10863 : 1 : gcc_assert (ncopies == 1
10864 : : && (!slp_node || SLP_TREE_LANES (slp_node) == 1));
10865 : 1 : if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
10866 : : OPTIMIZE_FOR_SPEED))
10867 : 0 : vect_record_loop_mask (loop_vinfo,
10868 : : &LOOP_VINFO_MASKS (loop_vinfo),
10869 : : 1, vectype, NULL);
10870 : 1 : else if (can_vec_extract_var_idx_p (
10871 : 1 : TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
10872 : 0 : vect_record_loop_len (loop_vinfo,
10873 : : &LOOP_VINFO_LENS (loop_vinfo),
10874 : : 1, vectype, 1);
10875 : : else
10876 : : {
10877 : 1 : if (dump_enabled_p ())
10878 : 0 : dump_printf_loc (
10879 : 0 : MSG_MISSED_OPTIMIZATION, vect_location,
10880 : : "can't operate on partial vectors "
10881 : : "because the target doesn't support extract "
10882 : : "last reduction.\n");
10883 : 1 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10884 : : }
10885 : : }
10886 : : }
10887 : : /* ??? Enable for loop costing as well. */
10888 : 1 : if (!loop_vinfo)
10889 : 99480 : record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
10890 : : 0, vect_epilogue);
10891 : 196419 : return true;
10892 : : }
10893 : :
10894 : : /* Use the lhs of the original scalar statement. */
10895 : 42888 : gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
10896 : 42888 : if (dump_enabled_p ())
10897 : 1436 : dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
10898 : : "stmt %G", stmt);
10899 : :
10900 : 42888 : lhs = gimple_get_lhs (stmt);
10901 : 42888 : lhs_type = TREE_TYPE (lhs);
10902 : :
10903 : 42888 : bitsize = vector_element_bits_tree (vectype);
10904 : :
10905 : : /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
10906 : 42888 : tree vec_lhs, vec_lhs0, bitstart;
10907 : 42888 : gimple *vec_stmt, *vec_stmt0;
10908 : 42888 : if (slp_node)
10909 : : {
10910 : 42888 : gcc_assert (!loop_vinfo
10911 : : || ((!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
10912 : : && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10913 : : || SLP_TREE_LANES (slp_node) == 1));
10914 : :
10915 : : /* Get the correct slp vectorized stmt. */
10916 : 42888 : vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
10917 : 42888 : vec_stmt = SSA_NAME_DEF_STMT (vec_lhs);
10918 : :
10919 : : /* In case we need to vectorize an early break, also get the first stmt. */
10920 : 42888 : vec_lhs0 = SLP_TREE_VEC_DEFS (slp_node)[0];
10921 : 42888 : vec_stmt0 = SSA_NAME_DEF_STMT (vec_lhs0);
10922 : :
10923 : : /* Get entry to use. */
10924 : 42888 : bitstart = bitsize_int (vec_index);
10925 : 42888 : bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
10926 : : }
10927 : : else
10928 : : {
10929 : : /* For multiple copies, get the last copy. */
10930 : 0 : vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
10931 : 0 : vec_lhs = gimple_get_lhs (vec_stmt);
10932 : :
10933 : : /* In case we need to vectorize an early break, also get the first stmt. */
10934 : 0 : vec_stmt0 = STMT_VINFO_VEC_STMTS (stmt_info)[0];
10935 : 0 : vec_lhs0 = gimple_get_lhs (vec_stmt0);
10936 : :
10937 : : /* Get the last lane in the vector. */
10938 : 0 : bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
10939 : : }
10940 : :
10941 : 42888 : if (loop_vinfo)
10942 : : {
10943 : : /* To ensure the VEC_LHS for lane extraction stmts satisfies the loop-closed
10944 : : PHI requirement, insert one phi node for it. It looks like:
10945 : : loop;
10946 : : BB:
10947 : : # lhs' = PHI <lhs>
10948 : : ==>
10949 : : loop;
10950 : : BB:
10951 : : # vec_lhs' = PHI <vec_lhs>
10952 : : new_tree = lane_extract <vec_lhs', ...>;
10953 : : lhs' = new_tree; */
10954 : :
10955 : 5219 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10956 : : /* Check if we have a loop where the chosen exit is not the main exit.
10957 : : In these cases, for an early break, we restart the iteration that the
10958 : : vector code was executing. For the live values we want the value at the
10959 : : start of that iteration rather than at the end. */
10960 : 5219 : edge main_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
10961 : 5219 : bool all_exits_as_early_p = LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo);
10962 : 22252 : FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10963 : 17033 : if (!is_gimple_debug (use_stmt)
10964 : 17033 : && !flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
10965 : 5170 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10966 : : {
10967 : 5170 : edge e = gimple_phi_arg_edge (as_a <gphi *> (use_stmt),
10968 : 5170 : phi_arg_index_from_use (use_p));
10969 : 5170 : gcc_assert (loop_exit_edge_p (loop, e));
10970 : 5170 : bool main_exit_edge = e == main_e;
10971 : 5170 : tree tmp_vec_lhs = vec_lhs;
10972 : 5170 : tree tmp_bitstart = bitstart;
10973 : :
10974 : : /* For an early exit where the exit is not in the BB that leads
10975 : : to the latch, we're restarting the iteration in the
10976 : : scalar loop. So get the first live value. */
10977 : 13024 : bool early_break_first_element_p
10978 : 5170 : = (all_exits_as_early_p || !main_exit_edge)
10979 : 5170 : && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def;
10980 : 2684 : if (early_break_first_element_p)
10981 : : {
10982 : 2684 : tmp_vec_lhs = vec_lhs0;
10983 : 2684 : tmp_bitstart = build_zero_cst (TREE_TYPE (bitstart));
10984 : : }
10985 : :
10986 : 5170 : gimple_stmt_iterator exit_gsi;
10987 : 5170 : tree new_tree
10988 : 5170 : = vectorizable_live_operation_1 (loop_vinfo, stmt_info,
10989 : : e->dest, vectype, ncopies,
10990 : : slp_node, bitsize,
10991 : : tmp_bitstart, tmp_vec_lhs,
10992 : : lhs_type, &exit_gsi);
10993 : :
10994 : 5170 : auto gsi = gsi_for_stmt (use_stmt);
10995 : 5170 : if (early_break_first_element_p
10996 : 2684 : && LOOP_VINFO_MASK_NITERS_PFA_OFFSET (loop_vinfo))
10997 : : {
10998 : 0 : tree step_expr
10999 : : = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
11000 : 0 : tree break_lhs_phi
11001 : : = LOOP_VINFO_MASK_NITERS_PFA_OFFSET (loop_vinfo);
11002 : 0 : tree ty_skip_niters = TREE_TYPE (break_lhs_phi);
11003 : 0 : gimple_seq iv_stmts = NULL;
11004 : :
11005 : : /* Now create the PHI for the outside loop usage to
11006 : : retrieve the value for the offset counter. */
11007 : 0 : tree rphi_step
11008 : 0 : = gimple_convert (&iv_stmts, ty_skip_niters, step_expr);
11009 : 0 : tree tmp2
11010 : 0 : = gimple_build (&iv_stmts, MULT_EXPR,
11011 : : ty_skip_niters, rphi_step,
11012 : : break_lhs_phi);
11013 : :
11014 : 0 : if (POINTER_TYPE_P (TREE_TYPE (new_tree)))
11015 : : {
11016 : 0 : tmp2 = gimple_convert (&iv_stmts, sizetype, tmp2);
11017 : 0 : tmp2 = gimple_build (&iv_stmts, POINTER_PLUS_EXPR,
11018 : 0 : TREE_TYPE (new_tree), new_tree,
11019 : : tmp2);
11020 : : }
11021 : : else
11022 : : {
11023 : 0 : tmp2 = gimple_convert (&iv_stmts, TREE_TYPE (new_tree),
11024 : : tmp2);
11025 : 0 : tmp2 = gimple_build (&iv_stmts, PLUS_EXPR,
11026 : 0 : TREE_TYPE (new_tree), new_tree,
11027 : : tmp2);
11028 : : }
11029 : :
11030 : 0 : new_tree = tmp2;
11031 : 0 : gsi_insert_seq_before (&exit_gsi, iv_stmts, GSI_SAME_STMT);
11032 : : }
11033 : :
11034 : 5170 : tree lhs_phi = gimple_phi_result (use_stmt);
11035 : 5170 : remove_phi_node (&gsi, false);
11036 : 5170 : gimple *copy = gimple_build_assign (lhs_phi, new_tree);
11037 : 5170 : gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
11038 : 5170 : break;
11039 : 5219 : }
11040 : :
11041 : : /* There are no further out-of-loop uses of lhs by LC-SSA construction. */
11042 : 17082 : FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
11043 : 11863 : gcc_assert (is_gimple_debug (use_stmt)
11044 : 5219 : || flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)));
11045 : : }
11046 : : else
11047 : : {
11048 : : /* For basic-block vectorization simply insert the lane-extraction. */
11049 : 37669 : tree bftype = TREE_TYPE (vectype);
11050 : 37669 : if (VECTOR_BOOLEAN_TYPE_P (vectype))
11051 : 0 : bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
11052 : 37669 : tree new_tree = build3 (BIT_FIELD_REF, bftype,
11053 : : vec_lhs, bitsize, bitstart);
11054 : 37669 : gimple_seq stmts = NULL;
11055 : 37669 : new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
11056 : : &stmts, true, NULL_TREE);
11057 : 37669 : if (TREE_CODE (new_tree) == SSA_NAME
11058 : 75338 : && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
11059 : 2 : SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
11060 : 37669 : if (is_a <gphi *> (vec_stmt))
11061 : : {
11062 : 2658 : gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
11063 : 2658 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
11064 : : }
11065 : : else
11066 : : {
11067 : 35011 : gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
11068 : 35011 : gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
11069 : : }
11070 : :
11071 : : /* Replace uses of lhs with the newly computed result. If the use stmt is a
11072 : : single-arg PHI, just replace all uses of the PHI result. This is necessary
11073 : : because the lcssa PHI defining lhs may come before the newly inserted stmt. */
11074 : 37669 : use_operand_p use_p;
11075 : 37669 : stmt_vec_info use_stmt_info;
11076 : 198284 : FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
11077 : 160615 : if (!is_gimple_debug (use_stmt)
11078 : 160615 : && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
11079 : 109304 : || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
11080 : : {
11081 : : /* ??? This can happen when the live lane ends up being
11082 : : rooted in a vector construction code-generated by an
11083 : : external SLP node (and code-generation for that already
11084 : : happened). See gcc.dg/vect/bb-slp-47.c.
11085 : : Doing this is what would happen if that vector CTOR
11086 : : were not code-generated yet so it is not too bad.
11087 : : ??? In fact we'd likely want to avoid this situation
11088 : : in the first place. */
11089 : 63830 : if (TREE_CODE (new_tree) == SSA_NAME
11090 : 63563 : && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
11091 : 63563 : && gimple_code (use_stmt) != GIMPLE_PHI
11092 : 119241 : && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
11093 : : use_stmt))
11094 : : {
11095 : 267 : if (dump_enabled_p ())
11096 : 20 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11097 : : "Using original scalar computation for "
11098 : : "live lane because use precedes vector "
11099 : : "def\n");
11100 : 267 : continue;
11101 : : }
11102 : : /* ??? It can also happen that we end up pulling a def into
11103 : : a loop where replacing out-of-loop uses would require
11104 : : a new LC SSA PHI node. Retain the original scalar in
11105 : : those cases as well. PR98064. */
11106 : 64889 : if (TREE_CODE (new_tree) == SSA_NAME
11107 : 63296 : && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
11108 : 63296 : && (gimple_bb (use_stmt)->loop_father
11109 : 63296 : != gimple_bb (vec_stmt)->loop_father)
11110 : 70213 : && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
11111 : 6917 : gimple_bb (use_stmt)->loop_father))
11112 : : {
11113 : 1593 : if (dump_enabled_p ())
11114 : 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11115 : : "Using original scalar computation for "
11116 : : "live lane because there is an out-of-loop "
11117 : : "definition for it\n");
11118 : 1593 : continue;
11119 : : }
11120 : 187077 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
11121 : 62687 : SET_USE (use_p, new_tree);
11122 : 61703 : update_stmt (use_stmt);
11123 : 37669 : }
11124 : : }
11125 : :
11126 : : return true;
11127 : 290873 : }
11128 : :
11129 : : /* Given the loop represented by LOOP_VINFO, return true if the computation of
11130 : : LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
11131 : : otherwise. */
11132 : :
11133 : : static bool
11134 : 56872 : loop_niters_no_overflow (loop_vec_info loop_vinfo)
11135 : : {
11136 : : /* Constant case. */
11137 : 56872 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
11138 : : {
11139 : 32933 : tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
11140 : 32933 : tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
11141 : :
11142 : 32933 : gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
11143 : 32933 : gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
11144 : 32933 : if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
11145 : : return true;
11146 : : }
11147 : :
11148 : 23939 : widest_int max;
11149 : 23939 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11150 : : /* Check the upper bound of loop niters. */
11151 : 23939 : if (get_max_loop_iterations (loop, &max))
11152 : : {
11153 : 23939 : tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
11154 : 23939 : signop sgn = TYPE_SIGN (type);
11155 : 23939 : widest_int type_max = widest_int::from (wi::max_value (type), sgn);
11156 : 23939 : if (max < type_max)
11157 : 23731 : return true;
11158 : 23939 : }
11159 : : return false;
11160 : 23939 : }
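 : : /* Editorial note: as a worked example of the check above (assuming a 32-bit
 : : unsigned niters type), NITERS = NITERSM1 + 1 can only wrap when NITERSM1
 : : equals the type maximum 0xffffffff; if get_max_loop_iterations bounds the
 : : latch count strictly below that maximum, the addition cannot overflow and
 : : the function returns true. */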
11161 : :
11162 : : /* Return a mask type with half the number of elements as OLD_TYPE,
11163 : : given that it should have mode NEW_MODE. */
11164 : :
11165 : : tree
11166 : 4236 : vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
11167 : : {
11168 : 4236 : poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
11169 : 4236 : return build_truth_vector_type_for_mode (nunits, new_mode);
11170 : : }
11171 : :
11172 : : /* Return a mask type with twice as many elements as OLD_TYPE,
11173 : : given that it should have mode NEW_MODE. */
11174 : :
11175 : : tree
11176 : 2025 : vect_double_mask_nunits (tree old_type, machine_mode new_mode)
11177 : : {
11178 : 2025 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
11179 : 2025 : return build_truth_vector_type_for_mode (nunits, new_mode);
11180 : : }
11181 : :
11182 : : /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
11183 : : contain a sequence of NVECTORS masks that each control a vector of type
11184 : : VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
11185 : : these vector masks with the vector version of SCALAR_MASK. */
11186 : :
11187 : : void
11188 : 124 : vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
11189 : : unsigned int nvectors, tree vectype, tree scalar_mask)
11190 : : {
11191 : 124 : gcc_assert (nvectors != 0);
11192 : :
11193 : 124 : if (scalar_mask)
11194 : : {
11195 : 22 : scalar_cond_masked_key cond (scalar_mask, nvectors);
11196 : 22 : loop_vinfo->scalar_cond_masked_set.add (cond);
11197 : : }
11198 : :
11199 : 124 : masks->mask_set.add (std::make_pair (vectype, nvectors));
11200 : 124 : }
11201 : :
11202 : : /* Given a complete set of masks MASKS, extract mask number INDEX
11203 : : for an rgroup that operates on NVECTORS vectors of type VECTYPE,
11204 : : where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
11205 : :
11206 : : See the comment above vec_loop_masks for more details about the mask
11207 : : arrangement. */
11208 : :
11209 : : tree
11210 : 89 : vect_get_loop_mask (loop_vec_info loop_vinfo,
11211 : : gimple_stmt_iterator *gsi, vec_loop_masks *masks,
11212 : : unsigned int nvectors, tree vectype, unsigned int index)
11213 : : {
11214 : 89 : if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
11215 : : == vect_partial_vectors_while_ult)
11216 : : {
11217 : 0 : rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
11218 : 0 : tree mask_type = rgm->type;
11219 : :
11220 : : /* Populate the rgroup's mask array, if this is the first time we've
11221 : : used it. */
11222 : 0 : if (rgm->controls.is_empty ())
11223 : : {
11224 : 0 : rgm->controls.safe_grow_cleared (nvectors, true);
11225 : 0 : for (unsigned int i = 0; i < nvectors; ++i)
11226 : : {
11227 : 0 : tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
11228 : : /* Provide a dummy definition until the real one is available. */
11229 : 0 : SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
11230 : 0 : rgm->controls[i] = mask;
11231 : : }
11232 : : }
11233 : :
11234 : 0 : tree mask = rgm->controls[index];
11235 : 0 : if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
11236 : 0 : TYPE_VECTOR_SUBPARTS (vectype)))
11237 : : {
11238 : : /* A loop mask for data type X can be reused for data type Y
11239 : : if X has N times more elements than Y and if Y's elements
11240 : : are N times bigger than X's. In this case each sequence
11241 : : of N elements in the loop mask will be all-zero or all-one.
11242 : : We can then view-convert the mask so that each sequence of
11243 : : N elements is replaced by a single element. */
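 : : /* Editorial example (hypothetical types): a mask computed for 16 QImode
 : : elements can be reused for 4 SImode elements. Since SImode is 4 times
 : : wider than QImode, each aligned group of 4 mask elements is all-zero or
 : : all-one, and the VIEW_CONVERT below collapses each such group into a
 : : single element of the 4-element mask. */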
11244 : 0 : gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
11245 : : TYPE_VECTOR_SUBPARTS (vectype)));
11246 : 0 : gimple_seq seq = NULL;
11247 : 0 : mask_type = truth_type_for (vectype);
11248 : 0 : mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
11249 : 0 : if (seq)
11250 : 0 : gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11251 : : }
11252 : 0 : return mask;
11253 : : }
11254 : 89 : else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
11255 : : == vect_partial_vectors_avx512)
11256 : : {
11257 : : /* The number of scalars per iteration and the number of vectors are
11258 : : both compile-time constants. */
11259 : 89 : unsigned int nscalars_per_iter
11260 : 89 : = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11261 : 89 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
11262 : :
11263 : 89 : rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];
11264 : :
11265 : : /* The stored nV is dependent on the mask type produced. */
11266 : 89 : gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11267 : : TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
11268 : : == rgm->factor);
11269 : 89 : nvectors = rgm->factor;
11270 : :
11271 : : /* Populate the rgroup's mask array, if this is the first time we've
11272 : : used it. */
11273 : 89 : if (rgm->controls.is_empty ())
11274 : : {
11275 : 10 : rgm->controls.safe_grow_cleared (nvectors, true);
11276 : 21 : for (unsigned int i = 0; i < nvectors; ++i)
11277 : : {
11278 : 11 : tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
11279 : : /* Provide a dummy definition until the real one is available. */
11280 : 11 : SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
11281 : 11 : rgm->controls[i] = mask;
11282 : : }
11283 : : }
11284 : 89 : if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
11285 : : TYPE_VECTOR_SUBPARTS (vectype)))
11286 : 73 : return rgm->controls[index];
11287 : :
11288 : : /* Split the vector if needed. Since we are dealing with integer-mode
11289 : : masks with AVX512 we can operate on the integer representation and
11290 : : shift the whole vector. */
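 : : /* Editorial example (hypothetical sizes): if rgm->type has 16 elements and
 : : VECTYPE has 4, then factor = 4; for index = 5 we pick control vector
 : : vi = 1, vpart = 1, view it as a 16-bit integer, shift it right by
 : : 4 * 1 = 4 bits, and the low 4 bits form the requested mask. */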
11291 : 16 : unsigned HOST_WIDE_INT factor;
11292 : 16 : bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
11293 : 16 : TYPE_VECTOR_SUBPARTS (vectype), &factor);
11294 : 0 : gcc_assert (ok);
11295 : 16 : gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
11296 : 16 : tree mask_type = truth_type_for (vectype);
11297 : 16 : gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
11298 : 16 : unsigned vi = index / factor;
11299 : 16 : unsigned vpart = index % factor;
11300 : 16 : tree vec = rgm->controls[vi];
11301 : 16 : gimple_seq seq = NULL;
11302 : 16 : vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
11303 : 16 : lang_hooks.types.type_for_mode
11304 : 16 : (TYPE_MODE (rgm->type), 1), vec);
11305 : : /* For integer mode masks simply shift the right bits into position. */
11306 : 16 : if (vpart != 0)
11307 : 12 : vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
11308 : : build_int_cst (integer_type_node,
11309 : 24 : (TYPE_VECTOR_SUBPARTS (vectype)
11310 : 12 : * vpart)));
11311 : 16 : vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
11312 : 16 : (TYPE_MODE (mask_type), 1), vec);
11313 : 16 : vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
11314 : 16 : if (seq)
11315 : 16 : gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11316 : 16 : return vec;
11317 : : }
11318 : : else
11319 : 0 : gcc_unreachable ();
11320 : : }
11321 : :
11322 : : /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
11323 : : lengths for controlling an operation on VECTYPE. The operation splits
11324 : : each element of VECTYPE into FACTOR separate subelements, measuring the
11325 : : length as a number of these subelements. */
11326 : :
11327 : : void
11328 : 0 : vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
11329 : : unsigned int nvectors, tree vectype, unsigned int factor)
11330 : : {
11331 : 0 : gcc_assert (nvectors != 0);
11332 : 0 : if (lens->length () < nvectors)
11333 : 0 : lens->safe_grow_cleared (nvectors, true);
11334 : 0 : rgroup_controls *rgl = &(*lens)[nvectors - 1];
11335 : :
11336 : : /* The number of scalars per iteration, the number of bytes each scalar
11337 : : occupies and the number of vectors are all compile-time constants. */
11338 : 0 : unsigned int nscalars_per_iter
11339 : 0 : = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11340 : 0 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
11341 : :
11342 : 0 : if (rgl->max_nscalars_per_iter < nscalars_per_iter)
11343 : : {
11344 : : /* For now, we only support cases in which all loads and stores fall back
11345 : : to VnQI or none do. */
11346 : 0 : gcc_assert (!rgl->max_nscalars_per_iter
11347 : : || (rgl->factor == 1 && factor == 1)
11348 : : || (rgl->max_nscalars_per_iter * rgl->factor
11349 : : == nscalars_per_iter * factor));
11350 : 0 : rgl->max_nscalars_per_iter = nscalars_per_iter;
11351 : 0 : rgl->type = vectype;
11352 : 0 : rgl->factor = factor;
11353 : : }
11354 : 0 : }
11355 : :
11356 : : /* Given a complete set of lengths LENS, extract length number INDEX
11357 : : for an rgroup that operates on NVECTORS vectors of type VECTYPE,
11358 : : where 0 <= INDEX < NVECTORS. Return a value that contains FACTOR
11359 : : multiplied by the number of elements that should be processed.
11360 : : Insert any set-up statements before GSI. */
11361 : :
11362 : : tree
11363 : 0 : vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
11364 : : vec_loop_lens *lens, unsigned int nvectors, tree vectype,
11365 : : unsigned int index, unsigned int factor)
11366 : : {
11367 : 0 : rgroup_controls *rgl = &(*lens)[nvectors - 1];
11368 : 0 : bool use_bias_adjusted_len =
11369 : 0 : LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
11370 : :
11371 : : /* Populate the rgroup's len array, if this is the first time we've
11372 : : used it. */
11373 : 0 : if (rgl->controls.is_empty ())
11374 : : {
11375 : 0 : rgl->controls.safe_grow_cleared (nvectors, true);
11376 : 0 : for (unsigned int i = 0; i < nvectors; ++i)
11377 : : {
11378 : 0 : tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
11379 : 0 : gcc_assert (len_type != NULL_TREE);
11380 : :
11381 : 0 : tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
11382 : :
11383 : : /* Provide a dummy definition until the real one is available. */
11384 : 0 : SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
11385 : 0 : rgl->controls[i] = len;
11386 : :
11387 : 0 : if (use_bias_adjusted_len)
11388 : : {
11389 : 0 : gcc_assert (i == 0);
11390 : 0 : tree adjusted_len =
11391 : 0 : make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
11392 : 0 : SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
11393 : 0 : rgl->bias_adjusted_ctrl = adjusted_len;
11394 : : }
11395 : : }
11396 : : }
11397 : :
11398 : 0 : if (use_bias_adjusted_len)
11399 : 0 : return rgl->bias_adjusted_ctrl;
11400 : :
11401 : 0 : tree loop_len = rgl->controls[index];
11402 : 0 : if (rgl->factor == 1 && factor == 1)
11403 : : {
11404 : 0 : poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
11405 : 0 : poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
11406 : 0 : if (maybe_ne (nunits1, nunits2))
11407 : : {
11408 : : /* A loop len for data type X can be reused for data type Y
11409 : : if X has N times more elements than Y and if Y's elements
11410 : : are N times bigger than X's. */
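 : : /* Editorial example (hypothetical types): a length recorded for a
 : : 16-element rgl->type reused for a 4-element VECTYPE gives factor = 4,
 : : so a stored length of e.g. 12 subelements is divided down to 3 active
 : : elements by the division built below. */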
11411 : 0 : gcc_assert (multiple_p (nunits1, nunits2));
11412 : 0 : factor = exact_div (nunits1, nunits2).to_constant ();
11413 : 0 : tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
11414 : 0 : gimple_seq seq = NULL;
11415 : 0 : loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
11416 : 0 : build_int_cst (iv_type, factor));
11417 : 0 : if (seq)
11418 : 0 : gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11419 : : }
11420 : : }
11421 : : return loop_len;
11422 : : }
11423 : :
11424 : : /* Generate the tree for the loop length mask and return it. Given LENS,
11425 : : NVECTORS, VECTYPE, INDEX and FACTOR, generate the length mask as below:
11426 : :
11427 : : tree len_mask = VCOND_MASK_LEN (compare_mask, ones, zero, len, bias)
11428 : : */
11429 : : tree
11430 : 0 : vect_gen_loop_len_mask (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
11431 : : gimple_stmt_iterator *cond_gsi, vec_loop_lens *lens,
11432 : : unsigned int nvectors, tree vectype, tree stmt,
11433 : : unsigned int index, unsigned int factor)
11434 : : {
11435 : 0 : tree all_one_mask = build_all_ones_cst (vectype);
11436 : 0 : tree all_zero_mask = build_zero_cst (vectype);
11437 : 0 : tree len = vect_get_loop_len (loop_vinfo, gsi, lens, nvectors, vectype, index,
11438 : : factor);
11439 : 0 : tree bias = build_int_cst (intQI_type_node,
11440 : 0 : LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo));
11441 : 0 : tree len_mask = make_temp_ssa_name (TREE_TYPE (stmt), NULL, "vec_len_mask");
11442 : 0 : gcall *call = gimple_build_call_internal (IFN_VCOND_MASK_LEN, 5, stmt,
11443 : : all_one_mask, all_zero_mask, len,
11444 : : bias);
11445 : 0 : gimple_call_set_lhs (call, len_mask);
11446 : 0 : gsi_insert_before (cond_gsi, call, GSI_SAME_STMT);
11447 : :
11448 : 0 : return len_mask;
11449 : : }
11450 : :
11451 : : /* Scale profiling counters by estimation for LOOP which is vectorized
11452 : : by factor VF.
11453 : : If FLAT is true, the loop we started with had an unrealistically flat
11454 : : profile. */
11455 : :
11456 : : static void
11457 : 56872 : scale_profile_for_vect_loop (class loop *loop, edge exit_e, unsigned vf, bool flat)
11458 : : {
11459 : : /* For flat profiles do not scale down proportionally by VF and only
11460 : : cap by known iteration count bounds. */
11461 : 56872 : if (flat)
11462 : : {
11463 : 32312 : if (dump_file && (dump_flags & TDF_DETAILS))
11464 : 4782 : fprintf (dump_file,
11465 : : "Vectorized loop profile seems flat; not scaling iteration "
11466 : : "count down by the vectorization factor %i\n", vf);
11467 : 32312 : scale_loop_profile (loop, profile_probability::always (),
11468 : : get_likely_max_loop_iterations_int (loop));
11469 : 32312 : return;
11470 : : }
11471 : : /* The loop body executes VF times fewer iterations and the exit is taken VF times more often. */
11472 : 24560 : profile_count entry_count = loop_preheader_edge (loop)->count ();
11473 : :
11474 : : /* If we have an unreliable loop profile, avoid scaling the header
11475 : : count down below the entry count. This can happen when a loop
11476 : : has an unrealistically low trip count. */
11477 : 24560 : while (vf > 1
11478 : 25725 : && loop->header->count > entry_count
11479 : 52282 : && loop->header->count < entry_count * vf)
11480 : : {
11481 : 1997 : if (dump_file && (dump_flags & TDF_DETAILS))
11482 : 152 : fprintf (dump_file,
11483 : : "Vectorization factor %i seems too large for profile "
11484 : : "previously believed to be consistent; reducing.\n", vf);
11485 : 1997 : vf /= 2;
11486 : : }
11487 : :
11488 : 24560 : if (entry_count.nonzero_p ())
11489 : 24560 : set_edge_probability_and_rescale_others
11490 : 24560 : (exit_e,
11491 : 24560 : entry_count.probability_in (loop->header->count / vf));
11492 : : /* Avoid producing a very large exit probability when we do not have a
11493 : : sensible profile. */
11494 : 0 : else if (exit_e->probability < profile_probability::always () / (vf * 2))
11495 : 0 : set_edge_probability_and_rescale_others (exit_e, exit_e->probability * vf);
11496 : 24560 : loop->latch->count = single_pred_edge (loop->latch)->count ();
11497 : :
11498 : 24560 : scale_loop_profile (loop, profile_probability::always () / vf,
11499 : : get_likely_max_loop_iterations_int (loop));
11500 : : }
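 : : /* Editorial worked example for the scaling above (hypothetical counts):
 : : with an entry count of 100, a header count of 1600 and VF == 8, the
 : : scaled header count is 1600 / 8 = 200, so the exit probability becomes
 : : 100 / 200 = 50%, i.e. on average two vector iterations per entry in
 : : place of sixteen scalar iterations. */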
11501 : :
11502 : : /* Helper function to pass to simplify_replace_tree to enable replacing trees
11503 : : in the hash_map with their corresponding values. */
11504 : :
11505 : : static tree
11506 : 7414 : find_in_mapping (tree t, void *context)
11507 : : {
11508 : 7414 : hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
11509 : :
11510 : 7414 : tree *value = mapping->get (t);
11511 : 7414 : return value ? *value : t;
11512 : : }
11513 : :
11514 : : /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
11515 : : original loop that has now been vectorized.
11516 : :
11517 : : The inits of the data_references need to be advanced with the number of
11518 : : iterations of the main loop. This has been computed in vect_do_peeling and
11519 : : is stored in parameter ADVANCE. We first restore the data_references
11520 : : is stored in parameter ADVANCE. We first restore the data_references'
11521 : : initial offsets with the values recorded in ORIG_DRS_INIT.
11522 : : Since the loop_vec_info of this EPILOGUE was constructed for the original
11523 : : loop, its stmt_vec_infos all point to the original statements. These need
11524 : : to be updated to point to their corresponding copies as well as the SSA_NAMES
11525 : : in their PATTERN_DEF_SEQs and RELATED_STMTs.
11526 : :
11527 : : The data_reference's connections also need to be updated. Their
11528 : : corresponding dr_vec_infos need to be reconnected to the EPILOGUE's
11529 : : stmt_vec_infos, their statements need to point to their corresponding copy,
11530 : : if they are gather loads or scatter stores then their reference needs to be
11531 : : updated to point to its corresponding copy. */
11532 : :
11533 : : static void
11534 : 6517 : update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
11535 : : {
11536 : 6517 : loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
11537 : 6517 : auto_vec<gimple *> stmt_worklist;
11538 : 6517 : hash_map<tree,tree> mapping;
11539 : 6517 : gimple *orig_stmt, *new_stmt;
11540 : 6517 : gimple_stmt_iterator epilogue_gsi;
11541 : 6517 : gphi_iterator epilogue_phi_gsi;
11542 : 6517 : stmt_vec_info stmt_vinfo = NULL, related_vinfo;
11543 : 6517 : basic_block *epilogue_bbs = get_loop_body (epilogue);
11544 : 6517 : unsigned i;
11545 : :
11546 : 6517 : free (LOOP_VINFO_BBS (epilogue_vinfo));
11547 : 6517 : LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
11548 : 6517 : LOOP_VINFO_NBBS (epilogue_vinfo) = epilogue->num_nodes;
11549 : :
11550 : : /* The EPILOGUE loop is a copy of the original loop so they share the same
11551 : : gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
11552 : : point to the copied statements. We also create a mapping from all LHSs in
11553 : : the original loop to all the LHSs in the EPILOGUE and create worklists to
11554 : : update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
11555 : 19551 : for (unsigned i = 0; i < epilogue->num_nodes; ++i)
11556 : : {
11557 : 13034 : for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
11558 : 33894 : !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
11559 : : {
11560 : 20860 : new_stmt = epilogue_phi_gsi.phi ();
11561 : :
11562 : 20860 : gcc_assert (gimple_uid (new_stmt) > 0);
11563 : 20860 : stmt_vinfo
11564 : 20860 : = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11565 : :
11566 : 20860 : orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11567 : 20860 : STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11568 : :
11569 : 41720 : mapping.put (gimple_phi_result (orig_stmt),
11570 : 20860 : gimple_phi_result (new_stmt));
11571 : : /* PHI nodes can not have patterns or related statements. */
11572 : 20860 : gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
11573 : : && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
11574 : : }
11575 : :
11576 : 26068 : for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
11577 : 132263 : !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
11578 : : {
11579 : 119229 : new_stmt = gsi_stmt (epilogue_gsi);
11580 : 119229 : if (is_gimple_debug (new_stmt))
11581 : 21636 : continue;
11582 : :
11583 : 97593 : gcc_assert (gimple_uid (new_stmt) > 0);
11584 : 97593 : stmt_vinfo
11585 : 97593 : = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11586 : :
11587 : 97593 : orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11588 : 97593 : STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11589 : :
11590 : 97593 : if (tree old_lhs = gimple_get_lhs (orig_stmt))
11591 : 91009 : mapping.put (old_lhs, gimple_get_lhs (new_stmt));
11592 : :
11593 : 97593 : if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
11594 : : {
11595 : : gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
11596 : : for (gimple_stmt_iterator gsi = gsi_start (seq);
11597 : 3957 : !gsi_end_p (gsi); gsi_next (&gsi))
11598 : 2524 : stmt_worklist.safe_push (gsi_stmt (gsi));
11599 : : }
11600 : :
11601 : 97593 : related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
11602 : 97593 : if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
11603 : : {
11604 : 1749 : gimple *stmt = STMT_VINFO_STMT (related_vinfo);
11605 : 1749 : stmt_worklist.safe_push (stmt);
11606 : : /* Set BB such that the assert in
11607 : : 'get_initial_defs_for_reduction' is able to determine that
11608 : : the BB of the related stmt is inside this loop. */
11609 : 1749 : gimple_set_bb (stmt,
11610 : : gimple_bb (new_stmt));
11611 : 1749 : related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
11612 : 1749 : gcc_assert (related_vinfo == NULL
11613 : : || related_vinfo == stmt_vinfo);
11614 : : }
11615 : : }
11616 : : }
11617 : :
11618 : : /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
11619 : : using the original main loop and thus need to be updated to refer to the
11620 : : cloned variables used in the epilogue. */
11621 : 10790 : for (unsigned i = 0; i < stmt_worklist.length (); ++i)
11622 : : {
11623 : 4273 : gimple *stmt = stmt_worklist[i];
11624 : 4273 : tree *new_op;
11625 : :
11626 : 11059 : for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
11627 : : {
11628 : 6786 : tree op = gimple_op (stmt, j);
11629 : 6786 : if ((new_op = mapping.get(op)))
11630 : 1702 : gimple_set_op (stmt, j, *new_op);
11631 : : else
11632 : : {
11633 : : /* PR92429: The last argument of simplify_replace_tree disables
11634 : : folding when replacing arguments. This is required as
11635 : : otherwise you might end up with different statements than the
11636 : : ones analyzed in vect_loop_analyze, leading to different
11637 : : vectorization. */
11638 : 5084 : op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
11639 : : &find_in_mapping, &mapping, false);
11640 : 5084 : gimple_set_op (stmt, j, op);
11641 : : }
11642 : : }
11643 : : }
11644 : :
11645 : 6517 : struct data_reference *dr;
11646 : 6517 : vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
11647 : 28111 : FOR_EACH_VEC_ELT (datarefs, i, dr)
11648 : : {
11649 : 21594 : orig_stmt = DR_STMT (dr);
11650 : 21594 : gcc_assert (gimple_uid (orig_stmt) > 0);
11651 : 21594 : stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
11652 : : /* Data references for gather loads and scatter stores do not use the
11653 : : updated offset we set using ADVANCE. Instead we have to make sure the
11654 : : references in the data references point to the corresponding copy of
11655 : : the original in the epilogue. Make sure to update both
11656 : : gather/scatters recognized by dataref analysis and also other
11657 : : refs that get_load_store_type classified as VMAT_GATHER_SCATTER. */
11658 : 21594 : auto vstmt_vinfo = vect_stmt_to_vectorize (stmt_vinfo);
11659 : 21594 : if (STMT_VINFO_MEMORY_ACCESS_TYPE (vstmt_vinfo) == VMAT_GATHER_SCATTER
11660 : 21418 : || STMT_VINFO_STRIDED_P (vstmt_vinfo)
11661 : 19779 : || STMT_VINFO_GATHER_SCATTER_P (vstmt_vinfo))
11662 : : {
11663 : : /* ??? As we copy epilogues from the main loop, incremental
11664 : : replacement of an already replaced DR_REF from vectorizing
11665 : : the first epilogue will fail. */
11666 : 2055 : DR_REF (dr)
11667 : 2055 : = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
11668 : : &find_in_mapping, &mapping);
11669 : 2055 : DR_BASE_ADDRESS (dr)
11670 : 2055 : = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
11671 : : &find_in_mapping, &mapping);
11672 : : }
11673 : 21594 : DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
11674 : : }
11675 : :
11676 : : /* Advance the data_references with the number of iterations of the previous
11677 : : loop and its prologue. */
11678 : 6517 : vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
11679 : :
11680 : : /* Remember the advancement made. */
11681 : 6517 : LOOP_VINFO_DRS_ADVANCED_BY (epilogue_vinfo) = advance;
11682 : :
11683 : 6517 : epilogue_vinfo->shared->datarefs_copy.release ();
11684 : 6517 : epilogue_vinfo->shared->save_datarefs ();
11685 : 6517 : }
11686 : :
11687 : : /* When vectorizing early break statements, instructions that happen before
11688 : : the early break in the current BB need to be moved to after the early
11689 : : break. This function deals with that and assumes that any validity
11690 : : checks have already been performed.
11691 : :
11692 : : While moving the instructions, if it encounters a VUSE or VDEF it
11693 : : corrects the VUSEs as it moves the statements along. GDEST is the location
11694 : : in which to insert the new statements. */
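 : : /* Editorial sketch (hypothetical GIMPLE, a single store that must move):
 : :
 : : before: *p_1 = x_2; if (c_3 != 0) goto exit;
 : : after: if (c_3 != 0) goto exit; *p_1 = x_2;
 : :
 : : Loads recorded in LOOP_VINFO_EARLY_BRK_VUSES, and the LC virtual PHIs on
 : : exits whose source is not dominated by the new position of the stores,
 : : are then rewired to the last VUSE/VDEF seen while moving. */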
11695 : :
11696 : : static void
11697 : 1411 : move_early_exit_stmts (loop_vec_info loop_vinfo)
11698 : : {
11699 : 1411 : DUMP_VECT_SCOPE ("move_early_exit_stmts");
11700 : :
11701 : 1411 : if (LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).is_empty ())
11702 : 1207 : return;
11703 : :
11704 : : /* Move all stmts that need moving. */
11705 : 204 : basic_block dest_bb = LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo);
11706 : 204 : gimple_stmt_iterator dest_gsi = gsi_after_labels (dest_bb);
11707 : :
11708 : 204 : tree last_seen_vuse = NULL_TREE;
11709 : 503 : for (gimple *stmt : LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo))
11710 : : {
11711 : : /* We have to update crossed degenerate virtual PHIs. Simply
11712 : : elide them. */
11713 : 299 : if (gphi *vphi = dyn_cast <gphi *> (stmt))
11714 : : {
11715 : 7 : tree vdef = gimple_phi_result (vphi);
11716 : 7 : tree vuse = gimple_phi_arg_def (vphi, 0);
11717 : 7 : imm_use_iterator iter;
11718 : 7 : use_operand_p use_p;
11719 : 7 : gimple *use_stmt;
11720 : 23 : FOR_EACH_IMM_USE_STMT (use_stmt, iter, vdef)
11721 : : {
11722 : 48 : FOR_EACH_IMM_USE_ON_STMT (use_p, iter)
11723 : 16 : SET_USE (use_p, vuse);
11724 : 7 : }
11725 : 7 : auto gsi = gsi_for_stmt (stmt);
11726 : 7 : remove_phi_node (&gsi, true);
11727 : 7 : last_seen_vuse = vuse;
11728 : 7 : continue;
11729 : 7 : }
11730 : :
11731 : : /* Check to see if the statement is still required for vectorization or
11732 : : has been elided. */
11733 : 292 : auto stmt_info = loop_vinfo->lookup_stmt (stmt);
11734 : 292 : if (!stmt_info)
11735 : 0 : continue;
11736 : :
11737 : 292 : if (dump_enabled_p ())
11738 : 147 : dump_printf_loc (MSG_NOTE, vect_location, "moving stmt %G", stmt);
11739 : :
11740 : 292 : gimple_stmt_iterator stmt_gsi = gsi_for_stmt (stmt);
11741 : 292 : gsi_move_before (&stmt_gsi, &dest_gsi, GSI_NEW_STMT);
11742 : 584 : last_seen_vuse = gimple_vuse (stmt);
11743 : : }
11744 : :
11745 : : /* Update all the stmts with their new reaching VUSES. */
11746 : 628 : for (auto p : LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo))
11747 : : {
11748 : 178 : if (dump_enabled_p ())
11749 : 142 : dump_printf_loc (MSG_NOTE, vect_location,
11750 : : "updating vuse to %T for load %G",
11751 : : last_seen_vuse, p);
11752 : 178 : gimple_set_vuse (p, last_seen_vuse);
11753 : 178 : update_stmt (p);
11754 : : }
11755 : :
11756 : : /* And update the LC PHIs on exits. */
11757 : 1026 : for (edge e : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
11758 : 414 : if (!dominated_by_p (CDI_DOMINATORS, e->src, dest_bb))
11759 : 218 : if (gphi *phi = get_virtual_phi (e->dest))
11760 : 422 : SET_PHI_ARG_DEF_ON_EDGE (phi, e, last_seen_vuse);
11761 : : }
11762 : :
11763 : : /* Function vect_transform_loop.
11764 : :
11765 : : The analysis phase has determined that the loop is vectorizable.
11766 : : Vectorize the loop - create vectorized stmts to replace the scalar
11767 : : stmts in the loop, and update the loop exit condition.
11768 : : Returns the scalar epilogue loop, if any. */
11769 : :
11770 : : class loop *
11771 : 56872 : vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
11772 : : {
11773 : 56872 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11774 : 56872 : class loop *epilogue = NULL;
11775 : 56872 : basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
11776 : 56872 : int nbbs = loop->num_nodes;
11777 : 56872 : int i;
11778 : 56872 : tree niters_vector = NULL_TREE;
11779 : 56872 : tree step_vector = NULL_TREE;
11780 : 56872 : tree niters_vector_mult_vf = NULL_TREE;
11781 : 56872 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11782 : 56872 : unsigned int lowest_vf = constant_lower_bound (vf);
11783 : 56872 : gimple *stmt;
11784 : 56872 : bool check_profitability = false;
11785 : 56872 : unsigned int th;
11786 : 56872 : bool flat = maybe_flat_loop_profile (loop);
11787 : :
11788 : 56872 : DUMP_VECT_SCOPE ("vec_transform_loop");
11789 : :
11790 : 56872 : loop_vinfo->shared->check_datarefs ();
11791 : :
11792 : : /* Use the more conservative vectorization threshold. If the number
11793 : : of iterations is constant assume the cost check has been performed
11794 : : by our caller. If the threshold makes all loops profitable that
11795 : : run at least the (estimated) vectorization factor number of times
11796 : : checking is pointless, too. */
11797 : 56872 : th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
11798 : 56872 : if (vect_apply_runtime_profitability_check_p (loop_vinfo))
11799 : : {
11800 : 17208 : if (dump_enabled_p ())
11801 : 158 : dump_printf_loc (MSG_NOTE, vect_location,
11802 : : "Profitability threshold is %d loop iterations.\n",
11803 : : th);
11804 : : check_profitability = true;
11805 : : }
11806 : :
11807 : : /* Make sure there exists a single-predecessor exit bb. Do this before
11808 : : versioning. */
11809 : 56872 : edge e = LOOP_VINFO_IV_EXIT (loop_vinfo);
11810 : 56872 : if (! single_pred_p (e->dest) && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11811 : : {
11812 : 11409 : split_loop_exit_edge (e, true);
11813 : 11409 : if (dump_enabled_p ())
11814 : 1876 : dump_printf (MSG_NOTE, "split exit edge\n");
11815 : : }
11816 : :
11817 : : /* Version the loop first, if required, so the profitability check
11818 : : comes first. */
11819 : :
11820 : 56872 : if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
11821 : : {
11822 : 3608 : class loop *sloop
11823 : 3608 : = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
11824 : 3608 : sloop->force_vectorize = false;
11825 : 3608 : check_profitability = false;
11826 : : }
11827 : :
11828 : : /* Make sure there exists a single-predecessor exit bb also on the
11829 : : scalar loop copy. Do this after versioning but before peeling
11830 : : so CFG structure is fine for both scalar and if-converted loop
11831 : : to make slpeel_duplicate_current_defs_from_edges face matched
11832 : : loop closed PHI nodes on the exit. */
11833 : 56872 : if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
11834 : : {
11835 : 5902 : e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
11836 : 5902 : if (! single_pred_p (e->dest))
11837 : : {
11838 : 5681 : split_loop_exit_edge (e, true);
11839 : 5681 : if (dump_enabled_p ())
11840 : 1051 : dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
11841 : : }
11842 : : }
11843 : :
11844 : 56872 : tree niters = vect_build_loop_niters (loop_vinfo);
11845 : 56872 : LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
11846 : 56872 : tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
11847 : 56872 : bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
11848 : 56872 : tree advance;
11849 : 56872 : drs_init_vec orig_drs_init;
11850 : :
11851 : 56872 : epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
11852 : : &step_vector, &niters_vector_mult_vf, th,
11853 : : check_profitability, niters_no_overflow,
11854 : : &advance);
11855 : 56872 : if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
11856 : 56872 : && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
11857 : : {
11858 : : /* Ifcvt duplicates loop preheader, loop body and produces a basic
11859 : : block after loop exit. We need to scale all that. */
11860 : 85 : basic_block preheader
11861 : 85 : = loop_preheader_edge (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->src;
11862 : 85 : preheader->count
11863 : : = preheader->count.apply_probability
11864 : 85 : (LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11865 : 85 : scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
11866 : : LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11867 : 85 : LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo)->dest->count = preheader->count;
11868 : : }
11869 : :
11870 : 56872 : if (niters_vector == NULL_TREE)
11871 : : {
11872 : 24725 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
11873 : 24725 : && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11874 : 50153 : && known_eq (lowest_vf, vf))
11875 : : {
11876 : 24722 : niters_vector
11877 : 24722 : = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
11878 : 24722 : LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
11879 : 24722 : step_vector = build_one_cst (TREE_TYPE (niters));
11880 : : }
11881 : 709 : else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11882 : 1 : vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
11883 : : &step_vector, niters_no_overflow);
11884 : : else
11885 : : /* vect_do_peeling subtracted the number of peeled prologue
11886 : : iterations from LOOP_VINFO_NITERS. */
11887 : 708 : vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
11888 : : &niters_vector, &step_vector,
11889 : : niters_no_overflow);
11890 : : }
11891 : :
11892 : : /* 1) Make sure the loop header has exactly two entries
11893 : : 2) Make sure we have a preheader basic block. */
11894 : :
11895 : 56872 : gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
11896 : :
11897 : 56872 : split_edge (loop_preheader_edge (loop));
11898 : :
11899 : 56872 : if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11900 : : /* This will deal with any possible peeling. */
11901 : 1 : vect_prepare_for_masked_peels (loop_vinfo);
11902 : :
11903 : : /* Handle any code motion that we need to for early-break vectorization after
11904 : : we've done peeling but just before we start vectorizing. */
11905 : 56872 : if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11906 : 1411 : move_early_exit_stmts (loop_vinfo);
11907 : :
11908 : : /* Remove existing clobber stmts and prefetches. */
11909 : 173514 : for (i = 0; i < nbbs; i++)
11910 : : {
11911 : 116642 : basic_block bb = bbs[i];
11912 : 1010800 : for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);)
11913 : : {
11914 : 777516 : stmt = gsi_stmt (si);
11915 : 777516 : if (gimple_clobber_p (stmt)
11916 : 777516 : || gimple_call_builtin_p (stmt, BUILT_IN_PREFETCH))
11917 : : {
11918 : 211 : unlink_stmt_vdef (stmt);
11919 : 211 : gsi_remove (&si, true);
11920 : 211 : release_defs (stmt);
11921 : : }
11922 : : else
11923 : 777305 : gsi_next (&si);
11924 : : }
11925 : : }
11926 : :
11927 : : /* Schedule the SLP instances. */
11928 : 56872 : if (!loop_vinfo->slp_instances.is_empty ())
11929 : : {
11930 : 56872 : DUMP_VECT_SCOPE ("scheduling SLP instances");
11931 : 56872 : vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
11932 : : }
11933 : :
11934 : : /* Generate the loop invariant statements. */
11935 : 56872 : if (!gimple_seq_empty_p (LOOP_VINFO_INV_PATTERN_DEF_SEQ (loop_vinfo)))
11936 : : {
11937 : 74 : if (dump_enabled_p ())
11938 : 30 : dump_printf_loc (MSG_NOTE, vect_location,
11939 : : "------>generating loop invariant statements\n");
11940 : 74 : gimple_stmt_iterator gsi;
11941 : 74 : gsi = gsi_after_labels (loop_preheader_edge (loop)->src);
11942 : 74 : gsi_insert_seq_before (&gsi, LOOP_VINFO_INV_PATTERN_DEF_SEQ (loop_vinfo),
11943 : : GSI_CONTINUE_LINKING);
11944 : : }
11945 : :
11946 : : /* Stub out scalar statements that must not survive vectorization and
11947 : : were not picked as relevant in any SLP instance.
11948 : : Doing this here helps with grouped statements, or statements that
11949 : : are involved in patterns. */
11950 : 173514 : for (i = 0; i < nbbs; i++)
11951 : : {
11952 : 116642 : basic_block bb = bbs[i];
11953 : 116642 : stmt_vec_info stmt_info;
11954 : 233284 : for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
11955 : 1517103 : !gsi_end_p (gsi); gsi_next (&gsi))
11956 : : {
11957 : 1400461 : gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
11958 : 5195 : if (!call || !gimple_call_internal_p (call))
11959 : 1396403 : continue;
11960 : 4058 : internal_fn ifn = gimple_call_internal_fn (call);
11961 : 4058 : if (ifn == IFN_MASK_LOAD)
11962 : : {
11963 : 539 : tree lhs = gimple_get_lhs (call);
11964 : 539 : if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11965 : : {
11966 : 0 : tree zero = build_zero_cst (TREE_TYPE (lhs));
11967 : 0 : gimple *new_stmt = gimple_build_assign (lhs, zero);
11968 : 0 : gsi_replace (&gsi, new_stmt, true);
11969 : : }
11970 : : }
11971 : 3519 : else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
11972 : : {
11973 : 1578 : tree lhs = gimple_get_lhs (call);
11974 : 1578 : if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11975 : : {
11976 : 0 : tree else_arg
11977 : 0 : = gimple_call_arg (call, gimple_call_num_args (call) - 1);
11978 : 0 : gimple *new_stmt = gimple_build_assign (lhs, else_arg);
11979 : 0 : gsi_replace (&gsi, new_stmt, true);
11980 : : }
11981 : : }
11982 : 1941 : else if (ifn == IFN_MASK_CALL
11983 : 4 : && (stmt_info = loop_vinfo->lookup_stmt (call))
11984 : 4 : && !STMT_VINFO_RELEVANT_P (stmt_info)
11985 : 1945 : && !STMT_VINFO_LIVE_P (stmt_info))
11986 : : {
11987 : 4 : gcc_assert (!gimple_call_lhs (stmt_info->stmt));
11988 : 4 : loop_vinfo->remove_stmt (stmt_info);
11989 : : }
11990 : : }
11991 : : }
11992 : :
11993 : : /* The vectorization factor is always > 1, so if we use an IV increment of 1,
11994 : : a zero NITERS becomes a nonzero NITERS_VECTOR. */
11995 : 56872 : if (integer_onep (step_vector))
11996 : 56862 : niters_no_overflow = true;
11997 : 56872 : vect_set_loop_condition (loop, LOOP_VINFO_IV_EXIT (loop_vinfo), loop_vinfo,
11998 : : niters_vector, step_vector, niters_vector_mult_vf,
11999 : 56872 : !niters_no_overflow);
12000 : :
12001 : 56872 : unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
12002 : :
12003 : : /* True if the final iteration might not handle a full vector's
12004 : : worth of scalar iterations. */
12005 : 113744 : bool final_iter_may_be_partial
12006 : 56872 : = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
12007 : 56872 : || LOOP_VINFO_EARLY_BREAKS (loop_vinfo);
12008 : :
12009 : : /* +1 to convert latch counts to loop iteration counts. */
12010 : 56872 : int bias_for_lowest = 1;
12011 : :
12012 : : /* When we are peeling for gaps we take away one scalar iteration
12013 : : from the vector loop. Thus we can adjust the upper bound by one
12014 : : scalar iteration. But only when we know the bound applies to the
12015 : : IV exit test which might not be true when we have multiple exits. */
12016 : 56872 : if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
12017 : 110587 : bias_for_lowest -= LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
12018 : :
12019 : 56872 : int bias_for_assumed = bias_for_lowest;
12020 : 56872 : int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
12021 : 56872 : if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
12022 : : {
12023 : : /* When the amount of peeling is known at compile time, the first
12024 : : iteration will have exactly alignment_npeels active elements.
12025 : : In the worst case it will have at least one. */
12026 : 1 : int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
12027 : 1 : bias_for_lowest += lowest_vf - min_first_active;
12028 : 1 : bias_for_assumed += assumed_vf - min_first_active;
12029 : : }
12030 : : /* In these calculations the "- 1" converts loop iteration counts
12031 : : back to latch counts. */
12032 : 56872 : if (loop->any_upper_bound)
12033 : : {
12034 : 56872 : loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
12035 : 56872 : loop->nb_iterations_upper_bound
12036 : 56872 : = (final_iter_may_be_partial
12037 : 58293 : ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
12038 : 2842 : lowest_vf) - 1
12039 : 55451 : : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
12040 : 110902 : lowest_vf) - 1);
12041 : 56872 : if (main_vinfo
12042 : : /* Both peeling for alignment and peeling for gaps can end up
12043 : : with the scalar epilogue running for more than VF-1 iterations. */
12044 : 6517 : && !main_vinfo->peeling_for_alignment
12045 : 6469 : && !main_vinfo->peeling_for_gaps)
12046 : : {
12047 : 6313 : unsigned int bound;
12048 : 6313 : poly_uint64 main_iters
12049 : 6313 : = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
12050 : : LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
12051 : 6313 : main_iters
12052 : 6313 : = upper_bound (main_iters,
12053 : 6313 : LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
12054 : 12626 : if (can_div_away_from_zero_p (main_iters,
12055 : 6313 : LOOP_VINFO_VECT_FACTOR (loop_vinfo),
12056 : : &bound))
12057 : 6313 : loop->nb_iterations_upper_bound
12058 : 6313 : = wi::umin ((bound_wide_int) (bound - 1),
12059 : 6313 : loop->nb_iterations_upper_bound);
12060 : : }
12061 : : }
12062 : 56872 : if (loop->any_likely_upper_bound)
12063 : 56872 : loop->nb_iterations_likely_upper_bound
12064 : 56872 : = (final_iter_may_be_partial
12065 : 58293 : ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
12066 : 1421 : + bias_for_lowest, lowest_vf) - 1
12067 : 55451 : : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
12068 : 56872 : + bias_for_lowest, lowest_vf) - 1);
12069 : 56872 : if (loop->any_estimate)
12070 : 32587 : loop->nb_iterations_estimate
12071 : 32587 : = (final_iter_may_be_partial
12072 : 33373 : ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
12073 : 1572 : assumed_vf) - 1
12074 : 31801 : : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
12075 : 64388 : assumed_vf) - 1);
12076 : 56872 : scale_profile_for_vect_loop (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
12077 : : assumed_vf, flat);
12078 : :
12079 : 56872 : if (dump_enabled_p ())
12080 : : {
12081 : 10032 : if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
12082 : : {
12083 : 8753 : dump_printf_loc (MSG_NOTE, vect_location,
12084 : : "LOOP VECTORIZED\n");
12085 : 8753 : if (loop->inner)
12086 : 266 : dump_printf_loc (MSG_NOTE, vect_location,
12087 : : "OUTER LOOP VECTORIZED\n");
12088 : 8753 : dump_printf (MSG_NOTE, "\n");
12089 : : }
12090 : : else
12091 : 1279 : dump_printf_loc (MSG_NOTE, vect_location,
12092 : : "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
12093 : 1279 : GET_MODE_NAME (loop_vinfo->vector_mode));
12094 : : }
12095 : :
12096 : : /* Loops vectorized with a variable factor won't benefit from
12097 : : unrolling/peeling. */
12098 : 56872 : if (!vf.is_constant ())
12099 : : {
12100 : : loop->unroll = 1;
12101 : : if (dump_enabled_p ())
12102 : : dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
12103 : : " variable-length vectorization factor\n");
12104 : : }
12105 : :
12106 : : /* When we have unrolled the loop due to a user-requested value we should
12107 : : leave it up to the RTL unroll heuristics to determine if it is still
12108 : : worthwhile to unroll more. */
12109 : 56872 : if (LOOP_VINFO_USER_UNROLL (loop_vinfo))
12110 : 22 : loop->unroll = 0;
12111 : :
12112 : : /* Free SLP instances here because otherwise stmt reference counting
12113 : : won't work. */
12114 : : slp_instance instance;
12115 : 143863 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
12116 : 86991 : vect_free_slp_instance (instance);
12117 : 56872 : LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
12118 : : /* Clear the safelen field since its value is invalid after vectorization:
12119 : : the vectorized loop can have loop-carried dependencies. */
12120 : 56872 : loop->safelen = 0;
12121 : :
12122 : 56872 : if (epilogue)
12123 : : {
12124 : : /* Accumulate the advancements made so far. */
12125 : 6517 : if (LOOP_VINFO_DRS_ADVANCED_BY (loop_vinfo))
12126 : 77 : advance = fold_build2 (PLUS_EXPR, TREE_TYPE (advance),
12127 : : LOOP_VINFO_DRS_ADVANCED_BY (loop_vinfo),
12128 : : advance);
12129 : 6517 : update_epilogue_loop_vinfo (epilogue, advance);
12130 : :
12131 : 6517 : epilogue->simduid = loop->simduid;
12132 : 6517 : epilogue->force_vectorize = loop->force_vectorize;
12133 : 6517 : epilogue->dont_vectorize = false;
12134 : : }
12135 : :
12136 : 56872 : return epilogue;
12137 : 56872 : }
12138 : :
12139 : : /* The code below performs a simple optimization: it reverts
12140 : : if-conversion for masked stores, i.e. if the mask of a store is all
12141 : : zeros, skip the store and, if possible, the stored value producers too.
12142 : : For example,
12143 : : for (i=0; i<n; i++)
12144 : : if (c[i])
12145 : : {
12146 : : p1[i] += 1;
12147 : : p2[i] = p3[i] +2;
12148 : : }
12149 : : this transformation will produce the following semi-hammock:
12150 : :
12151 : : if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
12152 : : {
12153 : : vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
12154 : : vect__12.22_172 = vect__11.19_170 + vect_cst__171;
12155 : : MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
12156 : : vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
12157 : : vect__19.28_184 = vect__18.25_182 + vect_cst__183;
12158 : : MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
12159 : : }
12160 : : */
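      : : /* Schematically (block names purely illustrative), each masked store
      : : ends up guarded as:
      : :
      : : bb: if (mask == { 0, ... }) goto join_bb; else goto store_bb;
      : : store_bb: the MASK_LOADs, arithmetic and MASK_STOREs that use MASK
      : : join_bb: a virtual PHI merging the .MEM names of the two paths
      : : */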
12161 : :
12162 : : void
12163 : 444 : optimize_mask_stores (class loop *loop)
12164 : : {
12165 : 444 : basic_block *bbs = get_loop_body (loop);
12166 : 444 : unsigned nbbs = loop->num_nodes;
12167 : 444 : unsigned i;
12168 : 444 : basic_block bb;
12169 : 444 : class loop *bb_loop;
12170 : 444 : gimple_stmt_iterator gsi;
12171 : 444 : gimple *stmt;
12172 : 444 : auto_vec<gimple *> worklist;
12173 : 444 : auto_purge_vect_location sentinel;
12174 : :
12175 : 444 : vect_location = find_loop_location (loop);
12176 : : /* Collect all masked stores in the loop, if any. */
12177 : 1776 : for (i = 0; i < nbbs; i++)
12178 : : {
12179 : 888 : bb = bbs[i];
12180 : 14342 : for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
12181 : 12566 : gsi_next (&gsi))
12182 : : {
12183 : 12566 : stmt = gsi_stmt (gsi);
12184 : 12566 : if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
12185 : 468 : worklist.safe_push (stmt);
12186 : : }
12187 : : }
12188 : :
12189 : 444 : free (bbs);
12190 : 444 : if (worklist.is_empty ())
12191 : 68 : return;
12192 : :
12193 : : /* Loop has masked stores. */
12194 : 827 : while (!worklist.is_empty ())
12195 : : {
12196 : 451 : gimple *last, *last_store;
12197 : 451 : edge e, efalse;
12198 : 451 : tree mask;
12199 : 451 : basic_block store_bb, join_bb;
12200 : 451 : gimple_stmt_iterator gsi_to;
12201 : 451 : tree vdef, new_vdef;
12202 : 451 : gphi *phi;
12203 : 451 : tree vectype;
12204 : 451 : tree zero;
12205 : :
12206 : 451 : last = worklist.pop ();
12207 : 451 : mask = gimple_call_arg (last, 2);
12208 : 451 : bb = gimple_bb (last);
12209 : : /* Create then_bb and an if-then structure in the CFG; then_bb belongs
12210 : : to the same loop as if_bb. That loop can be different from LOOP when
12211 : : a two-level loop nest is vectorized and the mask_store belongs to the
12212 : : inner loop. */
12213 : 451 : e = split_block (bb, last);
12214 : 451 : bb_loop = bb->loop_father;
12215 : 451 : gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
12216 : 451 : join_bb = e->dest;
12217 : 451 : store_bb = create_empty_bb (bb);
12218 : 451 : add_bb_to_loop (store_bb, bb_loop);
12219 : 451 : e->flags = EDGE_TRUE_VALUE;
12220 : 451 : efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
12221 : : /* Put STORE_BB on the likely path. */
12222 : 451 : efalse->probability = profile_probability::likely ();
12223 : 451 : e->probability = efalse->probability.invert ();
12224 : 451 : store_bb->count = efalse->count ();
12225 : 451 : make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
12226 : 451 : if (dom_info_available_p (CDI_DOMINATORS))
12227 : 451 : set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
12228 : 451 : if (dump_enabled_p ())
12229 : 165 : dump_printf_loc (MSG_NOTE, vect_location,
12230 : : "Create new block %d to sink mask stores.",
12231 : : store_bb->index);
12232 : : /* Create vector comparison with boolean result. */
12233 : 451 : vectype = TREE_TYPE (mask);
12234 : 451 : zero = build_zero_cst (vectype);
12235 : 451 : stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
12236 : 451 : gsi = gsi_last_bb (bb);
12237 : 451 : gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
12238 : : /* Create new PHI node for vdef of the last masked store:
12239 : : .MEM_2 = VDEF <.MEM_1>
12240 : : will be converted to
12241 : : .MEM_3 = VDEF <.MEM_1>
12242 : : and a new PHI node will be created in the join bb
12243 : : .MEM_2 = PHI <.MEM_1, .MEM_3>
12244 : : */
12245 : 451 : vdef = gimple_vdef (last);
12246 : 451 : new_vdef = make_ssa_name (gimple_vop (cfun), last);
12247 : 451 : gimple_set_vdef (last, new_vdef);
12248 : 451 : phi = create_phi_node (vdef, join_bb);
12249 : 451 : add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
12250 : :
12251 : : /* Sink all masked stores with the same mask into STORE_BB if possible. */
12252 : 485 : while (true)
12253 : : {
12254 : 468 : gimple_stmt_iterator gsi_from;
12255 : 468 : gimple *stmt1 = NULL;
12256 : :
12257 : : /* Move masked store to STORE_BB. */
12258 : 468 : last_store = last;
12259 : 468 : gsi = gsi_for_stmt (last);
12260 : 468 : gsi_from = gsi;
12261 : : /* Shift GSI to the previous stmt for further traversal. */
12262 : 468 : gsi_prev (&gsi);
12263 : 468 : gsi_to = gsi_start_bb (store_bb);
12264 : 468 : gsi_move_before (&gsi_from, &gsi_to);
12265 : : /* Set GSI_TO to the start of the now non-empty block. */
12266 : 468 : gsi_to = gsi_start_bb (store_bb);
12267 : 468 : if (dump_enabled_p ())
12268 : 181 : dump_printf_loc (MSG_NOTE, vect_location,
12269 : : "Move stmt to created bb\n%G", last);
12270 : : /* Move all stored value producers if possible. */
12271 : 3840 : while (!gsi_end_p (gsi))
12272 : : {
12273 : 3839 : tree lhs;
12274 : 3839 : imm_use_iterator imm_iter;
12275 : 3839 : use_operand_p use_p;
12276 : 3839 : bool res;
12277 : :
12278 : : /* Skip debug statements. */
12279 : 3839 : if (is_gimple_debug (gsi_stmt (gsi)))
12280 : : {
12281 : 1 : gsi_prev (&gsi);
12282 : 2527 : continue;
12283 : : }
12284 : 3838 : stmt1 = gsi_stmt (gsi);
12285 : : /* Do not consider statements writing to memory or having
12286 : : a volatile operand. */
12287 : 7628 : if (gimple_vdef (stmt1)
12288 : 7628 : || gimple_has_volatile_ops (stmt1))
12289 : : break;
12290 : 3790 : gsi_from = gsi;
12291 : 3790 : gsi_prev (&gsi);
12292 : 3790 : lhs = gimple_get_lhs (stmt1);
12293 : 3790 : if (!lhs)
12294 : : break;
12295 : :
12296 : : /* The LHS of a vectorized stmt must be an SSA_NAME. */
12297 : 3790 : if (TREE_CODE (lhs) != SSA_NAME)
12298 : : break;
12299 : :
12300 : 3790 : if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12301 : : {
12302 : : /* Remove dead scalar statement. */
12303 : 2694 : if (has_zero_uses (lhs))
12304 : : {
12305 : 2526 : gsi_remove (&gsi_from, true);
12306 : 2526 : release_defs (stmt1);
12307 : 2526 : continue;
12308 : : }
12309 : : }
12310 : :
12311 : : /* Check that LHS does not have uses outside of STORE_BB. */
12312 : 1264 : res = true;
12313 : 2195 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
12314 : : {
12315 : 1350 : gimple *use_stmt;
12316 : 1350 : use_stmt = USE_STMT (use_p);
12317 : 1350 : if (is_gimple_debug (use_stmt))
12318 : 0 : continue;
12319 : 1350 : if (gimple_bb (use_stmt) != store_bb)
12320 : : {
12321 : : res = false;
12322 : : break;
12323 : : }
12324 : : }
12325 : 1264 : if (!res)
12326 : : break;
12327 : :
12328 : 845 : if (gimple_vuse (stmt1)
12329 : 1273 : && gimple_vuse (stmt1) != gimple_vuse (last_store))
12330 : : break;
12331 : :
12332 : : /* Can move STMT1 to STORE_BB. */
12333 : 845 : if (dump_enabled_p ())
12334 : 385 : dump_printf_loc (MSG_NOTE, vect_location,
12335 : : "Move stmt to created bb\n%G", stmt1);
12336 : 845 : gsi_move_before (&gsi_from, &gsi_to);
12337 : : /* Shift GSI_TO for further insertion. */
12338 : 1690 : gsi_prev (&gsi_to);
12339 : : }
12340 : : /* Move other masked stores with the same mask into STORE_BB as well. */
12341 : 468 : if (worklist.is_empty ()
12342 : 92 : || gimple_call_arg (worklist.last (), 2) != mask
12343 : 17 : || worklist.last () != stmt1)
12344 : : break;
12345 : 17 : last = worklist.pop ();
12346 : 17 : }
12347 : 902 : add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
12348 : : }
12349 : 444 : }
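      : : /* This function is invoked from the pass driver (vectorize_loops in
      : : tree-vectorizer.cc) for each loop vectorized with masked stores,
      : : roughly as sketched below; the exact guard may differ in detail:
      : :
      : : if (loop_has_mask_store
      : : && targetm.vectorize.empty_mask_is_expensive (IFN_MASK_STORE))
      : : optimize_mask_stores (loop);
      : :
      : : where loop_has_mask_store stands for the flag recorded while the
      : : loop was vectorized. */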
12350 : :
12351 : : /* Decide whether it is possible to use a zero-based induction variable
12352 : : when vectorizing LOOP_VINFO with partial vectors. If it is, return
12353 : : the value that the induction variable must be able to hold in order
12354 : : to ensure that the rgroups eventually have no active vector elements.
12355 : : Return -1 otherwise. */
12356 : :
12357 : : widest_int
12358 : 46 : vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
12359 : : {
12360 : 46 : tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
12361 : 46 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
12362 : 46 : unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
12363 : :
12364 : : /* Calculate the value that the induction variable must be able
12365 : : to hit in order to ensure that we end the loop with an all-false mask.
12366 : : This involves adding the maximum number of inactive trailing scalar
12367 : : iterations. */
12368 : 46 : widest_int iv_limit = -1;
12369 : 46 : if (max_loop_iterations (loop, &iv_limit))
12370 : : {
12371 : 46 : if (niters_skip)
12372 : : {
12373 : : /* Add the maximum number of skipped iterations to the
12374 : : maximum iteration count. */
12375 : 0 : if (TREE_CODE (niters_skip) == INTEGER_CST)
12376 : 0 : iv_limit += wi::to_widest (niters_skip);
12377 : : else
12378 : 0 : iv_limit += max_vf - 1;
12379 : : }
12380 : 46 : else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
12381 : : /* Make a conservatively-correct assumption. */
12382 : 2 : iv_limit += max_vf - 1;
12383 : :
12384 : : /* IV_LIMIT is the maximum number of latch iterations, which is also
12385 : : the maximum in-range IV value. Round this value down to the previous
12386 : : vector alignment boundary and then add an extra full iteration. */
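      : : /* Purely illustrative: with a maximum latch count of 1000 and a
      : : constant VF of 16 (so max_vf == 16), this yields
      : : (1000 & -16) + 16 == 992 + 16 == 1008, which the IV must be able
      : : to represent without wrapping. */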
12387 : 46 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
12388 : 46 : iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
12389 : : }
12390 : 46 : return iv_limit;
12391 : : }
12392 : :
12393 : : /* For the given rgroup_controls RGC, check whether an induction variable
12394 : : would ever hit a value that produces a set of all-false masks or zero
12395 : : lengths before wrapping around. Return true if it's possible to wrap
12396 : : around before hitting the desired value, otherwise return false. */
12397 : :
12398 : : bool
12399 : 0 : vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
12400 : : {
12401 : 0 : widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
12402 : :
12403 : 0 : if (iv_limit == -1)
12404 : : return true;
12405 : :
12406 : 0 : tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
12407 : 0 : unsigned int compare_precision = TYPE_PRECISION (compare_type);
12408 : 0 : unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
12409 : :
12410 : 0 : if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
12411 : : return true;
12412 : :
12413 : : return false;
12414 : 0 : }
|