Branch data Line data Source code
1 : : /* Loop Vectorization
2 : : Copyright (C) 2003-2025 Free Software Foundation, Inc.
3 : : Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 : : Ira Rosen <irar@il.ibm.com>
5 : :
6 : : This file is part of GCC.
7 : :
8 : : GCC is free software; you can redistribute it and/or modify it under
9 : : the terms of the GNU General Public License as published by the Free
10 : : Software Foundation; either version 3, or (at your option) any later
11 : : version.
12 : :
13 : : GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 : : WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 : : FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 : : for more details.
17 : :
18 : : You should have received a copy of the GNU General Public License
19 : : along with GCC; see the file COPYING3. If not see
20 : : <http://www.gnu.org/licenses/>. */
21 : :
22 : : #define INCLUDE_ALGORITHM
23 : : #include "config.h"
24 : : #include "system.h"
25 : : #include "coretypes.h"
26 : : #include "backend.h"
27 : : #include "target.h"
28 : : #include "rtl.h"
29 : : #include "tree.h"
30 : : #include "gimple.h"
31 : : #include "cfghooks.h"
32 : : #include "tree-pass.h"
33 : : #include "ssa.h"
34 : : #include "optabs-tree.h"
35 : : #include "memmodel.h"
36 : : #include "optabs.h"
37 : : #include "diagnostic-core.h"
38 : : #include "fold-const.h"
39 : : #include "stor-layout.h"
40 : : #include "cfganal.h"
41 : : #include "gimplify.h"
42 : : #include "gimple-iterator.h"
43 : : #include "gimplify-me.h"
44 : : #include "tree-ssa-loop-ivopts.h"
45 : : #include "tree-ssa-loop-manip.h"
46 : : #include "tree-ssa-loop-niter.h"
47 : : #include "tree-ssa-loop.h"
48 : : #include "cfgloop.h"
49 : : #include "tree-scalar-evolution.h"
50 : : #include "tree-vectorizer.h"
51 : : #include "gimple-fold.h"
52 : : #include "cgraph.h"
53 : : #include "tree-cfg.h"
54 : : #include "tree-if-conv.h"
55 : : #include "internal-fn.h"
56 : : #include "tree-vector-builder.h"
57 : : #include "vec-perm-indices.h"
58 : : #include "tree-eh.h"
59 : : #include "case-cfn-macros.h"
60 : : #include "langhooks.h"
61 : :
62 : : /* Loop Vectorization Pass.
63 : :
64 : : This pass tries to vectorize loops.
65 : :
66 : : For example, the vectorizer transforms the following simple loop:
67 : :
68 : : short a[N]; short b[N]; short c[N]; int i;
69 : :
70 : : for (i=0; i<N; i++){
71 : : a[i] = b[i] + c[i];
72 : : }
73 : :
74 : : as if it had been manually vectorized by rewriting the source code into:
75 : :
76 : : typedef int __attribute__((mode(V8HI))) v8hi;
77 : : short a[N]; short b[N]; short c[N]; int i;
78 : : v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
79 : : v8hi va, vb, vc;
80 : :
81 : : for (i=0; i<N/8; i++){
82 : : vb = pb[i];
83 : : vc = pc[i];
84 : : va = vb + vc;
85 : : pa[i] = va;
86 : : }
87 : :
88 : : The main entry to this pass is vectorize_loops(), in which
89 : : the vectorizer applies a set of analyses on a given set of loops,
90 : : followed by the actual vectorization transformation for the loops that
91 : : had successfully passed the analysis phase.
92 : : Throughout this pass we make a distinction between two types of
93 : : data: scalars (which are represented by SSA_NAMES), and memory references
94 : : ("data-refs"). These two types of data require different handling both
95 : : during analysis and transformation. The types of data-refs that the
96 : : vectorizer currently supports are ARRAY_REFS whose base is an array DECL
97 : : (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
98 : : accesses are required to have a simple (consecutive) access pattern.
99 : :
100 : : Analysis phase:
101 : : ===============
102 : : The driver for the analysis phase is vect_analyze_loop().
103 : : It applies a set of analyses, some of which rely on the scalar evolution
104 : : analyzer (scev) developed by Sebastian Pop.
105 : :
106 : : During the analysis phase the vectorizer records some information
107 : : per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
108 : : loop, as well as general information about the loop as a whole, which is
109 : : recorded in a "loop_vec_info" struct attached to each loop.
110 : :
111 : : Transformation phase:
112 : : =====================
113 : : The loop transformation phase scans all the stmts in the loop, and
114 : : creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
115 : : the loop that needs to be vectorized. It inserts the vector code sequence
116 : : just before the scalar stmt S, and records a pointer to the vector code
117 : : in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
118 : : attached to S). This pointer will be used for the vectorization of following
119 : : stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
120 : : otherwise, we rely on dead code elimination for removing it.
121 : :
122 : : For example, say stmt S1 was vectorized into stmt VS1:
123 : :
124 : : VS1: vb = px[i];
125 : : S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
126 : : S2: a = b;
127 : :
128 : : To vectorize stmt S2, the vectorizer first finds the stmt that defines
129 : : the operand 'b' (S1), and gets the relevant vector def 'vb' from the
130 : : vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
131 : : resulting sequence would be:
132 : :
133 : : VS1: vb = px[i];
134 : : S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
135 : : VS2: va = vb;
136 : : S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
137 : :
138 : : Operands that are not SSA_NAMEs are data-refs that appear in
139 : : load/store operations (like 'x[i]' in S1), and are handled differently.
140 : :
141 : : Target modeling:
142 : : =================
143 : : Currently the only target-specific information that is used is the
144 : : size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
145 : : Targets that can support different sizes of vectors will, for now, need
146 : : to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
147 : : flexibility will be added in the future.
148 : :
149 : : Since we only vectorize operations whose vector form can be
150 : : expressed using existing tree codes, to verify that an operation is
151 : : supported, the vectorizer checks the relevant optab at the relevant
152 : : machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
153 : : the value found is CODE_FOR_nothing, then there's no target support, and
154 : : we can't vectorize the stmt.
155 : :
156 : : For additional information on this project see:
157 : : http://gcc.gnu.org/projects/tree-ssa/vectorization.html
158 : : */
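/* Editorial sketch (not part of the original file): the "Target modeling"
   paragraph above describes the support check in terms of optabs.  A
   minimal illustration of that query, using the add_optab/V8HImode example
   from the comment, would be:

     if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
       return false;   // no target instruction, so the stmt can't be vectorized

   In practice the vectorizer performs this kind of check per statement,
   using the vector type it computed for that statement rather than a
   hard-coded mode.  */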
159 : :
160 : : static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
161 : : unsigned *);
162 : : static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
163 : : bool *, bool *, bool);
164 : :
165 : : /* Subroutine of vect_determine_vf_for_stmt that handles only one
166 : : statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
167 : : may already be set for general statements (not just data refs). */
168 : :
169 : : static opt_result
170 : 3021948 : vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
171 : : bool vectype_maybe_set_p,
172 : : poly_uint64 *vf)
173 : : {
174 : 3021948 : gimple *stmt = stmt_info->stmt;
175 : :
176 : 3021948 : if ((!STMT_VINFO_RELEVANT_P (stmt_info)
177 : 1459050 : && !STMT_VINFO_LIVE_P (stmt_info))
178 : 3022031 : || gimple_clobber_p (stmt))
179 : : {
180 : 1458967 : if (dump_enabled_p ())
181 : 118132 : dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
182 : 1458967 : return opt_result::success ();
183 : : }
184 : :
185 : 1562981 : tree stmt_vectype, nunits_vectype;
186 : 1562981 : opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
187 : : &stmt_vectype,
188 : : &nunits_vectype);
189 : 1562981 : if (!res)
190 : 2116 : return res;
191 : :
192 : 1560865 : if (stmt_vectype)
193 : : {
194 : 1560839 : if (STMT_VINFO_VECTYPE (stmt_info))
195 : : /* The only case when a vectype has already been set is for stmts
196 : : that contain a data ref, or for "pattern-stmts" (stmts generated
197 : : by the vectorizer to represent/replace a certain idiom). */
198 : 924114 : gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
199 : : || vectype_maybe_set_p)
200 : : && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
201 : : else
202 : 636725 : STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
203 : : }
204 : :
205 : 1560865 : if (nunits_vectype)
206 : 1560839 : vect_update_max_nunits (vf, nunits_vectype);
207 : :
208 : 1560865 : return opt_result::success ();
209 : : }
210 : :
211 : : /* Subroutine of vect_determine_vectorization_factor. Set the vector
212 : : types of STMT_INFO and all attached pattern statements and update
213 : : the vectorization factor VF accordingly. Return true on success
214 : : or false if something prevented vectorization. */
215 : :
216 : : static opt_result
217 : 2477980 : vect_determine_vf_for_stmt (vec_info *vinfo,
218 : : stmt_vec_info stmt_info, poly_uint64 *vf)
219 : : {
220 : 2477980 : if (dump_enabled_p ())
221 : 210729 : dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
222 : : stmt_info->stmt);
223 : 2477980 : opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
224 : 2477980 : if (!res)
225 : 2116 : return res;
226 : :
227 : 2475864 : if (STMT_VINFO_IN_PATTERN_P (stmt_info)
228 : 263711 : && STMT_VINFO_RELATED_STMT (stmt_info))
229 : : {
230 : 263711 : gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
231 : 263711 : stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
232 : :
233 : : /* If a pattern statement has def stmts, analyze them too. */
234 : 263711 : for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
235 : 543968 : !gsi_end_p (si); gsi_next (&si))
236 : : {
237 : 280257 : stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
238 : 280257 : if (dump_enabled_p ())
239 : 20345 : dump_printf_loc (MSG_NOTE, vect_location,
240 : : "==> examining pattern def stmt: %G",
241 : : def_stmt_info->stmt);
242 : 280257 : res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
243 : 280257 : if (!res)
244 : 0 : return res;
245 : : }
246 : :
247 : 263711 : if (dump_enabled_p ())
248 : 16250 : dump_printf_loc (MSG_NOTE, vect_location,
249 : : "==> examining pattern statement: %G",
250 : : stmt_info->stmt);
251 : 263711 : res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
252 : 263711 : if (!res)
253 : 0 : return res;
254 : : }
255 : :
256 : 2475864 : return opt_result::success ();
257 : : }
258 : :
259 : : /* Function vect_determine_vectorization_factor
260 : :
261 : : Determine the vectorization factor (VF). VF is the number of data elements
262 : : that are operated upon in parallel in a single iteration of the vectorized
263 : : loop. For example, when vectorizing a loop that operates on 4-byte elements,
264 : : on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
265 : : elements can fit in a single vector register.
266 : :
267 : : We currently support vectorization of loops in which all types operated upon
268 : : are of the same size. Therefore this function currently sets VF according to
269 : : the size of the types operated upon, and fails if there are multiple sizes
270 : : in the loop.
271 : :
272 : : VF is also the factor by which the loop iterations are strip-mined, e.g.:
273 : : original loop:
274 : : for (i=0; i<N; i++){
275 : : a[i] = b[i] + c[i];
276 : : }
277 : :
278 : : vectorized loop:
279 : : for (i=0; i<N; i+=VF){
280 : : a[i:VF] = b[i:VF] + c[i:VF];
281 : : }
282 : : */
283 : :
284 : : static opt_result
285 : 260910 : vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
286 : : {
287 : 260910 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
288 : 260910 : basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
289 : 260910 : unsigned nbbs = loop->num_nodes;
290 : 260910 : poly_uint64 vectorization_factor = 1;
291 : 260910 : tree scalar_type = NULL_TREE;
292 : 260910 : gphi *phi;
293 : 260910 : tree vectype;
294 : 260910 : stmt_vec_info stmt_info;
295 : 260910 : unsigned i;
296 : :
297 : 260910 : DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
298 : :
299 : 862052 : for (i = 0; i < nbbs; i++)
300 : : {
301 : 618005 : basic_block bb = bbs[i];
302 : :
303 : 1255413 : for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
304 : 637408 : gsi_next (&si))
305 : : {
306 : 652155 : phi = si.phi ();
307 : 652155 : stmt_info = loop_vinfo->lookup_stmt (phi);
308 : 652155 : if (dump_enabled_p ())
309 : 52089 : dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
310 : : (gimple *) phi);
311 : :
312 : 652155 : gcc_assert (stmt_info);
313 : :
314 : 652155 : if (STMT_VINFO_RELEVANT_P (stmt_info)
315 : 377845 : || STMT_VINFO_LIVE_P (stmt_info))
316 : : {
317 : 274310 : gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
318 : 274310 : scalar_type = TREE_TYPE (PHI_RESULT (phi));
319 : :
320 : 274310 : if (dump_enabled_p ())
321 : 10700 : dump_printf_loc (MSG_NOTE, vect_location,
322 : : "get vectype for scalar type: %T\n",
323 : : scalar_type);
324 : :
325 : 274310 : vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
326 : 274310 : if (!vectype)
327 : 14747 : return opt_result::failure_at (phi,
328 : : "not vectorized: unsupported "
329 : : "data-type %T\n",
330 : : scalar_type);
331 : 259563 : STMT_VINFO_VECTYPE (stmt_info) = vectype;
332 : :
333 : 259563 : if (dump_enabled_p ())
334 : 10636 : dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
335 : : vectype);
336 : :
337 : 259563 : if (dump_enabled_p ())
338 : : {
339 : 10636 : dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
340 : 10636 : dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
341 : 10636 : dump_printf (MSG_NOTE, "\n");
342 : : }
343 : :
344 : 259563 : vect_update_max_nunits (&vectorization_factor, vectype);
345 : : }
346 : : }
347 : :
348 : 4645417 : for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
349 : 3438901 : gsi_next (&si))
350 : : {
351 : 3441017 : if (is_gimple_debug (gsi_stmt (si)))
352 : 963037 : continue;
353 : 2477980 : stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
354 : 2477980 : opt_result res
355 : 2477980 : = vect_determine_vf_for_stmt (loop_vinfo,
356 : : stmt_info, &vectorization_factor);
357 : 2477980 : if (!res)
358 : 2116 : return res;
359 : : }
360 : : }
361 : :
362 : : /* TODO: Analyze cost. Decide if worth while to vectorize. */
363 : 244047 : if (dump_enabled_p ())
364 : : {
365 : 16984 : dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
366 : 16984 : dump_dec (MSG_NOTE, vectorization_factor);
367 : 16984 : dump_printf (MSG_NOTE, "\n");
368 : : }
369 : :
370 : 244047 : if (known_le (vectorization_factor, 1U))
371 : 31706 : return opt_result::failure_at (vect_location,
372 : : "not vectorized: unsupported data-type\n");
373 : 212341 : LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
374 : 212341 : return opt_result::success ();
375 : : }
376 : :
377 : :
378 : : /* Function vect_is_simple_iv_evolution.
379 : :
380 : : FORNOW: A simple evolution of an induction variable in the loop is
381 : : considered a polynomial evolution. */
382 : :
383 : : static bool
384 : 616225 : vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
385 : : tree * step)
386 : : {
387 : 616225 : tree init_expr;
388 : 616225 : tree step_expr;
389 : 616225 : tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
390 : 616225 : basic_block bb;
391 : :
392 : : /* When there is no evolution in this loop, the evolution function
393 : : is not "simple". */
394 : 616225 : if (evolution_part == NULL_TREE)
395 : : return false;
396 : :
397 : : /* When the evolution is a polynomial of degree >= 2
398 : : the evolution function is not "simple". */
399 : 663887 : if (tree_is_chrec (evolution_part))
400 : : return false;
401 : :
402 : 569923 : step_expr = evolution_part;
403 : 569923 : init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
404 : :
405 : 569923 : if (dump_enabled_p ())
406 : 38914 : dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
407 : : step_expr, init_expr);
408 : :
409 : 569923 : *init = init_expr;
410 : 569923 : *step = step_expr;
411 : :
412 : 569923 : if (TREE_CODE (step_expr) != INTEGER_CST
413 : 53988 : && (TREE_CODE (step_expr) != SSA_NAME
414 : 47043 : || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
415 : 46811 : && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
416 : 6353 : || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
417 : 117 : && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
418 : 117 : || !flag_associative_math)))
419 : 617645 : && (TREE_CODE (step_expr) != REAL_CST
420 : 510 : || !flag_associative_math))
421 : : {
422 : 47662 : if (dump_enabled_p ())
423 : 3042 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
424 : : "step unknown.\n");
425 : 47662 : return false;
426 : : }
427 : :
428 : : return true;
429 : : }
430 : :
431 : : /* Function vect_is_nonlinear_iv_evolution
432 : :
433 : : Only support nonlinear induction for an integer type, in one of these forms:
434 : : 1. neg
435 : : 2. mul by constant
436 : : 3. lshift/rshift by constant.
437 : :
438 : : For neg induction, return a fake step as integer -1. */
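/* Illustration only (not from the original sources): C loops whose
   inductions match the three supported nonlinear forms listed above,
   with x as the induction variable:

     for (i = 0; i < n; i++) { a[i] = x; x = -x; }     // 1. neg
     for (i = 0; i < n; i++) { a[i] = x; x *= 3; }     // 2. mul by constant
     for (i = 0; i < n; i++) { a[i] = x; x >>= 1; }    // 3. rshift by constant

   In each case the next value of x is not obtained by adding a loop-invariant
   step, so vect_is_simple_iv_evolution cannot handle it and this function is
   tried instead.  */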
439 : : static bool
440 : 91799 : vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
441 : : gphi* loop_phi_node, tree *init, tree *step)
442 : : {
443 : 91799 : tree init_expr, ev_expr, result, op1, op2;
444 : 91799 : gimple* def;
445 : :
446 : 91799 : if (gimple_phi_num_args (loop_phi_node) != 2)
447 : : return false;
448 : :
449 : 91799 : init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
450 : 91799 : ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
451 : :
452 : : /* Support nonlinear induction only for integer type. */
453 : 91799 : if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
454 : : return false;
455 : :
456 : 66043 : *init = init_expr;
457 : 66043 : result = PHI_RESULT (loop_phi_node);
458 : :
459 : 66043 : if (TREE_CODE (ev_expr) != SSA_NAME
460 : 63974 : || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
461 : 66043 : || !is_gimple_assign (def))
462 : : return false;
463 : :
464 : 59577 : enum tree_code t_code = gimple_assign_rhs_code (def);
465 : 59577 : switch (t_code)
466 : : {
467 : 1529 : case NEGATE_EXPR:
468 : 1529 : if (gimple_assign_rhs1 (def) != result)
469 : : return false;
470 : 1529 : *step = build_int_cst (TREE_TYPE (init_expr), -1);
471 : 1529 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
472 : 1529 : break;
473 : :
474 : 8213 : case RSHIFT_EXPR:
475 : 8213 : case LSHIFT_EXPR:
476 : 8213 : case MULT_EXPR:
477 : 8213 : op1 = gimple_assign_rhs1 (def);
478 : 8213 : op2 = gimple_assign_rhs2 (def);
479 : 8213 : if (TREE_CODE (op2) != INTEGER_CST
480 : 4802 : || op1 != result)
481 : : return false;
482 : 4688 : *step = op2;
483 : 4688 : if (t_code == LSHIFT_EXPR)
484 : 188 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
485 : 4500 : else if (t_code == RSHIFT_EXPR)
486 : 3831 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
487 : : /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
488 : : else
489 : 669 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
490 : : break;
491 : :
492 : : default:
493 : : return false;
494 : : }
495 : :
496 : 6217 : STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
497 : 6217 : STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
498 : :
499 : 6217 : return true;
500 : : }
501 : :
502 : : /* Return true if PHI, described by STMT_INFO, is the inner PHI in
503 : : what we are assuming is a double reduction. For example, given
504 : : a structure like this:
505 : :
506 : : outer1:
507 : : x_1 = PHI <x_4(outer2), ...>;
508 : : ...
509 : :
510 : : inner:
511 : : x_2 = PHI <x_1(outer1), ...>;
512 : : ...
513 : : x_3 = ...;
514 : : ...
515 : :
516 : : outer2:
517 : : x_4 = PHI <x_3(inner)>;
518 : : ...
519 : :
520 : : outer loop analysis would treat x_1 as a double reduction phi and
521 : : this function would then return true for x_2. */
522 : :
523 : : static bool
524 : 616944 : vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
525 : : {
526 : 616944 : use_operand_p use_p;
527 : 616944 : ssa_op_iter op_iter;
528 : 1850087 : FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
529 : 1233862 : if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
530 : 616277 : if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
531 : : return true;
532 : : return false;
533 : : }
534 : :
535 : : /* Returns true if Phi is a first-order recurrence. A first-order
536 : : recurrence is a non-reduction recurrence relation in which the value of
537 : : the recurrence in the current loop iteration equals a value defined in
538 : : the previous iteration. */
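/* Illustration only (not from the original sources): a typical first-order
   recurrence in C, where t carries the value of a[i] from the previous
   iteration:

     for (i = 0; i < n; i++) { b[i] = a[i] + t; t = a[i]; }

   The loop-header PHI for t is neither an induction nor a reduction: its
   value is simply the previous iteration's definition, which is what this
   predicate detects.  */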
539 : :
540 : : static bool
541 : 20805 : vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
542 : : gphi *phi)
543 : : {
544 : : /* A nested cycle isn't vectorizable as first order recurrence. */
545 : 20805 : if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
546 : : return false;
547 : :
548 : : /* Ensure the loop latch definition is from within the loop. */
549 : 20673 : edge latch = loop_latch_edge (loop);
550 : 20673 : tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
551 : 20673 : if (TREE_CODE (ldef) != SSA_NAME
552 : 18329 : || SSA_NAME_IS_DEFAULT_DEF (ldef)
553 : 18301 : || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
554 : 37822 : || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
555 : 3834 : return false;
556 : :
557 : 16839 : tree def = gimple_phi_result (phi);
558 : :
559 : : /* Ensure every use_stmt of the phi node is dominated by the latch
560 : : definition. */
561 : 16839 : imm_use_iterator imm_iter;
562 : 16839 : use_operand_p use_p;
563 : 18928 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
564 : 18592 : if (!is_gimple_debug (USE_STMT (use_p))
565 : 36178 : && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
566 : 10614 : || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
567 : : USE_STMT (use_p))))
568 : 16503 : return false;
569 : :
570 : : /* First-order recurrence autovectorization needs shuffle vector. */
571 : 336 : tree scalar_type = TREE_TYPE (def);
572 : 336 : tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
573 : 336 : if (!vectype)
574 : : return false;
575 : :
576 : : return true;
577 : : }
578 : :
579 : : /* Function vect_analyze_scalar_cycles_1.
580 : :
581 : : Examine the cross iteration def-use cycles of scalar variables
582 : : in LOOP. LOOP_VINFO represents the loop that is now being
583 : : considered for vectorization (can be LOOP, or an outer-loop
584 : : enclosing LOOP). SLP indicates whether there will be subsequent
585 : : SLP analyses. */
586 : :
587 : : static void
588 : 303875 : vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
589 : : bool slp)
590 : : {
591 : 303875 : basic_block bb = loop->header;
592 : 303875 : tree init, step;
593 : 303875 : auto_vec<stmt_vec_info, 64> worklist;
594 : 303875 : gphi_iterator gsi;
595 : 303875 : bool double_reduc, reduc_chain;
596 : :
597 : 303875 : DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
598 : :
599 : : /* First - identify all inductions. Reduction detection assumes that all the
600 : : inductions have been identified; therefore, this order must not be
601 : : changed. */
602 : 1091837 : for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
603 : : {
604 : 787962 : gphi *phi = gsi.phi ();
605 : 787962 : tree access_fn = NULL;
606 : 787962 : tree def = PHI_RESULT (phi);
607 : 787962 : stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
608 : :
609 : 787962 : if (dump_enabled_p ())
610 : 55638 : dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
611 : : (gimple *) phi);
612 : :
613 : : /* Skip virtual phi's. The data dependences that are associated with
614 : : virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
615 : 1575924 : if (virtual_operand_p (def))
616 : 259490 : continue;
617 : :
618 : 616944 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
619 : :
620 : : /* Analyze the evolution function. */
621 : 616944 : access_fn = analyze_scalar_evolution (loop, def);
622 : 616944 : if (access_fn)
623 : : {
624 : 616944 : STRIP_NOPS (access_fn);
625 : 616944 : if (dump_enabled_p ())
626 : 40812 : dump_printf_loc (MSG_NOTE, vect_location,
627 : : "Access function of PHI: %T\n", access_fn);
628 : 616944 : STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
629 : 616944 : = initial_condition_in_loop_num (access_fn, loop->num);
630 : 616944 : STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
631 : 616944 : = evolution_part_in_loop_num (access_fn, loop->num);
632 : : }
633 : :
634 : 705416 : if ((!access_fn
635 : 616944 : || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
636 : 616225 : || !vect_is_simple_iv_evolution (loop->num, access_fn,
637 : : &init, &step)
638 : 522261 : || (LOOP_VINFO_LOOP (loop_vinfo) != loop
639 : 9491 : && TREE_CODE (step) != INTEGER_CST))
640 : : /* Only handle nonlinear iv for same loop. */
641 : 711633 : && (LOOP_VINFO_LOOP (loop_vinfo) != loop
642 : 91799 : || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
643 : : phi, &init, &step)))
644 : : {
645 : 88472 : worklist.safe_push (stmt_vinfo);
646 : 88472 : continue;
647 : : }
648 : :
649 : 528472 : gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
650 : : != NULL_TREE);
651 : 528472 : gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
652 : :
653 : 528472 : if (dump_enabled_p ())
654 : 35988 : dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
655 : 528472 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
656 : : }
657 : :
658 : :
659 : : /* Second - identify all reductions and nested cycles. */
660 : 392347 : while (worklist.length () > 0)
661 : : {
662 : 88472 : stmt_vec_info stmt_vinfo = worklist.pop ();
663 : 88472 : gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
664 : 88472 : tree def = PHI_RESULT (phi);
665 : :
666 : 88472 : if (dump_enabled_p ())
667 : 4824 : dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
668 : : (gimple *) phi);
669 : :
670 : 176944 : gcc_assert (!virtual_operand_p (def)
671 : : && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
672 : :
673 : 88472 : stmt_vec_info reduc_stmt_info
674 : 88472 : = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
675 : 88472 : &reduc_chain, slp);
676 : 88472 : if (reduc_stmt_info)
677 : : {
678 : 67667 : STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
679 : 67667 : STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
680 : 67667 : if (double_reduc)
681 : : {
682 : 719 : if (dump_enabled_p ())
683 : 99 : dump_printf_loc (MSG_NOTE, vect_location,
684 : : "Detected double reduction.\n");
685 : :
686 : 719 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
687 : 719 : STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
688 : : /* Make it accessible for SLP vectorization. */
689 : 719 : LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt_info);
690 : : }
691 : : else
692 : : {
693 : 66948 : if (loop != LOOP_VINFO_LOOP (loop_vinfo))
694 : : {
695 : 2758 : if (dump_enabled_p ())
696 : 463 : dump_printf_loc (MSG_NOTE, vect_location,
697 : : "Detected vectorizable nested cycle.\n");
698 : :
699 : 2758 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
700 : : }
701 : : else
702 : : {
703 : 64190 : if (dump_enabled_p ())
704 : 3805 : dump_printf_loc (MSG_NOTE, vect_location,
705 : : "Detected reduction.\n");
706 : :
707 : 64190 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
708 : 64190 : STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
709 : : /* Store the reduction cycles for possible vectorization in
710 : : loop-aware SLP if it was not detected as reduction
711 : : chain. */
712 : 64190 : if (! reduc_chain)
713 : 63391 : LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
714 : 63391 : (reduc_stmt_info);
715 : : }
716 : : }
717 : : }
718 : 20805 : else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
719 : 330 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
720 : : else
721 : 20475 : if (dump_enabled_p ())
722 : 406 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
723 : : "Unknown def-use cycle pattern.\n");
724 : : }
725 : 303875 : }
726 : :
727 : :
728 : : /* Function vect_analyze_scalar_cycles.
729 : :
730 : : Examine the cross iteration def-use cycles of scalar variables, by
731 : : analyzing the loop-header PHIs of scalar variables. Classify each
732 : : cycle as one of the following: invariant, induction, reduction, unknown.
733 : : We do that for the loop represented by LOOP_VINFO, and also for its
734 : : inner-loop, if it exists.
735 : : Examples for scalar cycles:
736 : :
737 : : Example1: reduction:
738 : :
739 : : loop1:
740 : : for (i=0; i<N; i++)
741 : : sum += a[i];
742 : :
743 : : Example2: induction:
744 : :
745 : : loop2:
746 : : for (i=0; i<N; i++)
747 : : a[i] = i; */
748 : :
749 : : static void
750 : 299061 : vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp)
751 : : {
752 : 299061 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
753 : :
754 : 299061 : vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp);
755 : :
756 : : /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
757 : : Reductions in such inner-loop therefore have different properties than
758 : : the reductions in the nest that gets vectorized:
759 : : 1. When vectorized, they are executed in the same order as in the original
760 : : scalar loop, so we can't change the order of computation when
761 : : vectorizing them.
762 : : 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
763 : : current checks are too strict. */
764 : :
765 : 299061 : if (loop->inner)
766 : 4814 : vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner, slp);
767 : 299061 : }
768 : :
769 : : /* Transfer group and reduction information from STMT_INFO to its
770 : : pattern stmt. */
771 : :
772 : : static void
773 : 29 : vect_fixup_reduc_chain (stmt_vec_info stmt_info)
774 : : {
775 : 29 : stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
776 : 29 : stmt_vec_info stmtp;
777 : 29 : gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
778 : : && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
779 : 29 : REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
780 : 246 : do
781 : : {
782 : 246 : stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
783 : 246 : gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
784 : : == STMT_VINFO_DEF_TYPE (stmt_info));
785 : 246 : REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
786 : 246 : stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
787 : 246 : if (stmt_info)
788 : 217 : REDUC_GROUP_NEXT_ELEMENT (stmtp)
789 : 217 : = STMT_VINFO_RELATED_STMT (stmt_info);
790 : : }
791 : 246 : while (stmt_info);
792 : 29 : }
793 : :
794 : : /* Fixup scalar cycles that now have their stmts detected as patterns. */
795 : :
796 : : static void
797 : 299061 : vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
798 : : {
799 : 299061 : stmt_vec_info first;
800 : 299061 : unsigned i;
801 : :
802 : 299860 : FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
803 : : {
804 : 799 : stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
805 : 3542 : while (next)
806 : : {
807 : 2771 : if ((STMT_VINFO_IN_PATTERN_P (next)
808 : 2771 : != STMT_VINFO_IN_PATTERN_P (first))
809 : 5514 : || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
810 : : break;
811 : 2743 : next = REDUC_GROUP_NEXT_ELEMENT (next);
812 : : }
813 : : /* If all reduction chain members are well-formed patterns adjust
814 : : the group to group the pattern stmts instead. */
815 : 799 : if (! next
816 : 828 : && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
817 : : {
818 : 771 : if (STMT_VINFO_IN_PATTERN_P (first))
819 : : {
820 : 29 : vect_fixup_reduc_chain (first);
821 : 58 : LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
822 : 29 : = STMT_VINFO_RELATED_STMT (first);
823 : : }
824 : : }
825 : : /* If not all stmts in the chain are patterns or if we failed
826 : : to update STMT_VINFO_REDUC_IDX, dissolve the chain and handle
827 : : it as a regular reduction instead. */
828 : : else
829 : : {
830 : : stmt_vec_info vinfo = first;
831 : : stmt_vec_info last = NULL;
832 : 117 : while (vinfo)
833 : : {
834 : 89 : next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
835 : 89 : REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
836 : 89 : REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
837 : 89 : last = vinfo;
838 : 89 : vinfo = next;
839 : : }
840 : 28 : STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
841 : 28 : = vect_internal_def;
842 : 31 : loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
843 : 28 : LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
844 : 28 : --i;
845 : : }
846 : : }
847 : 299061 : }
848 : :
849 : : /* Function vect_get_loop_niters.
850 : :
851 : : Determine how many iterations the loop executes and place it
852 : : in NUMBER_OF_ITERATIONS. Place the number of latch iterations
853 : : in NUMBER_OF_ITERATIONSM1. Place the condition under which the
854 : : niter information holds in ASSUMPTIONS.
855 : :
856 : : Return the loop exit conditions. */
857 : :
858 : :
859 : : static vec<gcond *>
860 : 240389 : vect_get_loop_niters (class loop *loop, const_edge main_exit, tree *assumptions,
861 : : tree *number_of_iterations, tree *number_of_iterationsm1)
862 : : {
863 : 240389 : auto_vec<edge> exits = get_loop_exit_edges (loop);
864 : 240389 : vec<gcond *> conds;
865 : 480778 : conds.create (exits.length ());
866 : 240389 : class tree_niter_desc niter_desc;
867 : 240389 : tree niter_assumptions, niter, may_be_zero;
868 : :
869 : 240389 : *assumptions = boolean_true_node;
870 : 240389 : *number_of_iterationsm1 = chrec_dont_know;
871 : 240389 : *number_of_iterations = chrec_dont_know;
872 : :
873 : 240389 : DUMP_VECT_SCOPE ("get_loop_niters");
874 : :
875 : 240389 : if (exits.is_empty ())
876 : 0 : return conds;
877 : :
878 : 240389 : if (dump_enabled_p ())
879 : 13362 : dump_printf_loc (MSG_NOTE, vect_location, "Loop has %d exits.\n",
880 : : exits.length ());
881 : :
882 : : edge exit;
883 : : unsigned int i;
884 : 576650 : FOR_EACH_VEC_ELT (exits, i, exit)
885 : : {
886 : 336261 : gcond *cond = get_loop_exit_condition (exit);
887 : 336261 : if (cond)
888 : 326986 : conds.safe_push (cond);
889 : :
890 : 336261 : if (dump_enabled_p ())
891 : 14288 : dump_printf_loc (MSG_NOTE, vect_location, "Analyzing exit %d...\n", i);
892 : :
893 : 336261 : if (exit != main_exit)
894 : 131620 : continue;
895 : :
896 : 240389 : may_be_zero = NULL_TREE;
897 : 240389 : if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
898 : 240389 : || chrec_contains_undetermined (niter_desc.niter))
899 : 35748 : continue;
900 : :
901 : 204641 : niter_assumptions = niter_desc.assumptions;
902 : 204641 : may_be_zero = niter_desc.may_be_zero;
903 : 204641 : niter = niter_desc.niter;
904 : :
905 : 204641 : if (may_be_zero && integer_zerop (may_be_zero))
906 : : may_be_zero = NULL_TREE;
907 : :
908 : 12342 : if (may_be_zero)
909 : : {
910 : 12342 : if (COMPARISON_CLASS_P (may_be_zero))
911 : : {
912 : : /* Try to combine may_be_zero with assumptions, this can simplify
913 : : computation of niter expression. */
914 : 12342 : if (niter_assumptions && !integer_nonzerop (niter_assumptions))
915 : 1082 : niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
916 : : niter_assumptions,
917 : : fold_build1 (TRUTH_NOT_EXPR,
918 : : boolean_type_node,
919 : : may_be_zero));
920 : : else
921 : 11260 : niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
922 : : build_int_cst (TREE_TYPE (niter), 0),
923 : : rewrite_to_non_trapping_overflow (niter));
924 : :
925 : 204641 : may_be_zero = NULL_TREE;
926 : : }
927 : 0 : else if (integer_nonzerop (may_be_zero))
928 : : {
929 : 0 : *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
930 : 0 : *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
931 : 0 : continue;
932 : : }
933 : : else
934 : 0 : continue;
935 : : }
936 : :
937 : : /* Loop assumptions are based off the normal exit. */
938 : 204641 : *assumptions = niter_assumptions;
939 : 204641 : *number_of_iterationsm1 = niter;
940 : :
941 : : /* We want the number of loop header executions which is the number
942 : : of latch executions plus one.
943 : : ??? For UINT_MAX latch executions this number overflows to zero
944 : : for loops like do { n++; } while (n != 0); */
945 : 204641 : if (niter && !chrec_contains_undetermined (niter))
946 : : {
947 : 204641 : niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter),
948 : : unshare_expr (niter),
949 : : build_int_cst (TREE_TYPE (niter), 1));
950 : 204641 : if (TREE_CODE (niter) == INTEGER_CST
951 : 111165 : && TREE_CODE (*number_of_iterationsm1) != INTEGER_CST)
952 : : {
953 : : /* If we manage to fold niter + 1 into INTEGER_CST even when
954 : : niter is some complex expression, ensure back
955 : : *number_of_iterationsm1 is an INTEGER_CST as well. See
956 : : PR113210. */
957 : 4 : *number_of_iterationsm1
958 : 4 : = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), niter,
959 : : build_minus_one_cst (TREE_TYPE (niter)));
960 : : }
961 : : }
962 : 204641 : *number_of_iterations = niter;
963 : : }
964 : :
965 : 240389 : if (dump_enabled_p ())
966 : 13362 : dump_printf_loc (MSG_NOTE, vect_location, "All loop exits successfully analyzed.\n");
967 : :
968 : 240389 : return conds;
969 : 240389 : }
970 : :
971 : : /* Determine the main loop exit for the vectorizer. */
972 : :
973 : : edge
974 : 452067 : vec_init_loop_exit_info (class loop *loop)
975 : : {
976 : : /* Before we begin we must first determine which exit is the main one and
977 : : which are auxiliary exits. */
978 : 452067 : auto_vec<edge> exits = get_loop_exit_edges (loop);
979 : 452067 : if (exits.length () == 1)
980 : 291492 : return exits[0];
981 : :
982 : : /* If we have multiple exits we only support counting IV at the moment.
983 : : Analyze all exits and return the last one we can analyze. */
984 : 160575 : class tree_niter_desc niter_desc;
985 : 160575 : edge candidate = NULL;
986 : 1093514 : for (edge exit : exits)
987 : : {
988 : 621417 : if (!get_loop_exit_condition (exit))
989 : 168458 : continue;
990 : :
991 : 452959 : if (number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
992 : 452959 : && !chrec_contains_undetermined (niter_desc.niter))
993 : : {
994 : 122409 : tree may_be_zero = niter_desc.may_be_zero;
995 : 122409 : if ((integer_zerop (may_be_zero)
996 : : /* As we are handling may_be_zero that's not false by
997 : : rewriting niter to may_be_zero ? 0 : niter we require
998 : : an empty latch. */
999 : 633738 : || (single_pred_p (loop->latch)
1000 : 11957 : && exit->src == single_pred (loop->latch)
1001 : 4078 : && (integer_nonzerop (may_be_zero)
1002 : 4078 : || COMPARISON_CLASS_P (may_be_zero))))
1003 : 126487 : && (!candidate
1004 : 4782 : || dominated_by_p (CDI_DOMINATORS, exit->src,
1005 : 4782 : candidate->src)))
1006 : : candidate = exit;
1007 : : }
1008 : : }
1009 : :
1010 : 160575 : return candidate;
1011 : 160575 : }
1012 : :
1013 : : /* Function bb_in_loop_p
1014 : :
1015 : : Used as predicate for dfs order traversal of the loop bbs. */
1016 : :
1017 : : static bool
1018 : 1210345 : bb_in_loop_p (const_basic_block bb, const void *data)
1019 : : {
1020 : 1210345 : const class loop *const loop = (const class loop *)data;
1021 : 1210345 : if (flow_bb_inside_loop_p (loop, bb))
1022 : : return true;
1023 : : return false;
1024 : : }
1025 : :
1026 : :
1027 : : /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1028 : : stmt_vec_info structs for all the stmts in LOOP_IN. */
1029 : :
1030 : 390202 : _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
1031 : : : vec_info (vec_info::loop, shared),
1032 : 390202 : loop (loop_in),
1033 : 390202 : num_itersm1 (NULL_TREE),
1034 : 390202 : num_iters (NULL_TREE),
1035 : 390202 : num_iters_unchanged (NULL_TREE),
1036 : 390202 : num_iters_assumptions (NULL_TREE),
1037 : 390202 : vector_costs (nullptr),
1038 : 390202 : scalar_costs (nullptr),
1039 : 390202 : th (0),
1040 : 390202 : versioning_threshold (0),
1041 : 390202 : vectorization_factor (0),
1042 : 390202 : main_loop_edge (nullptr),
1043 : 390202 : skip_main_loop_edge (nullptr),
1044 : 390202 : skip_this_loop_edge (nullptr),
1045 : 390202 : reusable_accumulators (),
1046 : 390202 : suggested_unroll_factor (1),
1047 : 390202 : max_vectorization_factor (0),
1048 : 390202 : mask_skip_niters (NULL_TREE),
1049 : 390202 : rgroup_compare_type (NULL_TREE),
1050 : 390202 : simd_if_cond (NULL_TREE),
1051 : 390202 : partial_vector_style (vect_partial_vectors_none),
1052 : 390202 : unaligned_dr (NULL),
1053 : 390202 : peeling_for_alignment (0),
1054 : 390202 : ptr_mask (0),
1055 : 390202 : ivexpr_map (NULL),
1056 : 390202 : scan_map (NULL),
1057 : 390202 : slp_unrolling_factor (1),
1058 : 390202 : inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
1059 : 390202 : vectorizable (false),
1060 : 390202 : can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
1061 : 390202 : must_use_partial_vectors_p (false),
1062 : 390202 : using_partial_vectors_p (false),
1063 : 390202 : using_decrementing_iv_p (false),
1064 : 390202 : using_select_vl_p (false),
1065 : 390202 : epil_using_partial_vectors_p (false),
1066 : 390202 : partial_load_store_bias (0),
1067 : 390202 : peeling_for_gaps (false),
1068 : 390202 : peeling_for_niter (false),
1069 : 390202 : early_breaks (false),
1070 : 390202 : no_data_dependencies (false),
1071 : 390202 : has_mask_store (false),
1072 : 390202 : scalar_loop_scaling (profile_probability::uninitialized ()),
1073 : 390202 : scalar_loop (NULL),
1074 : 390202 : main_loop_info (NULL),
1075 : 390202 : orig_loop_info (NULL),
1076 : 390202 : epilogue_vinfo (NULL),
1077 : 390202 : drs_advanced_by (NULL_TREE),
1078 : 390202 : vec_loop_iv_exit (NULL),
1079 : 390202 : vec_epilogue_loop_iv_exit (NULL),
1080 : 780404 : scalar_loop_iv_exit (NULL)
1081 : : {
1082 : : /* CHECKME: We want to visit all BBs before their successors (except for
1083 : : latch blocks, for which this assertion wouldn't hold). In the simple
1084 : : case of the loop forms we allow, a dfs order of the BBs would be the same
1085 : : as reversed postorder traversal, so we are safe. */
1086 : :
1087 : 390202 : bbs = XCNEWVEC (basic_block, loop->num_nodes);
1088 : 780404 : nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p, bbs,
1089 : 390202 : loop->num_nodes, loop);
1090 : 390202 : gcc_assert (nbbs == loop->num_nodes);
1091 : :
1092 : 1397930 : for (unsigned int i = 0; i < nbbs; i++)
1093 : : {
1094 : 1007728 : basic_block bb = bbs[i];
1095 : 1007728 : gimple_stmt_iterator si;
1096 : :
1097 : 2063639 : for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1098 : : {
1099 : 1055911 : gimple *phi = gsi_stmt (si);
1100 : 1055911 : gimple_set_uid (phi, 0);
1101 : 1055911 : add_stmt (phi);
1102 : : }
1103 : :
1104 : 8368284 : for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1105 : : {
1106 : 6352828 : gimple *stmt = gsi_stmt (si);
1107 : 6352828 : gimple_set_uid (stmt, 0);
1108 : 6352828 : if (is_gimple_debug (stmt))
1109 : 2079109 : continue;
1110 : 4273719 : add_stmt (stmt);
1111 : : /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
1112 : : third argument is the #pragma omp simd if (x) condition: when it is 0,
1113 : : the loop shouldn't be vectorized; when it is a non-zero constant, it
1114 : : should be vectorized normally; otherwise the loop is versioned, with the
1115 : : vectorized copy used if the condition is non-zero at runtime. */
1116 : 4273719 : if (loop_in->simduid
1117 : 43889 : && is_gimple_call (stmt)
1118 : 4294 : && gimple_call_internal_p (stmt)
1119 : 4155 : && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
1120 : 4154 : && gimple_call_num_args (stmt) >= 3
1121 : 103 : && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
1122 : 4273822 : && (loop_in->simduid
1123 : 103 : == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
1124 : : {
1125 : 103 : tree arg = gimple_call_arg (stmt, 2);
1126 : 103 : if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
1127 : 103 : simd_if_cond = arg;
1128 : : else
1129 : 0 : gcc_assert (integer_nonzerop (arg));
1130 : : }
1131 : : }
1132 : : }
1133 : 390202 : }
1134 : :
1135 : : /* Free all levels of rgroup CONTROLS. */
1136 : :
1137 : : void
1138 : 1217324 : release_vec_loop_controls (vec<rgroup_controls> *controls)
1139 : : {
1140 : 1217324 : rgroup_controls *rgc;
1141 : 1217324 : unsigned int i;
1142 : 1217349 : FOR_EACH_VEC_ELT (*controls, i, rgc)
1143 : 25 : rgc->controls.release ();
1144 : 1217324 : controls->release ();
1145 : 1217324 : }
1146 : :
1147 : : /* Free all memory used by the _loop_vec_info, as well as all the
1148 : : stmt_vec_info structs of all the stmts in the loop. */
1149 : :
1150 : 390202 : _loop_vec_info::~_loop_vec_info ()
1151 : : {
1152 : 390202 : free (bbs);
1153 : :
1154 : 390202 : release_vec_loop_controls (&masks.rgc_vec);
1155 : 390202 : release_vec_loop_controls (&lens);
1156 : 393367 : delete ivexpr_map;
1157 : 390524 : delete scan_map;
1158 : 390202 : delete scalar_costs;
1159 : 390202 : delete vector_costs;
1160 : :
1161 : : /* When we release an epilogue vinfo that we do not intend to use,
1162 : : avoid clearing AUX of the main loop which should continue to
1163 : : point to the main loop vinfo since otherwise we'll leak that. */
1164 : 390202 : if (loop->aux == this)
1165 : 55135 : loop->aux = NULL;
1166 : 780404 : }
1167 : :
1168 : : /* Return an invariant or register for EXPR and emit necessary
1169 : : computations in the LOOP_VINFO loop preheader. */
1170 : :
1171 : : tree
1172 : 17538 : cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1173 : : {
1174 : 17538 : if (is_gimple_reg (expr)
1175 : 17538 : || is_gimple_min_invariant (expr))
1176 : 5486 : return expr;
1177 : :
1178 : 12052 : if (! loop_vinfo->ivexpr_map)
1179 : 3165 : loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1180 : 12052 : tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1181 : 12052 : if (! cached)
1182 : : {
1183 : 7685 : gimple_seq stmts = NULL;
1184 : 7685 : cached = force_gimple_operand (unshare_expr (expr),
1185 : : &stmts, true, NULL_TREE);
1186 : 7685 : if (stmts)
1187 : : {
1188 : 7543 : edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1189 : 7543 : gsi_insert_seq_on_edge_immediate (e, stmts);
1190 : : }
1191 : : }
1192 : 12052 : return cached;
1193 : : }
1194 : :
1195 : : /* Return true if we can use CMP_TYPE as the comparison type to produce
1196 : : all masks required to mask LOOP_VINFO. */
1197 : :
1198 : : static bool
1199 : 91 : can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1200 : : {
1201 : 91 : rgroup_controls *rgm;
1202 : 91 : unsigned int i;
1203 : 104 : FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1204 : 104 : if (rgm->type != NULL_TREE
1205 : 104 : && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1206 : : cmp_type, rgm->type,
1207 : : OPTIMIZE_FOR_SPEED))
1208 : : return false;
1209 : : return true;
1210 : : }
1211 : :
1212 : : /* Calculate the maximum number of scalars per iteration for every
1213 : : rgroup in LOOP_VINFO. */
1214 : :
1215 : : static unsigned int
1216 : 21 : vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1217 : : {
1218 : 21 : unsigned int res = 1;
1219 : 21 : unsigned int i;
1220 : 21 : rgroup_controls *rgm;
1221 : 51 : FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1222 : 30 : res = MAX (res, rgm->max_nscalars_per_iter);
1223 : 21 : return res;
1224 : : }
1225 : :
1226 : : /* Calculate the minimum precision necessary to represent:
1227 : :
1228 : : MAX_NITERS * FACTOR
1229 : :
1230 : : as an unsigned integer, where MAX_NITERS is the maximum number of
1231 : : loop header iterations for the original scalar form of LOOP_VINFO. */
1232 : :
1233 : : static unsigned
1234 : 21 : vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1235 : : {
1236 : 21 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1237 : :
1238 : : /* Get the maximum number of iterations that is representable
1239 : : in the counter type. */
1240 : 21 : tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1241 : 21 : widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1242 : :
1243 : : /* Get a more refined estimate for the number of iterations. */
1244 : 21 : widest_int max_back_edges;
1245 : 21 : if (max_loop_iterations (loop, &max_back_edges))
1246 : 21 : max_ni = wi::smin (max_ni, max_back_edges + 1);
1247 : :
1248 : : /* Work out how many bits we need to represent the limit. */
1249 : 21 : return wi::min_precision (max_ni * factor, UNSIGNED);
1250 : 21 : }
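/* Worked example (illustrative numbers only): if the loop header is known
   to execute at most 1000 times and FACTOR is 2, the product 2000 requires
   11 bits, since 1024 <= 2000 < 2048 = 2^11.  */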
1251 : :
1252 : : /* True if the loop needs peeling or partial vectors when vectorized. */
1253 : :
1254 : : static bool
1255 : 118096 : vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1256 : : {
1257 : 118096 : unsigned HOST_WIDE_INT const_vf;
1258 : 118096 : HOST_WIDE_INT max_niter
1259 : 118096 : = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1260 : :
1261 : 118096 : unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1262 : 118096 : if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1263 : 25179 : th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1264 : : (loop_vinfo));
1265 : :
1266 : 118096 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1267 : 52398 : && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1268 : : {
1269 : : /* Work out the (constant) number of iterations that need to be
1270 : : peeled for reasons other than niters. */
1271 : 52384 : unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1272 : 52384 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1273 : 514 : peel_niter += 1;
1274 : 117091 : if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1275 : : LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1276 : : return true;
1277 : : }
1278 : 65712 : else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1279 : : /* ??? When peeling for gaps but not alignment, we could
1280 : : try to check whether the (variable) niters is known to be
1281 : : VF * N + 1. That's something of a niche case though. */
1282 : 65554 : || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1283 : 64598 : || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1284 : 130310 : || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1285 : 129196 : < (unsigned) exact_log2 (const_vf))
1286 : : /* In case of versioning, check if the maximum number of
1287 : : iterations is greater than th. If they are identical,
1288 : : the epilogue is unnecessary. */
1289 : 63611 : && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1290 : 4218 : || ((unsigned HOST_WIDE_INT) max_niter
1291 : : /* We'd like to use LOOP_VINFO_VERSIONING_THRESHOLD
1292 : : but that's only computed later based on our result.
1293 : : The following is the most conservative approximation. */
1294 : 4218 : > (std::max ((unsigned HOST_WIDE_INT) th,
1295 : 4218 : const_vf) / const_vf) * const_vf))))
1296 : 64707 : return true;
1297 : :
1298 : : return false;
1299 : : }
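/* Worked example (illustrative numbers only): with a known iteration count
   of 100, no peeling for alignment or gaps, and a vectorization factor of 8,
   100 is not a multiple of 8, so the function above returns true and the
   remaining 4 iterations must be handled by an epilogue loop or by partial
   vectors.  */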
1300 : :
1301 : : /* Each statement in LOOP_VINFO can be masked where necessary. Check
1302 : : whether we can actually generate the masks required. Return true if so,
1303 : : storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
1304 : :
1305 : : static bool
1306 : 21 : vect_verify_full_masking (loop_vec_info loop_vinfo)
1307 : : {
1308 : 21 : unsigned int min_ni_width;
1309 : :
1310 : : /* Use a normal loop if there are no statements that need masking.
1311 : : This only happens in rare degenerate cases: it means that the loop
1312 : : has no loads, no stores, and no live-out values. */
1313 : 21 : if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1314 : : return false;
1315 : :
1316 : : /* Produce the rgroup controls. */
1317 : 67 : for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1318 : : {
1319 : 23 : vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1320 : 23 : tree vectype = mask.first;
1321 : 23 : unsigned nvectors = mask.second;
1322 : :
1323 : 25 : if (masks->rgc_vec.length () < nvectors)
1324 : 22 : masks->rgc_vec.safe_grow_cleared (nvectors, true);
1325 : 23 : rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
1326 : : /* The number of scalars per iteration and the number of vectors are
1327 : : both compile-time constants. */
1328 : 23 : unsigned int nscalars_per_iter
1329 : 23 : = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1330 : 23 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1331 : :
1332 : 23 : if (rgm->max_nscalars_per_iter < nscalars_per_iter)
1333 : : {
1334 : 23 : rgm->max_nscalars_per_iter = nscalars_per_iter;
1335 : 23 : rgm->type = truth_type_for (vectype);
1336 : 23 : rgm->factor = 1;
1337 : : }
1338 : : }
1339 : :
1340 : 21 : unsigned int max_nscalars_per_iter
1341 : 21 : = vect_get_max_nscalars_per_iter (loop_vinfo);
1342 : :
1343 : : /* Work out how many bits we need to represent the limit. */
1344 : 21 : min_ni_width
1345 : 21 : = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1346 : :
1347 : : /* Find a scalar mode for which WHILE_ULT is supported. */
1348 : 21 : opt_scalar_int_mode cmp_mode_iter;
1349 : 21 : tree cmp_type = NULL_TREE;
1350 : 21 : tree iv_type = NULL_TREE;
1351 : 21 : widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1352 : 21 : unsigned int iv_precision = UINT_MAX;
1353 : :
1354 : 21 : if (iv_limit != -1)
1355 : 21 : iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1356 : : UNSIGNED);
1357 : :
1358 : 168 : FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1359 : : {
1360 : 147 : unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1361 : 147 : if (cmp_bits >= min_ni_width
1362 : 147 : && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1363 : : {
1364 : 91 : tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1365 : 91 : if (this_type
1366 : 91 : && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1367 : : {
1368 : : /* Although we could stop as soon as we find a valid mode,
1369 : : there are at least two reasons why that's not always the
1370 : : best choice:
1371 : :
1372 : : - An IV that's Pmode or wider is more likely to be reusable
1373 : : in address calculations than an IV that's narrower than
1374 : : Pmode.
1375 : :
1376 : : - Doing the comparison in IV_PRECISION or wider allows
1377 : : a natural 0-based IV, whereas using a narrower comparison
1378 : : type requires mitigations against wrap-around.
1379 : :
1380 : : Conversely, if the IV limit is variable, doing the comparison
1381 : : in a wider type than the original type can introduce
1382 : : unnecessary extensions, so picking the widest valid mode
1383 : : is not always a good choice either.
1384 : :
1385 : : Here we prefer the first IV type that's Pmode or wider,
1386 : : and the first comparison type that's IV_PRECISION or wider.
1387 : : (The comparison type must be no wider than the IV type,
1388 : : to avoid extensions in the vector loop.)
1389 : :
1390 : : ??? We might want to try continuing beyond Pmode for ILP32
1391 : : targets if CMP_BITS < IV_PRECISION. */
1392 : 0 : iv_type = this_type;
1393 : 0 : if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1394 : : cmp_type = this_type;
1395 : 0 : if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1396 : : break;
1397 : : }
1398 : : }
1399 : : }
1400 : :
1401 : 21 : if (!cmp_type)
1402 : : {
1403 : 21 : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
1404 : 21 : return false;
1405 : : }
1406 : :
1407 : 0 : LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1408 : 0 : LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1409 : 0 : LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
1410 : 0 : return true;
1411 : 21 : }
1412 : :
1413 : : /* Each statement in LOOP_VINFO can be masked where necessary. Check
1414 : : whether we can actually generate AVX512 style masks. Return true if so,
1415 : : storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE. */
1416 : :
1417 : : static bool
1418 : 21 : vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
1419 : : {
    1420                 :            :   /* Produce a differently organized rgc_vec and check differently
    1421                 :            :      whether we can produce the masks.  */
1422 : :
1423 : : /* Use a normal loop if there are no statements that need masking.
1424 : : This only happens in rare degenerate cases: it means that the loop
1425 : : has no loads, no stores, and no live-out values. */
1426 : 21 : if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1427 : : return false;
1428 : :
1429 : : /* For the decrementing IV we need to represent all values in
1430 : : [0, niter + niter_skip] where niter_skip is the elements we
1431 : : skip in the first iteration for prologue peeling. */
1432 : 21 : tree iv_type = NULL_TREE;
1433 : 21 : widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1434 : 21 : unsigned int iv_precision = UINT_MAX;
1435 : 21 : if (iv_limit != -1)
1436 : 21 : iv_precision = wi::min_precision (iv_limit, UNSIGNED);
1437 : :
1438 : : /* First compute the type for the IV we use to track the remaining
1439 : : scalar iterations. */
1440 : 21 : opt_scalar_int_mode cmp_mode_iter;
1441 : 35 : FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1442 : : {
1443 : 35 : unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1444 : 35 : if (cmp_bits >= iv_precision
1445 : 35 : && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1446 : : {
1447 : 21 : iv_type = build_nonstandard_integer_type (cmp_bits, true);
1448 : 21 : if (iv_type)
1449 : : break;
1450 : : }
1451 : : }
1452 : 21 : if (!iv_type)
1453 : : return false;
1454 : :
1455 : : /* Produce the rgroup controls. */
1456 : 67 : for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1457 : : {
1458 : 23 : vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1459 : 23 : tree vectype = mask.first;
1460 : 23 : unsigned nvectors = mask.second;
1461 : :
1462 : : /* The number of scalars per iteration and the number of vectors are
1463 : : both compile-time constants. */
1464 : 23 : unsigned int nscalars_per_iter
1465 : 23 : = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1466 : 23 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1467 : :
1468 : : /* We index the rgroup_controls vector with nscalars_per_iter
1469 : : which we keep constant and instead have a varying nvectors,
1470 : : remembering the vector mask with the fewest nV. */
1471 : 25 : if (masks->rgc_vec.length () < nscalars_per_iter)
1472 : 21 : masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true);
1473 : 23 : rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];
1474 : :
1475 : 23 : if (!rgm->type || rgm->factor > nvectors)
1476 : : {
1477 : 22 : rgm->type = truth_type_for (vectype);
1478 : 22 : rgm->compare_type = NULL_TREE;
1479 : 22 : rgm->max_nscalars_per_iter = nscalars_per_iter;
1480 : 22 : rgm->factor = nvectors;
1481 : 22 : rgm->bias_adjusted_ctrl = NULL_TREE;
1482 : : }
1483 : : }
1484 : :
1485 : : /* There is no fixed compare type we are going to use but we have to
1486 : : be able to get at one for each mask group. */
1487 : 21 : unsigned int min_ni_width
1488 : 21 : = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);
1489 : :
1490 : 21 : bool ok = true;
1491 : 86 : for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1492 : : {
1493 : 25 : tree mask_type = rgc.type;
1494 : 25 : if (!mask_type)
1495 : 4 : continue;
1496 : :
1497 : : /* For now vect_get_loop_mask only supports integer mode masks
1498 : : when we need to split it. */
1499 : 21 : if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_INT
1500 : 21 : || TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
1501 : : {
1502 : : ok = false;
1503 : : break;
1504 : : }
1505 : :
1506 : : /* If iv_type is usable as compare type use that - we can elide the
1507 : : saturation in that case. */
1508 : 19 : if (TYPE_PRECISION (iv_type) >= min_ni_width)
1509 : : {
1510 : 19 : tree cmp_vectype
1511 : 19 : = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
1512 : 19 : if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1513 : 1 : rgc.compare_type = cmp_vectype;
1514 : : }
1515 : 19 : if (!rgc.compare_type)
1516 : 51 : FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1517 : : {
1518 : 51 : unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1519 : 51 : if (cmp_bits >= min_ni_width
1520 : 51 : && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1521 : : {
1522 : 51 : tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
1523 : 51 : if (!cmp_type)
1524 : 0 : continue;
1525 : :
1526 : : /* Check whether we can produce the mask with cmp_type. */
1527 : 51 : tree cmp_vectype
1528 : 51 : = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
1529 : 51 : if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1530 : : {
1531 : 18 : rgc.compare_type = cmp_vectype;
1532 : 18 : break;
1533 : : }
1534 : : }
1535 : : }
1536 : 19 : if (!rgc.compare_type)
1537 : : {
1538 : : ok = false;
1539 : : break;
1540 : : }
1541 : : }
1542 : 21 : if (!ok)
1543 : : {
1544 : 2 : release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
1545 : 2 : return false;
1546 : : }
1547 : :
1548 : 19 : LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
1549 : 19 : LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1550 : 19 : LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
1551 : 19 : return true;
1552 : 21 : }
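In the AVX512 style the mask for each rgroup is instead produced by an LT comparison of the lane indices, in the chosen compare type, against the remaining iteration count. A hedged scalar model of that comparison (illustrative only; the mask is held in an unsigned integer, mirroring the integer-mode mask check above, and VF is assumed to be at most 32):

/* Illustrative sketch only: lane LANE of the control mask is set while
   LANE < REMAINING, mirroring the vector LT_EXPR checked above.  */
static unsigned
avx512_style_mask (unsigned long remaining, unsigned vf)
{
  unsigned mask = 0;
  for (unsigned lane = 0; lane < vf; lane++)
    if (lane < remaining)
      mask |= 1u << lane;
  return mask;
}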
1553 : :
    1554                 :            : /* Check whether we can use vector accesses with length, based on a precision
    1555                 :            :    comparison.  So far, to keep it simple, we only allow the case in which the
    1556                 :            :    precision of the length supported by the target is larger than the precision
    1557                 :            :    required by the loop niters.  */
1558 : :
1559 : : static bool
1560 : 0 : vect_verify_loop_lens (loop_vec_info loop_vinfo)
1561 : : {
1562 : 0 : if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1563 : : return false;
1564 : :
1565 : 0 : machine_mode len_load_mode, len_store_mode;
1566 : 0 : if (!get_len_load_store_mode (loop_vinfo->vector_mode, true)
1567 : 0 : .exists (&len_load_mode))
1568 : 0 : return false;
1569 : 0 : if (!get_len_load_store_mode (loop_vinfo->vector_mode, false)
1570 : 0 : .exists (&len_store_mode))
1571 : 0 : return false;
1572 : :
1573 : 0 : signed char partial_load_bias = internal_len_load_store_bias
1574 : 0 : (IFN_LEN_LOAD, len_load_mode);
1575 : :
1576 : 0 : signed char partial_store_bias = internal_len_load_store_bias
1577 : 0 : (IFN_LEN_STORE, len_store_mode);
1578 : :
1579 : 0 : gcc_assert (partial_load_bias == partial_store_bias);
1580 : :
1581 : 0 : if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1582 : : return false;
1583 : :
1584 : : /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1585 : : len_loads with a length of zero. In order to avoid that we prohibit
1586 : : more than one loop length here. */
1587 : 0 : if (partial_load_bias == -1
1588 : 0 : && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1589 : : return false;
1590 : :
1591 : 0 : LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1592 : :
1593 : 0 : unsigned int max_nitems_per_iter = 1;
1594 : 0 : unsigned int i;
1595 : 0 : rgroup_controls *rgl;
1596 : : /* Find the maximum number of items per iteration for every rgroup. */
1597 : 0 : FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1598 : : {
1599 : 0 : unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1600 : 0 : max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1601 : : }
1602 : :
1603 : : /* Work out how many bits we need to represent the length limit. */
1604 : 0 : unsigned int min_ni_prec
1605 : 0 : = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1606 : :
    1607                 :            :   /* Now use the maximum of the precisions below for one suitable IV type:
1608 : : - the IV's natural precision
1609 : : - the precision needed to hold: the maximum number of scalar
1610 : : iterations multiplied by the scale factor (min_ni_prec above)
1611 : : - the Pmode precision
1612 : :
1613 : : If min_ni_prec is less than the precision of the current niters,
    1614                 :            :      we prefer to still use the niters type.  Prefer to use a Pmode or
    1615                 :            :      wider IV to avoid narrow conversions.  */
1616 : :
1617 : 0 : unsigned int ni_prec
1618 : 0 : = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1619 : 0 : min_ni_prec = MAX (min_ni_prec, ni_prec);
1620 : 0 : min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1621 : :
1622 : 0 : tree iv_type = NULL_TREE;
1623 : 0 : opt_scalar_int_mode tmode_iter;
1624 : 0 : FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1625 : : {
1626 : 0 : scalar_mode tmode = tmode_iter.require ();
1627 : 0 : unsigned int tbits = GET_MODE_BITSIZE (tmode);
1628 : :
1629 : : /* ??? Do we really want to construct one IV whose precision exceeds
1630 : : BITS_PER_WORD? */
1631 : 0 : if (tbits > BITS_PER_WORD)
1632 : : break;
1633 : :
1634 : : /* Find the first available standard integral type. */
1635 : 0 : if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1636 : : {
1637 : 0 : iv_type = build_nonstandard_integer_type (tbits, true);
1638 : 0 : break;
1639 : : }
1640 : : }
1641 : :
1642 : 0 : if (!iv_type)
1643 : : {
1644 : 0 : if (dump_enabled_p ())
1645 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1646 : : "can't vectorize with length-based partial vectors"
1647 : : " because there is no suitable iv type.\n");
1648 : 0 : return false;
1649 : : }
1650 : :
1651 : 0 : LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1652 : 0 : LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1653 : 0 : LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;
1654 : :
1655 : 0 : return true;
1656 : : }
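With length-based partial vectors the control is a scalar length rather than a mask: each iteration processes MIN (remaining, VF) elements, and a target bias of -1 only shifts the length operand the target actually sees. A scalar model of the resulting loop (illustrative only; VF of 4 is an assumption):

#include <stddef.h>

/* Illustrative sketch only: LEN plays the role of the IFN_LEN_LOAD/STORE
   length operand; with a target bias of -1 the operand would be LEN - 1,
   which is why a length of zero must never be emitted in that case.  */
static void
len_copy (int *dst, const int *src, size_t n)
{
  const size_t vf = 4;                       /* assumed vectorization factor  */
  for (size_t i = 0; i < n; i += vf)
    {
      size_t len = (n - i < vf) ? n - i : vf;
      for (size_t j = 0; j < len; j++)
        dst[i + j] = src[i + j];
    }
}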
1657 : :
1658 : : /* Calculate the cost of one scalar iteration of the loop. */
1659 : : static void
1660 : 212341 : vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1661 : : {
1662 : 212341 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1663 : 212341 : basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1664 : 212341 : int nbbs = loop->num_nodes, factor;
1665 : 212341 : int innerloop_iters, i;
1666 : :
1667 : 212341 : DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1668 : :
1669 : : /* Gather costs for statements in the scalar loop. */
1670 : :
1671 : : /* FORNOW. */
1672 : 212341 : innerloop_iters = 1;
1673 : 212341 : if (loop->inner)
1674 : 1164 : innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1675 : :
1676 : 735087 : for (i = 0; i < nbbs; i++)
1677 : : {
1678 : 522746 : gimple_stmt_iterator si;
1679 : 522746 : basic_block bb = bbs[i];
1680 : :
1681 : 522746 : if (bb->loop_father == loop->inner)
1682 : : factor = innerloop_iters;
1683 : : else
1684 : 520418 : factor = 1;
1685 : :
1686 : 4053856 : for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1687 : : {
1688 : 3008364 : gimple *stmt = gsi_stmt (si);
1689 : 3008364 : stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1690 : :
1691 : 3008364 : if (!is_gimple_assign (stmt)
1692 : : && !is_gimple_call (stmt)
1693 : : && !is_a<gcond *> (stmt))
1694 : 801608 : continue;
1695 : :
1696 : : /* Skip stmts that are not vectorized inside the loop. */
1697 : 2206756 : stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1698 : 2206756 : if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1699 : 974866 : && (!STMT_VINFO_LIVE_P (vstmt_info)
1700 : 65 : || !VECTORIZABLE_CYCLE_DEF
1701 : : (STMT_VINFO_DEF_TYPE (vstmt_info))))
1702 : 974866 : continue;
1703 : :
1704 : 1231890 : vect_cost_for_stmt kind;
1705 : 1231890 : if (STMT_VINFO_DATA_REF (stmt_info))
1706 : : {
1707 : 535161 : if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1708 : : kind = scalar_load;
1709 : : else
1710 : 199387 : kind = scalar_store;
1711 : : }
1712 : 696729 : else if (vect_nop_conversion_p (stmt_info))
1713 : 32783 : continue;
1714 : : else
1715 : : kind = scalar_stmt;
1716 : :
1717 : : /* We are using vect_prologue here to avoid scaling twice
1718 : : by the inner loop factor. */
1719 : 1199107 : record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1720 : : factor, kind, stmt_info, 0, vect_prologue);
1721 : : }
1722 : : }
1723 : :
1724 : : /* Now accumulate cost. */
1725 : 212341 : loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1726 : 212341 : add_stmt_costs (loop_vinfo->scalar_costs,
1727 : : &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1728 : 212341 : loop_vinfo->scalar_costs->finish_cost (nullptr);
1729 : 212341 : }
1730 : :
1731 : : /* Function vect_analyze_loop_form.
1732 : :
1733 : : Verify that certain CFG restrictions hold, including:
1734 : : - the loop has a pre-header
1735 : : - the loop has a single entry
1736 : : - nested loops can have only a single exit.
1737 : : - the loop exit condition is simple enough
    1738                 :            :    - the number of iterations can be analyzed, i.e., a countable loop.  The
1739 : : niter could be analyzed under some assumptions. */
1740 : :
1741 : : opt_result
1742 : 423123 : vect_analyze_loop_form (class loop *loop, gimple *loop_vectorized_call,
1743 : : vect_loop_form_info *info)
1744 : : {
1745 : 423123 : DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1746 : :
1747 : 423123 : edge exit_e = vec_init_loop_exit_info (loop);
1748 : 423123 : if (!exit_e)
1749 : 51191 : return opt_result::failure_at (vect_location,
1750 : : "not vectorized:"
1751 : : " could not determine main exit from"
1752 : : " loop with multiple exits.\n");
1753 : 371932 : if (loop_vectorized_call)
1754 : : {
1755 : 23533 : tree arg = gimple_call_arg (loop_vectorized_call, 1);
1756 : 23533 : class loop *scalar_loop = get_loop (cfun, tree_to_shwi (arg));
1757 : 23533 : edge scalar_exit_e = vec_init_loop_exit_info (scalar_loop);
1758 : 23533 : if (!scalar_exit_e)
1759 : 0 : return opt_result::failure_at (vect_location,
1760 : : "not vectorized:"
1761 : : " could not determine main exit from"
1762 : : " loop with multiple exits.\n");
1763 : : }
1764 : :
1765 : 371932 : info->loop_exit = exit_e;
1766 : 371932 : if (dump_enabled_p ())
1767 : 14640 : dump_printf_loc (MSG_NOTE, vect_location,
1768 : : "using as main loop exit: %d -> %d [AUX: %p]\n",
1769 : 14640 : exit_e->src->index, exit_e->dest->index, exit_e->aux);
1770 : :
1771 : : /* Check if we have any control flow that doesn't leave the loop. */
1772 : 371932 : basic_block *bbs = get_loop_body (loop);
1773 : 1247007 : for (unsigned i = 0; i < loop->num_nodes; i++)
1774 : 975691 : if (EDGE_COUNT (bbs[i]->succs) != 1
1775 : 975691 : && (EDGE_COUNT (bbs[i]->succs) != 2
1776 : 574830 : || !loop_exits_from_bb_p (bbs[i]->loop_father, bbs[i])))
1777 : : {
1778 : 100616 : free (bbs);
1779 : 100616 : return opt_result::failure_at (vect_location,
1780 : : "not vectorized:"
1781 : : " unsupported control flow in loop.\n");
1782 : : }
1783 : 271316 : free (bbs);
1784 : :
1785 : : /* Different restrictions apply when we are considering an inner-most loop,
1786 : : vs. an outer (nested) loop.
1787 : : (FORNOW. May want to relax some of these restrictions in the future). */
1788 : :
1789 : 271316 : info->inner_loop_cond = NULL;
1790 : 271316 : if (!loop->inner)
1791 : : {
1792 : : /* Inner-most loop. */
1793 : :
1794 : 252098 : if (empty_block_p (loop->header))
1795 : 3 : return opt_result::failure_at (vect_location,
1796 : : "not vectorized: empty loop.\n");
1797 : : }
1798 : : else
1799 : : {
1800 : 19218 : class loop *innerloop = loop->inner;
1801 : 19218 : edge entryedge;
1802 : :
1803 : : /* Nested loop. We currently require that the loop is doubly-nested,
1804 : : contains a single inner loop with a single exit to the block
1805 : : with the single exit condition in the outer loop.
1806 : : Vectorizable outer-loops look like this:
1807 : :
1808 : : (pre-header)
1809 : : |
1810 : : header <---+
1811 : : | |
1812 : : inner-loop |
1813 : : | |
1814 : : tail ------+
1815 : : |
1816 : : (exit-bb)
1817 : :
1818 : : The inner-loop also has the properties expected of inner-most loops
1819 : : as described above. */
1820 : :
1821 : 19218 : if ((loop->inner)->inner || (loop->inner)->next)
1822 : 2924 : return opt_result::failure_at (vect_location,
1823 : : "not vectorized:"
1824 : : " multiple nested loops.\n");
1825 : :
1826 : 16294 : entryedge = loop_preheader_edge (innerloop);
1827 : 16294 : if (entryedge->src != loop->header
1828 : 15952 : || !single_exit (innerloop)
1829 : 27176 : || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1830 : 5695 : return opt_result::failure_at (vect_location,
1831 : : "not vectorized:"
1832 : : " unsupported outerloop form.\n");
1833 : :
1834 : : /* Analyze the inner-loop. */
1835 : 10599 : vect_loop_form_info inner;
1836 : 10599 : opt_result res = vect_analyze_loop_form (loop->inner, NULL, &inner);
1837 : 10599 : if (!res)
1838 : : {
1839 : 1179 : if (dump_enabled_p ())
1840 : 5 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1841 : : "not vectorized: Bad inner loop.\n");
1842 : 1179 : return res;
1843 : : }
1844 : :
1845 : : /* Don't support analyzing niter under assumptions for inner
1846 : : loop. */
1847 : 9420 : if (!integer_onep (inner.assumptions))
1848 : 283 : return opt_result::failure_at (vect_location,
1849 : : "not vectorized: Bad inner loop.\n");
1850 : :
1851 : 9137 : if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1852 : 1080 : return opt_result::failure_at (vect_location,
1853 : : "not vectorized: inner-loop count not"
1854 : : " invariant.\n");
1855 : :
1856 : 8057 : if (dump_enabled_p ())
1857 : 915 : dump_printf_loc (MSG_NOTE, vect_location,
1858 : : "Considering outer-loop vectorization.\n");
1859 : 8057 : info->inner_loop_cond = inner.conds[0];
1860 : 10599 : }
1861 : :
1862 : 260152 : if (EDGE_COUNT (loop->header->preds) != 2)
1863 : 0 : return opt_result::failure_at (vect_location,
1864 : : "not vectorized:"
1865 : : " too many incoming edges.\n");
1866 : :
1867 : : /* We assume that the latch is empty. */
1868 : 260152 : basic_block latch = loop->latch;
1869 : 260152 : do
1870 : : {
1871 : 260152 : if (!empty_block_p (latch)
1872 : 260152 : || !gimple_seq_empty_p (phi_nodes (latch)))
1873 : 19719 : return opt_result::failure_at (vect_location,
1874 : : "not vectorized: latch block not "
1875 : : "empty.\n");
1876 : 240433 : latch = single_pred (latch);
1877 : : }
1878 : 480866 : while (single_succ_p (latch));
1879 : :
1880 : : /* Make sure there is no abnormal exit. */
1881 : 240433 : auto_vec<edge> exits = get_loop_exit_edges (loop);
1882 : 1057571 : for (edge e : exits)
1883 : : {
1884 : 336316 : if (e->flags & EDGE_ABNORMAL)
1885 : 44 : return opt_result::failure_at (vect_location,
1886 : : "not vectorized:"
1887 : : " abnormal loop exit edge.\n");
1888 : : }
1889 : :
1890 : 240389 : info->conds
1891 : 240389 : = vect_get_loop_niters (loop, exit_e, &info->assumptions,
1892 : : &info->number_of_iterations,
1893 : 240389 : &info->number_of_iterationsm1);
1894 : 240389 : if (info->conds.is_empty ())
1895 : 32 : return opt_result::failure_at
1896 : 32 : (vect_location,
1897 : : "not vectorized: complicated exit condition.\n");
1898 : :
1899 : : /* Determine what the primary and alternate exit conds are. */
1900 : 567343 : for (unsigned i = 0; i < info->conds.length (); i++)
1901 : : {
1902 : 326986 : gcond *cond = info->conds[i];
1903 : 326986 : if (exit_e->src == gimple_bb (cond))
1904 : 240357 : std::swap (info->conds[0], info->conds[i]);
1905 : : }
1906 : :
1907 : 240357 : if (integer_zerop (info->assumptions)
1908 : 240357 : || !info->number_of_iterations
1909 : 480714 : || chrec_contains_undetermined (info->number_of_iterations))
1910 : 35716 : return opt_result::failure_at
1911 : 35716 : (info->conds[0],
1912 : : "not vectorized: number of iterations cannot be computed.\n");
1913 : :
1914 : 204641 : if (integer_zerop (info->number_of_iterations))
1915 : 16 : return opt_result::failure_at
1916 : 16 : (info->conds[0],
1917 : : "not vectorized: number of iterations = 0.\n");
1918 : :
1919 : 204625 : if (!(tree_fits_shwi_p (info->number_of_iterations)
1920 : 111142 : && tree_to_shwi (info->number_of_iterations) > 0))
1921 : : {
1922 : 93483 : if (dump_enabled_p ())
1923 : : {
1924 : 2194 : dump_printf_loc (MSG_NOTE, vect_location,
1925 : : "Symbolic number of iterations is ");
1926 : 2194 : dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1927 : 2194 : dump_printf (MSG_NOTE, "\n");
1928 : : }
1929 : : }
1930 : :
1931 : 204625 : return opt_result::success ();
1932 : 240433 : }
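For the outer-loop case, the restrictions above amount to accepting doubly-nested loops of the following shape: a single inner loop whose exit leads to the block holding the outer exit condition and whose iteration count is invariant in the outer loop. An illustrative source-level example (a sketch, not taken from the compiler or testsuite):

/* Illustrative sketch only of a loop nest the outer-loop checks accept.  */
void
outer_loop_candidate (int a[][64], int n)
{
  for (int i = 0; i < n; i++)        /* outer loop: vectorization candidate  */
    {
      int sum = 0;
      for (int j = 0; j < 64; j++)   /* inner loop: niter invariant in i     */
        sum += a[i][j];
      a[i][0] = sum;
    }
}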
1933 : :
1934 : : /* Create a loop_vec_info for LOOP with SHARED and the
1935 : : vect_analyze_loop_form result. */
1936 : :
1937 : : loop_vec_info
1938 : 390202 : vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1939 : : const vect_loop_form_info *info,
1940 : : loop_vec_info orig_loop_info)
1941 : : {
1942 : 390202 : loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1943 : 390202 : LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1944 : 390202 : LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1945 : 390202 : LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1946 : 390202 : LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_info;
1947 : 390202 : if (orig_loop_info && LOOP_VINFO_EPILOGUE_P (orig_loop_info))
1948 : 136 : LOOP_VINFO_MAIN_LOOP_INFO (loop_vinfo)
1949 : 136 : = LOOP_VINFO_MAIN_LOOP_INFO (orig_loop_info);
1950 : : else
1951 : 390066 : LOOP_VINFO_MAIN_LOOP_INFO (loop_vinfo) = orig_loop_info;
1952 : : /* Also record the assumptions for versioning. */
1953 : 390202 : if (!integer_onep (info->assumptions) && !orig_loop_info)
1954 : 18152 : LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1955 : :
1956 : 1754062 : for (gcond *cond : info->conds)
1957 : : {
1958 : 583456 : stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (cond);
1959 : 583456 : STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1960 : : /* Mark the statement as a condition. */
1961 : 583456 : STMT_VINFO_DEF_TYPE (loop_cond_info) = vect_condition_def;
1962 : : }
1963 : :
1964 : 583456 : for (unsigned i = 1; i < info->conds.length (); i ++)
1965 : 193254 : LOOP_VINFO_LOOP_CONDS (loop_vinfo).safe_push (info->conds[i]);
1966 : 390202 : LOOP_VINFO_LOOP_IV_COND (loop_vinfo) = info->conds[0];
1967 : :
1968 : 390202 : LOOP_VINFO_IV_EXIT (loop_vinfo) = info->loop_exit;
1969 : :
1970 : : /* Check to see if we're vectorizing multiple exits. */
1971 : 390202 : LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
1972 : 390202 : = !LOOP_VINFO_LOOP_CONDS (loop_vinfo).is_empty ();
1973 : :
1974 : 390202 : if (info->inner_loop_cond)
1975 : : {
1976 : 8225 : stmt_vec_info inner_loop_cond_info
1977 : 8225 : = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1978 : 8225 : STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1979 : : /* If we have an estimate on the number of iterations of the inner
1980 : : loop use that to limit the scale for costing, otherwise use
1981 : : --param vect-inner-loop-cost-factor literally. */
1982 : 8225 : widest_int nit;
1983 : 8225 : if (estimated_stmt_executions (loop->inner, &nit))
1984 : 7035 : LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1985 : 7035 : = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1986 : 8225 : }
1987 : :
1988 : 390202 : return loop_vinfo;
1989 : : }
1990 : :
1991 : :
1992 : :
    1993                 :            : /* Scan the loop stmts and, depending on whether there are any (non-)SLP
    1994                 :            :    statements, update the vectorization factor.  */
1995 : :
1996 : : static void
1997 : 316880 : vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1998 : : {
1999 : 316880 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2000 : 316880 : basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2001 : 316880 : int nbbs = loop->num_nodes;
2002 : 316880 : poly_uint64 vectorization_factor;
2003 : 316880 : int i;
2004 : :
2005 : 316880 : DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
2006 : :
2007 : 316880 : vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2008 : 316880 : gcc_assert (known_ne (vectorization_factor, 0U));
2009 : :
    2010                 :            :   /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
    2011                 :            :      vectorization factor of the loop is the unrolling factor required by
    2012                 :            :      the SLP instances.  If that unrolling factor is 1, we say that we
    2013                 :            :      perform pure SLP on the loop - cross-iteration parallelism is not
    2014                 :            :      exploited.  */
2015 : : bool only_slp_in_loop = true;
2016 : 1124980 : for (i = 0; i < nbbs; i++)
2017 : : {
2018 : 808100 : basic_block bb = bbs[i];
2019 : 1610911 : for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
2020 : 802811 : gsi_next (&si))
2021 : : {
2022 : 802811 : stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
2023 : 802811 : if (!stmt_info)
2024 : 0 : continue;
2025 : 802811 : if ((STMT_VINFO_RELEVANT_P (stmt_info)
2026 : 425053 : || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2027 : 377770 : && !PURE_SLP_STMT (stmt_info))
2028 : : /* STMT needs both SLP and loop-based vectorization. */
2029 : 802811 : only_slp_in_loop = false;
2030 : : }
2031 : 5999664 : for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2032 : 4383464 : gsi_next (&si))
2033 : : {
2034 : 4383464 : if (is_gimple_debug (gsi_stmt (si)))
2035 : 1255965 : continue;
2036 : 3127499 : stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2037 : 3127499 : stmt_info = vect_stmt_to_vectorize (stmt_info);
2038 : 3127499 : if ((STMT_VINFO_RELEVANT_P (stmt_info)
2039 : 1326651 : || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2040 : 1800852 : && !PURE_SLP_STMT (stmt_info))
2041 : : /* STMT needs both SLP and loop-based vectorization. */
2042 : 4383464 : only_slp_in_loop = false;
2043 : : }
2044 : : }
2045 : :
2046 : 316880 : if (only_slp_in_loop)
2047 : : {
2048 : 308584 : if (dump_enabled_p ())
2049 : 19495 : dump_printf_loc (MSG_NOTE, vect_location,
2050 : : "Loop contains only SLP stmts\n");
2051 : 308584 : vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
2052 : : }
2053 : : else
2054 : : {
2055 : 8296 : if (dump_enabled_p ())
2056 : 252 : dump_printf_loc (MSG_NOTE, vect_location,
2057 : : "Loop contains SLP and non-SLP stmts\n");
2058 : : /* Both the vectorization factor and unroll factor have the form
2059 : : GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
2060 : : so they must have a common multiple. */
2061 : 8296 : vectorization_factor
2062 : 8296 : = force_common_multiple (vectorization_factor,
2063 : 8296 : LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
2064 : : }
2065 : :
2066 : 316880 : LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
2067 : 316880 : if (dump_enabled_p ())
2068 : : {
2069 : 19747 : dump_printf_loc (MSG_NOTE, vect_location,
2070 : : "Updating vectorization factor to ");
2071 : 19747 : dump_dec (MSG_NOTE, vectorization_factor);
2072 : 19747 : dump_printf (MSG_NOTE, ".\n");
2073 : : }
2074 : 316880 : }
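When SLP and non-SLP statements coexist, the final VF is a common multiple of the loop VF and the SLP unrolling factor; for constant factors that is just the least common multiple, e.g. a loop VF of 4 and an SLP unrolling factor of 6 give 12. A small scalar model of that computation (illustrative only; the real force_common_multiple operates on poly_uint64 values):

/* Illustrative sketch only: least common multiple of two constant factors.  */
static unsigned
common_multiple (unsigned a, unsigned b)
{
  unsigned x = a, y = b;
  while (y != 0)                     /* Euclid's algorithm for the gcd  */
    {
      unsigned t = x % y;
      x = y;
      y = t;
    }
  return a / x * b;                  /* lcm (a, b) = a * b / gcd (a, b) */
}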
2075 : :
2076 : : /* Return true if STMT_INFO describes a double reduction phi and if
2077 : : the other phi in the reduction is also relevant for vectorization.
2078 : : This rejects cases such as:
2079 : :
2080 : : outer1:
2081 : : x_1 = PHI <x_3(outer2), ...>;
2082 : : ...
2083 : :
2084 : : inner:
2085 : : x_2 = ...;
2086 : : ...
2087 : :
2088 : : outer2:
2089 : : x_3 = PHI <x_2(inner)>;
2090 : :
2091 : : if nothing in x_2 or elsewhere makes x_1 relevant. */
2092 : :
2093 : : static bool
2094 : 161 : vect_active_double_reduction_p (stmt_vec_info stmt_info)
2095 : : {
2096 : 161 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2097 : : return false;
2098 : :
2099 : 0 : return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
2100 : : }
2101 : :
2102 : : /* Function vect_analyze_loop_operations.
2103 : :
2104 : : Scan the loop stmts and make sure they are all vectorizable. */
2105 : :
2106 : : static opt_result
2107 : 113091 : vect_analyze_loop_operations (loop_vec_info loop_vinfo)
2108 : : {
2109 : 113091 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2110 : 113091 : basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2111 : 113091 : int nbbs = loop->num_nodes;
2112 : 113091 : int i;
2113 : 113091 : stmt_vec_info stmt_info;
2114 : 113091 : bool need_to_vectorize = false;
2115 : 113091 : bool ok;
2116 : :
2117 : 113091 : DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
2118 : :
2119 : 113091 : auto_vec<stmt_info_for_cost> cost_vec;
2120 : :
2121 : 340978 : for (i = 0; i < nbbs; i++)
2122 : : {
2123 : 229251 : basic_block bb = bbs[i];
2124 : :
2125 : 554090 : for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
2126 : 324839 : gsi_next (&si))
2127 : : {
2128 : 325593 : gphi *phi = si.phi ();
2129 : 325593 : ok = true;
2130 : :
2131 : 325593 : stmt_info = loop_vinfo->lookup_stmt (phi);
2132 : 325593 : if (dump_enabled_p ())
2133 : 39656 : dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
2134 : : (gimple *) phi);
2135 : 651186 : if (virtual_operand_p (gimple_phi_result (phi)))
2136 : 89072 : continue;
2137 : :
2138 : : /* Inner-loop loop-closed exit phi in outer-loop vectorization
2139 : : (i.e., a phi in the tail of the outer-loop). */
2140 : 236521 : if (! is_loop_header_bb_p (bb))
2141 : : {
2142 : : /* FORNOW: we currently don't support the case that these phis
    2143                 :            :              are not used in the outer loop (unless it is a double reduction,
    2144                 :            :              i.e., this phi is vect_reduction_def), because this case would
    2145                 :            :              require us to actually do something here.  */
2146 : 753 : if (STMT_VINFO_LIVE_P (stmt_info)
2147 : 860 : && !vect_active_double_reduction_p (stmt_info))
2148 : 54 : return opt_result::failure_at (phi,
2149 : : "Unsupported loop-closed phi"
2150 : : " in outer-loop.\n");
2151 : :
2152 : : /* If PHI is used in the outer loop, we check that its operand
2153 : : is defined in the inner loop. */
2154 : 699 : if (STMT_VINFO_RELEVANT_P (stmt_info))
2155 : : {
2156 : 695 : tree phi_op;
2157 : :
2158 : 695 : if (gimple_phi_num_args (phi) != 1)
2159 : 0 : return opt_result::failure_at (phi, "unsupported phi");
2160 : :
2161 : 695 : phi_op = PHI_ARG_DEF (phi, 0);
2162 : 695 : stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
2163 : 695 : if (!op_def_info)
2164 : 0 : return opt_result::failure_at (phi, "unsupported phi\n");
2165 : :
2166 : 695 : if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
2167 : 695 : && (STMT_VINFO_RELEVANT (op_def_info)
2168 : : != vect_used_in_outer_by_reduction))
2169 : 236 : return opt_result::failure_at (phi, "unsupported phi\n");
2170 : :
2171 : 459 : if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2172 : 107 : || (STMT_VINFO_DEF_TYPE (stmt_info)
2173 : : == vect_double_reduction_def))
2174 : 459 : && ! PURE_SLP_STMT (stmt_info)
2175 : 459 : && !vectorizable_lc_phi (loop_vinfo,
2176 : : stmt_info, NULL, NULL))
2177 : 0 : return opt_result::failure_at (phi, "unsupported phi\n");
2178 : : }
2179 : :
2180 : 463 : continue;
2181 : 463 : }
2182 : :
2183 : 235768 : gcc_assert (stmt_info);
2184 : :
2185 : 235768 : if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
2186 : 214117 : || STMT_VINFO_LIVE_P (stmt_info))
2187 : 25384 : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
2188 : 194 : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
2189 : : /* A scalar-dependence cycle that we don't support. */
2190 : 10 : return opt_result::failure_at (phi,
2191 : : "not vectorized:"
2192 : : " scalar dependence cycle.\n");
2193 : :
2194 : 235758 : if (STMT_VINFO_RELEVANT_P (stmt_info))
2195 : : {
2196 : 74194 : need_to_vectorize = true;
2197 : 74194 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
2198 : 30849 : && ! PURE_SLP_STMT (stmt_info))
2199 : 1058 : ok = vectorizable_induction (loop_vinfo,
2200 : : stmt_info, NULL, NULL,
2201 : : &cost_vec);
2202 : 73136 : else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2203 : : || (STMT_VINFO_DEF_TYPE (stmt_info)
2204 : : == vect_double_reduction_def)
2205 : 73136 : || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
2206 : 43137 : && ! PURE_SLP_STMT (stmt_info))
2207 : 54 : ok = vectorizable_reduction (loop_vinfo,
2208 : : stmt_info, NULL, NULL, &cost_vec);
2209 : 73082 : else if ((STMT_VINFO_DEF_TYPE (stmt_info)
2210 : : == vect_first_order_recurrence)
2211 : 208 : && ! PURE_SLP_STMT (stmt_info))
2212 : 0 : ok = vectorizable_recurr (loop_vinfo, stmt_info, NULL, NULL,
2213 : : &cost_vec);
2214 : : }
2215 : :
2216 : : /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
2217 : 1112 : if (ok
2218 : 235304 : && STMT_VINFO_LIVE_P (stmt_info)
2219 : 4651 : && !PURE_SLP_STMT (stmt_info))
2220 : 548 : ok = vectorizable_live_operation (loop_vinfo, stmt_info, NULL, NULL,
2221 : : -1, false, &cost_vec);
2222 : :
2223 : 235304 : if (!ok)
2224 : 454 : return opt_result::failure_at (phi,
2225 : : "not vectorized: relevant phi not "
2226 : : "supported: %G",
2227 : : static_cast <gimple *> (phi));
2228 : : }
2229 : :
2230 : 1999958 : for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2231 : 1542964 : gsi_next (&si))
2232 : : {
2233 : 1543574 : gimple *stmt = gsi_stmt (si);
2234 : 1543574 : if (!gimple_clobber_p (stmt)
2235 : 1543574 : && !is_gimple_debug (stmt))
2236 : : {
2237 : 1235033 : opt_result res
2238 : 1235033 : = vect_analyze_stmt (loop_vinfo,
2239 : : loop_vinfo->lookup_stmt (stmt),
2240 : : &need_to_vectorize,
2241 : : NULL, NULL, &cost_vec);
2242 : 1235033 : if (!res)
2243 : 610 : return res;
2244 : : }
2245 : : }
2246 : : } /* bbs */
2247 : :
2248 : 111727 : add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
2249 : :
    2250                 :            :   /* All operations in the loop are either irrelevant (they deal with loop
    2251                 :            :      control, or are dead), or are only used outside the loop and can be moved
2252 : : out of the loop (e.g. invariants, inductions). The loop can be
2253 : : optimized away by scalar optimizations. We're better off not
2254 : : touching this loop. */
2255 : 111727 : if (!need_to_vectorize)
2256 : : {
2257 : 0 : if (dump_enabled_p ())
2258 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
2259 : : "All the computation can be taken out of the loop.\n");
2260 : 0 : return opt_result::failure_at
2261 : 0 : (vect_location,
2262 : : "not vectorized: redundant loop. no profit to vectorize.\n");
2263 : : }
2264 : :
2265 : 111727 : return opt_result::success ();
2266 : 113091 : }
2267 : :
2268 : : /* Return true if we know that the iteration count is smaller than the
2269 : : vectorization factor. Return false if it isn't, or if we can't be sure
2270 : : either way. */
2271 : :
2272 : : static bool
2273 : 100285 : vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
2274 : : {
2275 : 100285 : unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2276 : :
2277 : 100285 : HOST_WIDE_INT max_niter;
2278 : 100285 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2279 : 48437 : max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
2280 : : else
2281 : 51848 : max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2282 : :
2283 : 100285 : if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
2284 : 7761 : return true;
2285 : :
2286 : : return false;
2287 : : }
2288 : :
2289 : : /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
2290 : : is worthwhile to vectorize. Return 1 if definitely yes, 0 if
2291 : : definitely no, or -1 if it's worth retrying. */
2292 : :
2293 : : static int
2294 : 100291 : vect_analyze_loop_costing (loop_vec_info loop_vinfo,
2295 : : unsigned *suggested_unroll_factor)
2296 : : {
2297 : 100291 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2298 : 100291 : unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2299 : :
2300 : : /* Only loops that can handle partially-populated vectors can have iteration
2301 : : counts less than the vectorization factor. */
2302 : 100291 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2303 : 100291 : && vect_known_niters_smaller_than_vf (loop_vinfo))
2304 : : {
2305 : 7753 : if (dump_enabled_p ())
2306 : 214 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2307 : : "not vectorized: iteration count smaller than "
2308 : : "vectorization factor.\n");
2309 : 7753 : return 0;
2310 : : }
2311 : :
    2312                 :            :   /* If we know the number of iterations we can do better: for the
    2313                 :            :      epilogue we can also decide whether the main loop leaves us
    2314                 :            :      with enough iterations, preferring a smaller vector epilogue that is
    2315                 :            :      then also possibly used for the case in which we skip the vector loop.  */
2316 : 92538 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2317 : : {
2318 : 40983 : widest_int scalar_niters
2319 : 40983 : = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
2320 : 40983 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2321 : : {
2322 : 2460 : loop_vec_info orig_loop_vinfo
2323 : : = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2324 : 2460 : loop_vec_info main_loop_vinfo
2325 : : = LOOP_VINFO_MAIN_LOOP_INFO (loop_vinfo);
2326 : 2460 : unsigned lowest_vf
2327 : 2460 : = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
2328 : 2460 : int prolog_peeling = 0;
2329 : 2460 : if (!vect_use_loop_mask_for_alignment_p (main_loop_vinfo))
2330 : 2460 : prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (main_loop_vinfo);
2331 : 2460 : if (prolog_peeling >= 0
2332 : 2460 : && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
2333 : : lowest_vf))
2334 : : {
2335 : 4906 : unsigned gap
2336 : 2453 : = LOOP_VINFO_PEELING_FOR_GAPS (main_loop_vinfo) ? 1 : 0;
2337 : 4906 : scalar_niters = ((scalar_niters - gap - prolog_peeling)
2338 : 4906 : % lowest_vf + gap);
2339 : : }
2340 : : }
2341 : : /* Reject vectorizing for a single scalar iteration, even if
2342 : : we could in principle implement that using partial vectors. */
2343 : 40983 : unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2344 : 40983 : if (scalar_niters <= peeling_gap + 1)
2345 : : {
2346 : 686 : if (dump_enabled_p ())
2347 : 162 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2348 : : "not vectorized: loop only has a single "
2349 : : "scalar iteration.\n");
2350 : 686 : return 0;
2351 : : }
2352 : :
2353 : 40297 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2354 : : {
2355 : : /* Check that the loop processes at least one full vector. */
2356 : 40288 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2357 : 40288 : if (known_lt (scalar_niters, vf))
2358 : : {
2359 : 347 : if (dump_enabled_p ())
2360 : 290 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2361 : : "loop does not have enough iterations "
2362 : : "to support vectorization.\n");
2363 : 385 : return 0;
2364 : : }
2365 : :
2366 : : /* If we need to peel an extra epilogue iteration to handle data
2367 : : accesses with gaps, check that there are enough scalar iterations
2368 : : available.
2369 : :
2370 : : The check above is redundant with this one when peeling for gaps,
2371 : : but the distinction is useful for diagnostics. */
2372 : 39941 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2373 : 40249 : && known_le (scalar_niters, vf))
2374 : : {
2375 : 38 : if (dump_enabled_p ())
2376 : 9 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2377 : : "loop does not have enough iterations "
2378 : : "to support peeling for gaps.\n");
2379 : 38 : return 0;
2380 : : }
2381 : : }
2382 : 40983 : }
2383 : :
    2384                 :            :   /* If using the "very cheap" model, reject cases in which we'd keep
2385 : : a copy of the scalar code (even if we might be able to vectorize it). */
2386 : 91467 : if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2387 : 91467 : && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2388 : 45152 : || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)))
2389 : : {
2390 : 676 : if (dump_enabled_p ())
2391 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2392 : : "some scalar iterations would need to be peeled\n");
2393 : 676 : return 0;
2394 : : }
2395 : :
2396 : 90791 : int min_profitable_iters, min_profitable_estimate;
2397 : 90791 : vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2398 : : &min_profitable_estimate,
2399 : : suggested_unroll_factor);
2400 : :
2401 : 90791 : if (min_profitable_iters < 0)
2402 : : {
2403 : 23792 : if (dump_enabled_p ())
2404 : 16 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2405 : : "not vectorized: vectorization not profitable.\n");
2406 : 23792 : if (dump_enabled_p ())
2407 : 16 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2408 : : "not vectorized: vector version will never be "
2409 : : "profitable.\n");
2410 : 23792 : return -1;
2411 : : }
2412 : :
2413 : 66999 : int min_scalar_loop_bound = (param_min_vect_loop_bound
2414 : 66999 : * assumed_vf);
2415 : :
2416 : : /* Use the cost model only if it is more conservative than user specified
    2417                 :            :   /* Use the cost model only if it is more conservative than the user-specified
2418 : 66999 : unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2419 : : min_profitable_iters);
2420 : :
2421 : 66999 : LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2422 : :
2423 : 34259 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2424 : 101258 : && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2425 : : {
2426 : 293 : if (dump_enabled_p ())
2427 : 1 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2428 : : "not vectorized: vectorization not profitable.\n");
2429 : 293 : if (dump_enabled_p ())
2430 : 1 : dump_printf_loc (MSG_NOTE, vect_location,
2431 : : "not vectorized: iteration count smaller than user "
2432 : : "specified loop bound parameter or minimum profitable "
2433 : : "iterations (whichever is more conservative).\n");
2434 : 293 : return 0;
2435 : : }
2436 : :
    2437                 :            :   /* The static profitability threshold min_profitable_estimate includes
2438 : : the cost of having to check at runtime whether the scalar loop
2439 : : should be used instead. If it turns out that we don't need or want
2440 : : such a check, the threshold we should use for the static estimate
2441 : : is simply the point at which the vector loop becomes more profitable
2442 : : than the scalar loop. */
2443 : 66706 : if (min_profitable_estimate > min_profitable_iters
2444 : 13520 : && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
2445 : 13100 : && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2446 : 259 : && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2447 : 66965 : && !vect_apply_runtime_profitability_check_p (loop_vinfo))
2448 : : {
2449 : 3 : if (dump_enabled_p ())
2450 : 0 : dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
2451 : : " choice between the scalar and vector loops\n");
2452 : 3 : min_profitable_estimate = min_profitable_iters;
2453 : : }
2454 : :
2455 : : /* If the vector loop needs multiple iterations to be beneficial then
2456 : : things are probably too close to call, and the conservative thing
2457 : : would be to stick with the scalar code. */
2458 : 66706 : if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2459 : 66706 : && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
2460 : : {
2461 : 7381 : if (dump_enabled_p ())
2462 : 177 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2463 : : "one iteration of the vector loop would be"
2464 : : " more expensive than the equivalent number of"
2465 : : " iterations of the scalar loop\n");
2466 : 7381 : return 0;
2467 : : }
2468 : :
2469 : 59325 : HOST_WIDE_INT estimated_niter;
2470 : :
2471 : : /* If we are vectorizing an epilogue then we know the maximum number of
2472 : : scalar iterations it will cover is at least one lower than the
2473 : : vectorization factor of the main loop. */
2474 : 59325 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2475 : 10106 : estimated_niter
2476 : 10106 : = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
2477 : : else
2478 : : {
2479 : 49219 : estimated_niter = estimated_stmt_executions_int (loop);
2480 : 49219 : if (estimated_niter == -1)
2481 : 18534 : estimated_niter = likely_max_stmt_executions_int (loop);
2482 : : }
2483 : 28640 : if (estimated_niter != -1
2484 : 57777 : && ((unsigned HOST_WIDE_INT) estimated_niter
2485 : 57777 : < MAX (th, (unsigned) min_profitable_estimate)))
2486 : : {
2487 : 4183 : if (dump_enabled_p ())
2488 : 28 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2489 : : "not vectorized: estimated iteration count too "
2490 : : "small.\n");
2491 : 4183 : if (dump_enabled_p ())
2492 : 28 : dump_printf_loc (MSG_NOTE, vect_location,
2493 : : "not vectorized: estimated iteration count smaller "
2494 : : "than specified loop bound parameter or minimum "
2495 : : "profitable iterations (whichever is more "
2496 : : "conservative).\n");
2497 : 4183 : return -1;
2498 : : }
2499 : :
2500 : : return 1;
2501 : : }
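The runtime threshold computed above is the more conservative of the user bound scaled by the assumed VF and the cost model's minimum profitable iteration count; for example, with param_min_vect_loop_bound 0, an assumed VF of 4 and min_profitable_iters 7, the vector loop is only entered when at least 7 scalar iterations remain. A hedged scalar model of that combination (illustrative only):

/* Illustrative sketch only: mirrors the MAX used for
   LOOP_VINFO_COST_MODEL_THRESHOLD above.  */
static unsigned
cost_model_threshold (unsigned min_vect_loop_bound, unsigned assumed_vf,
                      int min_profitable_iters)
{
  unsigned user_th = min_vect_loop_bound * assumed_vf;
  unsigned model_th
    = min_profitable_iters < 0 ? 0 : (unsigned) min_profitable_iters;
  return user_th > model_th ? user_th : model_th;
}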
2502 : :
2503 : : static opt_result
2504 : 202655 : vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
2505 : : vec<data_reference_p> *datarefs)
2506 : : {
2507 : 616322 : for (unsigned i = 0; i < loop->num_nodes; i++)
2508 : 908156 : for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2509 : 3287651 : !gsi_end_p (gsi); gsi_next (&gsi))
2510 : : {
2511 : 2873984 : gimple *stmt = gsi_stmt (gsi);
2512 : 2873984 : if (is_gimple_debug (stmt))
2513 : 929351 : continue;
2514 : 1944767 : opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
2515 : : NULL, 0);
2516 : 1944767 : if (!res)
2517 : : {
2518 : 40545 : if (is_gimple_call (stmt) && loop->safelen)
2519 : : {
2520 : 400 : tree fndecl = gimple_call_fndecl (stmt), op;
2521 : 400 : if (fndecl == NULL_TREE
2522 : 400 : && gimple_call_internal_p (stmt, IFN_MASK_CALL))
2523 : : {
2524 : 0 : fndecl = gimple_call_arg (stmt, 0);
2525 : 0 : gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
2526 : 0 : fndecl = TREE_OPERAND (fndecl, 0);
2527 : 0 : gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
2528 : : }
2529 : 400 : if (fndecl != NULL_TREE)
2530 : : {
2531 : 366 : cgraph_node *node = cgraph_node::get (fndecl);
2532 : 366 : if (node != NULL && node->simd_clones != NULL)
2533 : : {
2534 : 135 : unsigned int j, n = gimple_call_num_args (stmt);
2535 : 557 : for (j = 0; j < n; j++)
2536 : : {
2537 : 288 : op = gimple_call_arg (stmt, j);
2538 : 288 : if (DECL_P (op)
2539 : 288 : || (REFERENCE_CLASS_P (op)
2540 : 0 : && get_base_address (op)))
2541 : : break;
2542 : : }
2543 : 135 : op = gimple_call_lhs (stmt);
2544 : : /* Ignore #pragma omp declare simd functions
2545 : : if they don't have data references in the
2546 : : call stmt itself. */
2547 : 269 : if (j == n
2548 : 135 : && !(op
2549 : 124 : && (DECL_P (op)
2550 : 124 : || (REFERENCE_CLASS_P (op)
2551 : 0 : && get_base_address (op)))))
2552 : 134 : continue;
2553 : : }
2554 : : }
2555 : : }
2556 : 40411 : return res;
2557 : : }
2558 : : /* If dependence analysis will give up due to the limit on the
    2559                 :            :       /* If dependence analysis will give up due to the limit on the
2560 : 3334633 : if (datarefs->length ()
2561 : 1430411 : > (unsigned)param_loop_max_datarefs_for_datadeps)
2562 : 0 : return opt_result::failure_at (stmt, "exceeded param "
2563 : : "loop-max-datarefs-for-datadeps\n");
2564 : : }
2565 : 162244 : return opt_result::success ();
2566 : : }
2567 : :
2568 : : /* Look for SLP-only access groups and turn each individual access into its own
2569 : : group. */
2570 : : static void
2571 : 113091 : vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2572 : : {
2573 : 113091 : unsigned int i;
2574 : 113091 : struct data_reference *dr;
2575 : :
2576 : 113091 : DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2577 : :
2578 : 113091 : vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2579 : 522931 : FOR_EACH_VEC_ELT (datarefs, i, dr)
2580 : : {
2581 : 301576 : gcc_assert (DR_REF (dr));
2582 : 301576 : stmt_vec_info stmt_info
2583 : 301576 : = vect_stmt_to_vectorize (loop_vinfo->lookup_stmt (DR_STMT (dr)));
2584 : :
2585 : : /* Check if the load is a part of an interleaving chain. */
2586 : 301576 : if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2587 : : {
2588 : 97460 : stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2589 : 97460 : dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2590 : 97460 : unsigned int group_size = DR_GROUP_SIZE (first_element);
2591 : :
2592 : : /* Check if SLP-only groups. */
2593 : 97460 : if (!STMT_SLP_TYPE (stmt_info)
2594 : 162 : && STMT_VINFO_SLP_VECT_ONLY (first_element))
2595 : : {
2596 : : /* Dissolve the group. */
2597 : 12 : STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2598 : :
2599 : 12 : stmt_vec_info vinfo = first_element;
2600 : 30 : while (vinfo)
2601 : : {
2602 : 18 : stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2603 : 18 : DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2604 : 18 : DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2605 : 18 : DR_GROUP_SIZE (vinfo) = 1;
2606 : 18 : if (STMT_VINFO_STRIDED_P (first_element)
2607 : : /* We cannot handle stores with gaps. */
2608 : 12 : || DR_IS_WRITE (dr_info->dr))
2609 : : {
2610 : 6 : STMT_VINFO_STRIDED_P (vinfo) = true;
2611 : 6 : DR_GROUP_GAP (vinfo) = 0;
2612 : : }
2613 : : else
2614 : 12 : DR_GROUP_GAP (vinfo) = group_size - 1;
2615 : : /* Duplicate and adjust alignment info, it needs to
2616 : : be present on each group leader, see dr_misalignment. */
2617 : 18 : if (vinfo != first_element)
2618 : : {
2619 : 6 : dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2620 : 6 : dr_info2->target_alignment = dr_info->target_alignment;
2621 : 6 : int misalignment = dr_info->misalignment;
2622 : 6 : if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2623 : : {
2624 : 0 : HOST_WIDE_INT diff
2625 : 0 : = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2626 : 0 : - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2627 : 0 : unsigned HOST_WIDE_INT align_c
2628 : 0 : = dr_info->target_alignment.to_constant ();
2629 : 0 : misalignment = (misalignment + diff) % align_c;
2630 : : }
2631 : 6 : dr_info2->misalignment = misalignment;
2632 : : }
2633 : : vinfo = next;
2634 : : }
2635 : : }
2636 : : }
2637 : : }
2638 : 113091 : }
2639 : :
2640 : : /* Determine if operating on full vectors for LOOP_VINFO might leave
2641 : : some scalar iterations still to do. If so, decide how we should
2642 : : handle those scalar iterations. The possibilities are:
2643 : :
2644 : : (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2645 : : In this case:
2646 : :
2647 : : LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2648 : : LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2649 : : LOOP_VINFO_PEELING_FOR_NITER == false
2650 : :
2651 : : (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2652 : : to handle the remaining scalar iterations. In this case:
2653 : :
2654 : : LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2655 : : LOOP_VINFO_PEELING_FOR_NITER == true
2656 : :
2657 : : There are two choices:
2658 : :
2659 : : (2a) Consider vectorizing the epilogue loop at the same VF as the
2660 : : main loop, but using partial vectors instead of full vectors.
2661 : : In this case:
2662 : :
2663 : : LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2664 : :
2665 : : (2b) Consider vectorizing the epilogue loop at lower VFs only.
2666 : : In this case:
2667 : :
2668 : : LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2669 : : */
2670 : :
2671 : : opt_result
2672 : 118096 : vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo)
2673 : : {
2674 : : /* Determine whether there would be any scalar iterations left over. */
2675 : 118096 : bool need_peeling_or_partial_vectors_p
2676 : 118096 : = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2677 : :
2678 : : /* Decide whether to vectorize the loop with partial vectors. */
2679 : 118096 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2680 : 118096 : LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2681 : 118096 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2682 : 19 : && LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo))
2683 : 0 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2684 : 118096 : else if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2685 : 19 : && need_peeling_or_partial_vectors_p)
2686 : : {
2687 : : /* For partial-vector-usage=1, try to push the handling of partial
2688 : : vectors to the epilogue, with the main loop continuing to operate
2689 : : on full vectors.
2690 : :
2691 : : If we are unrolling we also do not want to use partial vectors. This
2692 : : is to avoid the overhead of generating multiple masks and also to
2693 : : avoid having to execute entire iterations of FALSE masked instructions
    2694                 :            :      when dealing with one or fewer full iterations.
2695 : :
2696 : : ??? We could then end up failing to use partial vectors if we
2697 : : decide to peel iterations into a prologue, and if the main loop
2698 : : then ends up processing fewer than VF iterations. */
2699 : 14 : if ((param_vect_partial_vector_usage == 1
2700 : 6 : || loop_vinfo->suggested_unroll_factor > 1)
2701 : 8 : && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2702 : 22 : && !vect_known_niters_smaller_than_vf (loop_vinfo))
2703 : 0 : LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2704 : : else
2705 : 14 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2706 : : }
2707 : :
2708 : 118096 : if (LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo)
2709 : 0 : && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2710 : 0 : return opt_result::failure_at (vect_location,
2711 : : "not vectorized: loop needs but cannot "
2712 : : "use partial vectors\n");
2713 : :
2714 : 118096 : if (dump_enabled_p ())
2715 : 14164 : dump_printf_loc (MSG_NOTE, vect_location,
2716 : : "operating on %s vectors%s.\n",
2717 : 14164 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2718 : : ? "partial" : "full",
2719 : 14164 : LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2720 : : ? " for epilogue loop" : "");
2721 : :
2722 : 118096 : LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2723 : 236192 : = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2724 : 118096 : && need_peeling_or_partial_vectors_p);
2725 : :
    2726                 :            :   /* We set LOOP_VINFO_USING_SELECT_VL_P to true before the loop vectorization
    2727                 :            :      analysis, when we don't yet know whether the loop will be vectorized with
    2728                 :            :      partial vectors (for more details see tree-vect-loop-manip.cc).
    2729                 :            : 
    2730                 :            :      However, the SELECT_VL vectorization style should only be applied to
    2731                 :            :      partial vectorization, since SELECT_VL is the GIMPLE IR that calculates
    2732                 :            :      the number of elements to be processed in each iteration.
    2733                 :            : 
    2734                 :            :      So after the loop vectorization analysis, clear LOOP_VINFO_USING_SELECT_VL_P
    2735                 :            :      if the loop is not vectorized with partial vectors.  */
2736 : 118096 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2737 : 118082 : LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
2738 : :
2739 : 118096 : return opt_result::success ();
2740 : : }
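The two strategies distinguished above can be seen on a simple copy loop with an assumed VF of 4 and 10 scalar iterations: partial vectors (case 1) run three controlled vector iterations, while full vectors with peeling (case 2) run two full vector iterations and leave two scalar iterations for the epilogue, which case (2a) would itself vectorize with partial vectors. A scalar model of case (2), illustrative only:

#include <stddef.h>

/* Illustrative sketch only: main loop on full vectors plus a scalar
   epilogue for the remaining n % vf iterations.  */
static void
full_vectors_with_epilogue (int *dst, const int *src, size_t n)
{
  const size_t vf = 4;                       /* assumed vectorization factor  */
  size_t i = 0;
  for (; i + vf <= n; i += vf)               /* main loop, full vectors       */
    for (size_t j = 0; j < vf; j++)
      dst[i + j] = src[i + j];
  for (; i < n; i++)                         /* epilogue, here left scalar    */
    dst[i] = src[i];
}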
2741 : :
2742 : : /* Function vect_analyze_loop_2.
2743 : :
2744 : : Apply a set of analyses on LOOP specified by LOOP_VINFO, the different
    2745                 :            :    Apply a set of analyses on the LOOP specified by LOOP_VINFO; the different
    2746                 :            :    analyses will record information in some members of LOOP_VINFO.  FATAL
    2747                 :            :    indicates whether some analysis hit a fatal error.  If the non-NULL pointer
    2748                 :            :    SUGGESTED_UNROLL_FACTOR is provided, it is intended to be filled with the
    2749                 :            :    worked-out suggested unroll factor, while a NULL pointer means we are
    2750                 :            :    going to apply the suggested unroll factor.  SLP_DONE_FOR_SUGGESTED_UF
    2751                 :            :    holds the SLP decision made when the suggested unroll factor was worked
    2752                 :            :    out.  */
2753 : 389463 : vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2754 : : unsigned *suggested_unroll_factor,
2755 : : unsigned& slp_done_for_suggested_uf)
2756 : : {
2757 : 389463 : opt_result ok = opt_result::success ();
2758 : 389463 : int res;
2759 : 389463 : unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2760 : 389463 : poly_uint64 min_vf = 2;
2761 : 389463 : loop_vec_info orig_loop_vinfo = NULL;
2762 : :
2763 : : /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2764 : : loop_vec_info of the first vectorized loop. */
2765 : 389463 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2766 : 26768 : orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2767 : : else
2768 : : orig_loop_vinfo = loop_vinfo;
2769 : 26768 : gcc_assert (orig_loop_vinfo);
2770 : :
2771 : : /* The first group of checks is independent of the vector size. */
2772 : 389463 : fatal = true;
2773 : :
2774 : 389463 : if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2775 : 389463 : && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2776 : 5 : return opt_result::failure_at (vect_location,
2777 : : "not vectorized: simd if(0)\n");
2778 : :
2779 : : /* Find all data references in the loop (which correspond to vdefs/vuses)
2780 : : and analyze their evolution in the loop. */
2781 : :
2782 : 389458 : loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2783 : :
2784 : : /* Gather the data references and count stmts in the loop. */
2785 : 389458 : if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2786 : : {
2787 : 202655 : opt_result res
2788 : 202655 : = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2789 : : &LOOP_VINFO_DATAREFS (loop_vinfo));
2790 : 202655 : if (!res)
2791 : : {
2792 : 40411 : if (dump_enabled_p ())
2793 : 1465 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2794 : : "not vectorized: loop contains function "
2795 : : "calls or data references that cannot "
2796 : : "be analyzed\n");
2797 : 40411 : return res;
2798 : : }
2799 : 162244 : loop_vinfo->shared->save_datarefs ();
2800 : : }
2801 : : else
2802 : 186803 : loop_vinfo->shared->check_datarefs ();
2803 : :
2804 : : /* Analyze the data references and also adjust the minimal
2805 : : vectorization factor according to the loads and stores. */
2806 : :
2807 : 349047 : ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2808 : 349047 : if (!ok)
2809 : : {
2810 : 49986 : if (dump_enabled_p ())
2811 : 929 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2812 : : "bad data references.\n");
2813 : 49986 : return ok;
2814 : : }
2815 : :
2816 : : /* Check if we are applying unroll factor now. */
2817 : 299061 : bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2818 : 299061 : gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2819 : :
2820 : : /* If the slp decision is false when suggested unroll factor is worked
2821 : : out, and we are applying suggested unroll factor, we can simply skip
2822 : : all slp related analyses this time. */
2823 : 299061 : unsigned slp = !applying_suggested_uf ? 2 : slp_done_for_suggested_uf;
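 : : /* SLP here acts as a level rather than a flag: 2 tries multi-lane SLP,
 : : 1 restricts the analysis to single-lane SLP and 0 disables SLP; the
 : : retry path at the "again" label below degrades it one step at a time. */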
2824 : :
2825 : : /* Classify all cross-iteration scalar data-flow cycles.
2826 : : Cross-iteration cycles caused by virtual phis are analyzed separately. */
2827 : 299061 : vect_analyze_scalar_cycles (loop_vinfo, slp == 2);
2828 : :
2829 : 299061 : vect_pattern_recog (loop_vinfo);
2830 : :
2831 : 299061 : vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2832 : :
2833 : : /* Analyze the access patterns of the data-refs in the loop (consecutive,
2834 : : complex, etc.). FORNOW: Only handle consecutive access pattern. */
2835 : :
2836 : 299061 : ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2837 : 299061 : if (!ok)
2838 : : {
2839 : 5924 : if (dump_enabled_p ())
2840 : 255 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2841 : : "bad data access.\n");
2842 : 5924 : return ok;
2843 : : }
2844 : :
2845 : : /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2846 : :
2847 : 293137 : ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2848 : 293137 : if (!ok)
2849 : : {
2850 : 11972 : if (dump_enabled_p ())
2851 : 319 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2852 : : "unexpected pattern.\n");
2853 : 11972 : return ok;
2854 : : }
2855 : :
2856 : : /* The rest of the analysis below depends on the vector size in some way, so failures from here on are not fatal. */
2857 : 281165 : fatal = false;
2858 : :
2859 : : /* Analyze data dependences between the data-refs in the loop
2860 : : and adjust the maximum vectorization factor according to
2861 : : the dependences.
2862 : : FORNOW: fail at the first data dependence that we encounter. */
2863 : :
2864 : 281165 : ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2865 : 281165 : if (!ok)
2866 : : {
2867 : 20207 : if (dump_enabled_p ())
2868 : 368 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2869 : : "bad data dependence.\n");
2870 : 20207 : return ok;
2871 : : }
2872 : 260958 : if (max_vf != MAX_VECTORIZATION_FACTOR
2873 : 260958 : && maybe_lt (max_vf, min_vf))
2874 : 48 : return opt_result::failure_at (vect_location, "bad data dependence.\n");
2875 : 260910 : LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2876 : :
2877 : 260910 : ok = vect_determine_vectorization_factor (loop_vinfo);
2878 : 260910 : if (!ok)
2879 : : {
2880 : 48569 : if (dump_enabled_p ())
2881 : 748 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2882 : : "can't determine vectorization factor.\n");
2883 : 48569 : return ok;
2884 : : }
2885 : :
2886 : : /* Compute the scalar iteration cost. */
2887 : 212341 : vect_compute_single_scalar_iteration_cost (loop_vinfo);
2888 : :
2889 : 212341 : poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2890 : 212341 : bool saved_can_use_partial_vectors_p
2891 : : = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2892 : :
2893 : : /* This is the point from which we can re-start the analysis with SLP degraded or forced off. */
2894 : 430800 : start_over:
2895 : :
2896 : 430800 : if (slp)
2897 : : {
2898 : : /* Check the SLP opportunities in the loop, analyze and build
2899 : : SLP trees. */
2900 : 643726 : ok = vect_analyze_slp (loop_vinfo, loop_vinfo->stmt_vec_infos.length (),
2901 : : slp == 1);
2902 : 321863 : if (!ok)
2903 : 0 : return ok;
2904 : :
2905 : : /* If there are any SLP instances mark them as pure_slp. */
2906 : 321863 : if (vect_make_slp_decision (loop_vinfo))
2907 : : {
2908 : : /* Find stmts that need to be both vectorized and SLPed. */
2909 : 316880 : vect_detect_hybrid_slp (loop_vinfo);
2910 : :
2911 : : /* Update the vectorization factor based on the SLP decision. */
2912 : 316880 : vect_update_vf_for_slp (loop_vinfo);
2913 : :
2914 : : /* Optimize the SLP graph with the vectorization factor fixed. */
2915 : 316880 : vect_optimize_slp (loop_vinfo);
2916 : :
2917 : : /* Gather the loads reachable from the SLP graph entries. */
2918 : 316880 : vect_gather_slp_loads (loop_vinfo);
2919 : : }
2920 : : }
2921 : :
2922 : : /* We don't expect to have to roll back to anything other than an empty
2923 : : set of rgroups. */
2924 : 430800 : gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2925 : :
2926 : : /* When we arrive here with SLP disabled and we are supposed
2927 : : to use SLP for everything, fail vectorization. */
2928 : 430800 : if (!slp && param_vect_force_slp)
2929 : 108937 : return opt_result::failure_at (vect_location,
2930 : : "may need non-SLP handling\n");
2931 : :
2932 : : /* Apply the suggested unroll factor; this was determined by the backend
2933 : : during finish_cost the first time we ran the analysis for this
2934 : : vector mode. */
2935 : 321863 : if (applying_suggested_uf)
2936 : 0 : LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2937 : :
2938 : : /* Now the vectorization factor is final. */
2939 : 321863 : poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2940 : 321863 : gcc_assert (known_ne (vectorization_factor, 0U));
2941 : :
2942 : 321863 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2943 : : {
2944 : 14634 : dump_printf_loc (MSG_NOTE, vect_location,
2945 : : "vectorization_factor = ");
2946 : 14634 : dump_dec (MSG_NOTE, vectorization_factor);
2947 : 14634 : dump_printf (MSG_NOTE, ", niters = %wd\n",
2948 : 14634 : LOOP_VINFO_INT_NITERS (loop_vinfo));
2949 : : }
2950 : :
2951 : 321863 : if (max_vf != MAX_VECTORIZATION_FACTOR
2952 : 321863 : && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2953 : 1 : return opt_result::failure_at (vect_location, "bad data dependence.\n");
2954 : :
2955 : 321862 : loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2956 : :
2957 : : /* Analyze the alignment of the data-refs in the loop.
2958 : : Fail if a data reference is found that cannot be vectorized. */
2959 : :
2960 : 321862 : ok = vect_analyze_data_refs_alignment (loop_vinfo);
2961 : 321862 : if (!ok)
2962 : : {
2963 : 7253 : if (dump_enabled_p ())
2964 : 4 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2965 : : "bad data alignment.\n");
2966 : 7253 : return ok;
2967 : : }
2968 : :
2969 : : /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2970 : : It is important to call pruning after vect_analyze_data_ref_accesses,
2971 : : since we use grouping information gathered by interleaving analysis. */
2972 : 314609 : ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2973 : 314609 : if (!ok)
2974 : 11418 : return ok;
2975 : :
2976 : : /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2977 : : vectorization, since we do not want to add extra peeling or
2978 : : add versioning for alignment. */
2979 : 303191 : if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2980 : : /* This pass will decide on using loop versioning and/or loop peeling in
2981 : : order to enhance the alignment of data references in the loop. */
2982 : 277138 : ok = vect_enhance_data_refs_alignment (loop_vinfo);
2983 : 303191 : if (!ok)
2984 : 0 : return ok;
2985 : :
2986 : 303191 : if (slp)
2987 : : {
2988 : : /* Analyze operations in the SLP instances. We can't simply
2989 : : remove unsupported SLP instances as this makes the above
2990 : : SLP kind detection invalid and might also affect the VF. */
2991 : 303191 : if (! vect_slp_analyze_operations (loop_vinfo))
2992 : : {
2993 : 190100 : ok = opt_result::failure_at (vect_location,
2994 : : "unsupported SLP instances\n");
2995 : 190100 : goto again;
2996 : : }
2997 : : }
2998 : :
2999 : : /* Dissolve SLP-only groups. */
3000 : 113091 : vect_dissolve_slp_only_groups (loop_vinfo);
3001 : :
3002 : : /* Scan all the remaining operations in the loop that are not subject
3003 : : to SLP and make sure they are vectorizable. */
3004 : 113091 : ok = vect_analyze_loop_operations (loop_vinfo);
3005 : 113091 : if (!ok)
3006 : : {
3007 : 1364 : ok = opt_result::failure_at (vect_location,
3008 : : "bad operation or unsupported loop bound\n");
3009 : 1364 : goto again;
3010 : : }
3011 : :
3012 : : /* For now, we don't expect to mix the masking and length approaches in one
3013 : : loop, so disable the use of partial vectors if both are recorded. */
3014 : 111727 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3015 : 21 : && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
3016 : 111748 : && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
3017 : : {
3018 : 0 : if (dump_enabled_p ())
3019 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3020 : : "can't vectorize a loop with partial vectors"
3021 : : " because we don't expect to mix different"
3022 : : " approaches with partial vectors for the"
3023 : : " same loop.\n");
3024 : 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3025 : : }
3026 : :
3027 : : /* If we still have the option of using partial vectors,
3028 : : check whether we can generate the necessary loop controls. */
3029 : 111727 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
3030 : : {
3031 : 21 : if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
3032 : : {
3033 : 21 : if (!vect_verify_full_masking (loop_vinfo)
3034 : 21 : && !vect_verify_full_masking_avx512 (loop_vinfo))
3035 : 2 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3036 : : }
3037 : : else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
3038 : 0 : if (!vect_verify_loop_lens (loop_vinfo))
3039 : 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3040 : : }
3041 : :
3042 : : /* If we're vectorizing a loop that uses length "controls" and
3043 : : can iterate more than once, apply the decrementing IV approach
3044 : : to the loop control. */
3045 : 111727 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3046 : 19 : && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
3047 : 0 : && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
3048 : 111727 : && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3049 : 0 : && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
3050 : : LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
3051 : 0 : LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
3052 : :
3053 : : /* If a loop uses length controls and has a decrementing loop control IV,
3054 : : we will normally pass that IV through a MIN_EXPR to calculate the
3055 : : basis for the length controls. E.g. in a loop that processes one
3056 : : element per scalar iteration, the number of elements would be
3057 : : MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
3058 : :
3059 : : This MIN_EXPR approach allows us to use pointer IVs with an invariant
3060 : : step, since only the final iteration of the vector loop can have
3061 : : inactive lanes.
3062 : :
3063 : : However, some targets have a dedicated instruction for calculating the
3064 : : preferred length, given the total number of elements that still need to
3065 : : be processed. This is encapsulated in the SELECT_VL internal function.
3066 : :
3067 : : If the target supports SELECT_VL, we can use it instead of MIN_EXPR
3068 : : to determine the basis for the length controls. However, unlike the
3069 : : MIN_EXPR calculation, the SELECT_VL calculation can decide to make
3070 : : lanes inactive in any iteration of the vector loop, not just the last
3071 : : iteration. This SELECT_VL approach therefore requires us to use pointer
3072 : : IVs with variable steps.
3073 : :
3074 : : Once we've decided how many elements should be processed by one
3075 : : iteration of the vector loop, we need to populate the rgroup controls.
3076 : : If a loop has multiple rgroups, we need to make sure that those rgroups
3077 : : "line up" (that is, they must be consistent about which elements are
3078 : : active and which aren't). This is done by vect_adjust_loop_lens_control.
3079 : :
3080 : : In principle, it would be possible to use vect_adjust_loop_lens_control
3081 : : on either the result of a MIN_EXPR or the result of a SELECT_VL.
3082 : : However:
3083 : :
3084 : : (1) In practice, it only makes sense to use SELECT_VL when a vector
3085 : : operation will be controlled directly by the result. It is not
3086 : : worth using SELECT_VL if it would only be the input to other
3087 : : calculations.
3088 : :
3089 : : (2) If we use SELECT_VL for an rgroup that has N controls, each associated
3090 : : pointer IV will need N updates by a variable amount (N-1 updates
3091 : : within the iteration and 1 update to move to the next iteration).
3092 : :
3093 : : Because of this, we prefer to use the MIN_EXPR approach whenever there
3094 : : is more than one length control.
3095 : :
3096 : : In addition, SELECT_VL always operates to a granularity of 1 unit.
3097 : : If we wanted to use it to control an SLP operation on N consecutive
3098 : : elements, we would need to make the SELECT_VL inputs measure scalar
3099 : : iterations (rather than elements) and then multiply the SELECT_VL
3100 : : result by N. But using SELECT_VL this way is inefficient because
3101 : : of (1) above.
3102 : :
3103 : : Finally, we do not apply SELECT_VL to a single-rgroup loop when both
3104 : : of the following are satisfied:
3105 : :
3106 : : (1) LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
3107 : : (2) LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
3108 : :
3109 : : In that case SELECT_VL, with its variable step, would make SCEV analysis
3110 : : fail and we would lose the benefit of subsequent unroll optimizations,
3111 : : so we prefer the MIN_EXPR approach in this situation. */
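 : :
 : : /* As a rough illustration only (a simplified scalar sketch with made-up
 : : names, not the GIMPLE the vectorizer actually emits), the two styles of
 : : length control for a loop that processes one element per scalar
 : : iteration look like:
 : :
 : : MIN_EXPR style - only the final vector iteration may be partial,
 : : so pointer IVs can advance by the invariant step VF:
 : :
 : : len = MIN (remaining, VF);
 : : ... process LEN elements ...
 : : remaining = remaining - len;
 : :
 : : SELECT_VL style - the target chooses the length, so any iteration
 : : may be partial and pointer IVs must advance by the variable LEN:
 : :
 : : len = .SELECT_VL (remaining, VF);
 : : ... process LEN elements ...
 : : remaining = remaining - len; */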
3112 : 111727 : if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
3113 : : {
3114 : 0 : tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
3115 : 0 : if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
3116 : : OPTIMIZE_FOR_SPEED)
3117 : 0 : && LOOP_VINFO_LENS (loop_vinfo).length () == 1
3118 : 0 : && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1
3119 : 0 : && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3120 : : || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
3121 : 0 : LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
3122 : :
3123 : : /* If any of the SLP instances cover more than a single lane
3124 : : we cannot use .SELECT_VL at the moment, even if the number
3125 : : of lanes is uniform throughout the SLP graph. */
3126 : 0 : if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
3127 : 0 : for (slp_instance inst : LOOP_VINFO_SLP_INSTANCES (loop_vinfo))
3128 : 0 : if (SLP_TREE_LANES (SLP_INSTANCE_TREE (inst)) != 1
3129 : 0 : && !(SLP_INSTANCE_KIND (inst) == slp_inst_kind_store
3130 : 0 : && SLP_INSTANCE_TREE (inst)->ldst_lanes))
3131 : : {
3132 : 0 : LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
3133 : 0 : break;
3134 : : }
3135 : : }
3136 : :
3137 : : /* Decide whether this loop_vinfo should use partial vectors or peeling,
3138 : : assuming that the loop will be used as a main loop. We will redo
3139 : : this analysis later if we instead decide to use the loop as an
3140 : : epilogue loop. */
3141 : 111727 : ok = vect_determine_partial_vectors_and_peeling (loop_vinfo);
3142 : 111727 : if (!ok)
3143 : 0 : return ok;
3144 : :
3145 : : /* If we're vectorizing an epilogue loop, the vectorized loop either needs
3146 : : to be able to handle fewer than VF scalars, or needs to have a lower VF
3147 : : than the main loop. */
3148 : 111727 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3149 : 22417 : && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3150 : : {
3151 : 22417 : poly_uint64 unscaled_vf
3152 : 22417 : = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
3153 : : orig_loop_vinfo->suggested_unroll_factor);
3154 : 22417 : if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
3155 : 10977 : return opt_result::failure_at (vect_location,
3156 : : "Vectorization factor too high for"
3157 : : " epilogue loop.\n");
3158 : : }
3159 : :
3160 : : /* If the epilogue needs peeling for gaps but the main loop doesn't,
3161 : : give up on the epilogue. */
3162 : 100750 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3163 : 11440 : && LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3164 : 52 : && (LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo)
3165 : : != LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)))
3166 : 0 : return opt_result::failure_at (vect_location,
3167 : : "Epilogue loop requires peeling for gaps "
3168 : : "but main loop does not.\n");
3169 : :
3170 : : /* If an epilogue loop is required make sure we can create one. */
3171 : 100750 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3172 : 99572 : || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
3173 : 30412 : || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3174 : : {
3175 : 71364 : if (dump_enabled_p ())
3176 : 4817 : dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
3177 : 71364 : if (!vect_can_advance_ivs_p (loop_vinfo)
3178 : 142269 : || !slpeel_can_duplicate_loop_p (loop,
3179 : : LOOP_VINFO_IV_EXIT (loop_vinfo),
3180 : 70905 : LOOP_VINFO_IV_EXIT (loop_vinfo)))
3181 : : {
3182 : 459 : ok = opt_result::failure_at (vect_location,
3183 : : "not vectorized: can't create required "
3184 : : "epilog loop\n");
3185 : 459 : goto again;
3186 : : }
3187 : : }
3188 : :
3189 : : /* Check the costings of the loop make vectorizing worthwhile. */
3190 : 100291 : res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
3191 : 100291 : if (res < 0)
3192 : : {
3193 : 27975 : ok = opt_result::failure_at (vect_location,
3194 : : "Loop costings may not be worthwhile.\n");
3195 : 27975 : goto again;
3196 : : }
3197 : 72316 : if (!res)
3198 : 17174 : return opt_result::failure_at (vect_location,
3199 : : "Loop costings not worthwhile.\n");
3200 : :
3201 : : /* During peeling, we need to check that the number of loop iterations is
3202 : : enough for both the peeled prolog loop and the vector loop. This check
3203 : : can be merged with the threshold check of loop versioning, so
3204 : : increase the threshold for this case if necessary.
3205 : :
3206 : : If we are analyzing an epilogue we still want to check what its
3207 : : versioning threshold would be. If we decide to vectorize the epilogues we
3208 : : will want to use the lowest versioning threshold of all epilogues and main
3209 : : loop. This will enable us to enter a vectorized epilogue even when
3210 : : versioning the loop. We can't simply check whether the epilogue requires
3211 : : versioning though since we may have skipped some versioning checks when
3212 : : analyzing the epilogue. For instance, checks for alias versioning will be
3213 : : skipped when dealing with epilogues as we assume we already checked them
3214 : : for the main loop. So instead we always check the 'orig_loop_vinfo'. */
3215 : 55142 : if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
3216 : : {
3217 : 5400 : poly_uint64 niters_th = 0;
3218 : 5400 : unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3219 : :
3220 : 5400 : if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
3221 : : {
3222 : : /* Niters for peeled prolog loop. */
3223 : 5400 : if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3224 : : {
3225 : 78 : dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
3226 : 78 : tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
3227 : 78 : niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
3228 : : }
3229 : : else
3230 : 5322 : niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3231 : : }
3232 : :
3233 : : /* Niters for at least one iteration of vectorized loop. */
3234 : 5400 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3235 : 5400 : niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3236 : : /* One additional iteration because of peeling for gap. */
3237 : 5400 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3238 : 70 : niters_th += 1;
3239 : :
3240 : : /* Use the same condition as vect_transform_loop to decide when to use
3241 : : the cost to determine a versioning threshold. */
3242 : 5400 : if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3243 : 5400 : && ordered_p (th, niters_th))
3244 : 3655 : niters_th = ordered_max (poly_uint64 (th), niters_th);
3245 : :
3246 : 5400 : LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
3247 : : }
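 : :
 : : /* As a purely illustrative example of the computation above: with a
 : : peeling-for-alignment count of 3, VF = 8, peeling for gaps required and
 : : a cost-model threshold of 10, niters_th becomes
 : : MAX (10, 3 + 8 + 1) = 12 scalar iterations. */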
3248 : :
3249 : 55142 : gcc_assert (known_eq (vectorization_factor,
3250 : : LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
3251 : :
3252 : 55142 : slp_done_for_suggested_uf = slp;
3253 : :
3254 : : /* Ok to vectorize! */
3255 : 55142 : LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
3256 : 55142 : return opt_result::success ();
3257 : :
3258 : 219898 : again:
3259 : : /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
3260 : 219898 : gcc_assert (!ok);
3261 : :
3262 : : /* Try again with SLP degraded, but if we didn't do any SLP there is
3263 : : no point in re-trying. */
3264 : 219898 : if (!slp)
3265 : 0 : return ok;
3266 : :
3267 : : /* If we are applying suggested unroll factor, we don't need to
3268 : : re-try any more as we want to keep the SLP mode fixed. */
3269 : 219898 : if (applying_suggested_uf)
3270 : 0 : return ok;
3271 : :
3272 : : /* If there are reduction chains re-trying will fail anyway. */
3273 : 219898 : if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
3274 : 197 : return ok;
3275 : :
3276 : : /* Likewise if the grouped loads or stores in the SLP cannot be handled
3277 : : via interleaving or lane instructions. */
3278 : : slp_instance instance;
3279 : : slp_tree node;
3280 : : unsigned i, j;
3281 : 761797 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
3282 : : {
3283 : 543338 : if (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance)) != vect_internal_def)
3284 : 0 : continue;
3285 : :
3286 : 543338 : stmt_vec_info vinfo;
3287 : 543338 : vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
3288 : 543338 : if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
3289 : 539065 : continue;
3290 : 4273 : vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3291 : 4273 : unsigned int size = DR_GROUP_SIZE (vinfo);
3292 : 4273 : tree vectype = STMT_VINFO_VECTYPE (vinfo);
3293 : 4273 : if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST
3294 : 7593 : && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
3295 : 8308 : && ! vect_grouped_store_supported (vectype, size))
3296 : 715 : return opt_result::failure_at (vinfo->stmt,
3297 : : "unsupported grouped store\n");
3298 : 548142 : FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
3299 : : {
3300 : 3754 : vinfo = SLP_TREE_REPRESENTATIVE (node);
3301 : 3754 : if (STMT_VINFO_GROUPED_ACCESS (vinfo))
3302 : : {
3303 : 3151 : vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3304 : 3151 : bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
3305 : 3151 : size = DR_GROUP_SIZE (vinfo);
3306 : 3151 : vectype = STMT_VINFO_VECTYPE (vinfo);
3307 : 3151 : if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
3308 : 3151 : && ! vect_grouped_load_supported (vectype, single_element_p,
3309 : : size))
3310 : 527 : return opt_result::failure_at (vinfo->stmt,
3311 : : "unsupported grouped load\n");
3312 : : }
3313 : : }
3314 : : }
3315 : :
3316 : : /* Roll back state appropriately and degrade SLP one step: from
3317 : : multi-lane to single-lane SLP, and from single-lane to disabled. */
3318 : 218459 : --slp;
3319 : 218459 : if (dump_enabled_p ())
3320 : : {
3321 : 7039 : if (slp)
3322 : 3657 : dump_printf_loc (MSG_NOTE, vect_location,
3323 : : "re-trying with single-lane SLP\n");
3324 : : else
3325 : 3382 : dump_printf_loc (MSG_NOTE, vect_location,
3326 : : "re-trying with SLP disabled\n");
3327 : : }
3328 : :
3329 : : /* Restore vectorization factor as it were without SLP. */
3330 : 218459 : LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
3331 : : /* Free the SLP instances. */
3332 : 760532 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
3333 : 542073 : vect_free_slp_instance (instance);
3334 : 218459 : LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
3335 : : /* Reset SLP type to loop_vect on all stmts. */
3336 : 808397 : for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
3337 : : {
3338 : 589938 : basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3339 : 589938 : for (gimple_stmt_iterator si = gsi_start_phis (bb);
3340 : 1104855 : !gsi_end_p (si); gsi_next (&si))
3341 : : {
3342 : 514917 : stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3343 : 514917 : STMT_SLP_TYPE (stmt_info) = loop_vect;
3344 : 514917 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
3345 : 514917 : || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3346 : : {
3347 : : /* vectorizable_reduction adjusts reduction stmt def-types,
3348 : : restore them to that of the PHI. */
3349 : 34831 : STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
3350 : 34831 : = STMT_VINFO_DEF_TYPE (stmt_info);
3351 : 34831 : STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
3352 : : (STMT_VINFO_REDUC_DEF (stmt_info)))
3353 : 34831 : = STMT_VINFO_DEF_TYPE (stmt_info);
3354 : : }
3355 : : }
3356 : 1179876 : for (gimple_stmt_iterator si = gsi_start_bb (bb);
3357 : 3453513 : !gsi_end_p (si); gsi_next (&si))
3358 : : {
3359 : 2863575 : if (is_gimple_debug (gsi_stmt (si)))
3360 : 922834 : continue;
3361 : 1940741 : stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3362 : 1940741 : STMT_SLP_TYPE (stmt_info) = loop_vect;
3363 : 1940741 : if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3364 : : {
3365 : 330765 : stmt_vec_info pattern_stmt_info
3366 : : = STMT_VINFO_RELATED_STMT (stmt_info);
3367 : 330765 : if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
3368 : 0 : STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
3369 : :
3370 : 330765 : gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
3371 : 330765 : STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
3372 : 330765 : for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
3373 : 675196 : !gsi_end_p (pi); gsi_next (&pi))
3374 : 344431 : STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
3375 : 344431 : = loop_vect;
3376 : : }
3377 : : }
3378 : : }
3379 : : /* Free optimized alias test DDRS. */
3380 : 218459 : LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
3381 : 218459 : LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
3382 : 218459 : LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
3383 : : /* Reset target cost data. */
3384 : 218459 : delete loop_vinfo->vector_costs;
3385 : 218459 : loop_vinfo->vector_costs = nullptr;
3386 : : /* Reset accumulated rgroup information. */
3387 : 218459 : LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
3388 : 218459 : release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
3389 : 218459 : release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
3390 : : /* Reset assorted flags. */
3391 : 218459 : LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
3392 : 218459 : LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
3393 : 218459 : LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
3394 : 218459 : LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
3395 : 218459 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3396 : 218459 : = saved_can_use_partial_vectors_p;
3397 : 218459 : LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3398 : 218459 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
3399 : 218459 : if (loop_vinfo->scan_map)
3400 : 244 : loop_vinfo->scan_map->empty ();
3401 : :
3402 : 218459 : goto start_over;
3403 : : }
3404 : :
3405 : : /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
3406 : : to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
3407 : : OLD_LOOP_VINFO is better unless something specifically indicates
3408 : : otherwise.
3409 : :
3410 : : Note that this deliberately isn't a partial order. */
3411 : :
3412 : : static bool
3413 : 0 : vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
3414 : : loop_vec_info old_loop_vinfo)
3415 : : {
3416 : 0 : struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
3417 : 0 : gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
3418 : :
3419 : 0 : poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
3420 : 0 : poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
3421 : :
3422 : : /* Always prefer a VF of loop->simdlen over any other VF. */
3423 : 0 : if (loop->simdlen)
3424 : : {
3425 : 0 : bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
3426 : 0 : bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
3427 : 0 : if (new_simdlen_p != old_simdlen_p)
3428 : : return new_simdlen_p;
3429 : : }
3430 : :
3431 : 0 : const auto *old_costs = old_loop_vinfo->vector_costs;
3432 : 0 : const auto *new_costs = new_loop_vinfo->vector_costs;
3433 : 0 : if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
3434 : 0 : return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
3435 : :
3436 : 0 : return new_costs->better_main_loop_than_p (old_costs);
3437 : : }
3438 : :
3439 : : /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
3440 : : true if we should. */
3441 : :
3442 : : static bool
3443 : 0 : vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
3444 : : loop_vec_info old_loop_vinfo)
3445 : : {
3446 : 0 : if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
3447 : : return false;
3448 : :
3449 : 0 : if (dump_enabled_p ())
3450 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
3451 : : "***** Preferring vector mode %s to vector mode %s\n",
3452 : 0 : GET_MODE_NAME (new_loop_vinfo->vector_mode),
3453 : 0 : GET_MODE_NAME (old_loop_vinfo->vector_mode));
3454 : : return true;
3455 : : }
3456 : :
3457 : : /* Analyze LOOP with VECTOR_MODES[MODE_I], treating it as an epilogue loop
3458 : : if ORIG_LOOP_VINFO is not NULL. Set AUTODETECTED_VECTOR_MODE if the mode
3459 : : analyzed was VOIDmode, and advance MODE_I to the next mode worth analyzing.
3460 : : Return the loop_vinfo on success and a wrapped null on failure. */
3461 : :
3462 : : static opt_loop_vec_info
3463 : 389463 : vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
3464 : : const vect_loop_form_info *loop_form_info,
3465 : : loop_vec_info orig_loop_vinfo,
3466 : : const vector_modes &vector_modes, unsigned &mode_i,
3467 : : machine_mode &autodetected_vector_mode,
3468 : : bool &fatal)
3469 : : {
3470 : 389463 : loop_vec_info loop_vinfo
3471 : 389463 : = vect_create_loop_vinfo (loop, shared, loop_form_info, orig_loop_vinfo);
3472 : :
3473 : 389463 : machine_mode vector_mode = vector_modes[mode_i];
3474 : 389463 : loop_vinfo->vector_mode = vector_mode;
3475 : 389463 : unsigned int suggested_unroll_factor = 1;
3476 : 389463 : unsigned slp_done_for_suggested_uf = 0;
3477 : :
3478 : : /* Run the main analysis. */
3479 : 389463 : opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
3480 : : &suggested_unroll_factor,
3481 : : slp_done_for_suggested_uf);
3482 : 389463 : if (dump_enabled_p ())
3483 : 20508 : dump_printf_loc (MSG_NOTE, vect_location,
3484 : : "***** Analysis %s with vector mode %s\n",
3485 : 20508 : res ? "succeeded" : "failed",
3486 : 20508 : GET_MODE_NAME (loop_vinfo->vector_mode));
3487 : :
3488 : 389463 : if (res && !LOOP_VINFO_EPILOGUE_P (loop_vinfo) && suggested_unroll_factor > 1)
3489 : : {
3490 : 0 : if (dump_enabled_p ())
3491 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
3492 : : "***** Re-trying analysis for unrolling"
3493 : : " with unroll factor %d and slp %s.\n",
3494 : : suggested_unroll_factor,
3495 : 0 : slp_done_for_suggested_uf ? "on" : "off");
3496 : 0 : loop_vec_info unroll_vinfo
3497 : 0 : = vect_create_loop_vinfo (loop, shared, loop_form_info, NULL);
3498 : 0 : unroll_vinfo->vector_mode = vector_mode;
3499 : 0 : unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
3500 : 0 : opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
3501 : : slp_done_for_suggested_uf);
3502 : 0 : if (new_res)
3503 : : {
3504 : 0 : delete loop_vinfo;
3505 : 0 : loop_vinfo = unroll_vinfo;
3506 : : }
3507 : : else
3508 : 0 : delete unroll_vinfo;
3509 : : }
3510 : :
3511 : : /* Remember the autodetected vector mode. */
3512 : 389463 : if (vector_mode == VOIDmode)
3513 : 194466 : autodetected_vector_mode = loop_vinfo->vector_mode;
3514 : :
3515 : : /* Advance mode_i, first skipping modes that would yield the
3516 : : same analysis result. */
3517 : 1745855 : while (mode_i + 1 < vector_modes.length ()
3518 : 1221487 : && vect_chooses_same_modes_p (loop_vinfo,
3519 : 543291 : vector_modes[mode_i + 1]))
3520 : : {
3521 : 288733 : if (dump_enabled_p ())
3522 : 15278 : dump_printf_loc (MSG_NOTE, vect_location,
3523 : : "***** The result for vector mode %s would"
3524 : : " be the same\n",
3525 : 15278 : GET_MODE_NAME (vector_modes[mode_i + 1]));
3526 : 288733 : mode_i += 1;
3527 : : }
3528 : 389463 : if (mode_i + 1 < vector_modes.length ()
3529 : 254558 : && VECTOR_MODE_P (autodetected_vector_mode)
3530 : 509116 : && (related_vector_mode (vector_modes[mode_i + 1],
3531 : : GET_MODE_INNER (autodetected_vector_mode))
3532 : 254558 : == autodetected_vector_mode)
3533 : 644021 : && (related_vector_mode (autodetected_vector_mode,
3534 : 340 : GET_MODE_INNER (vector_modes[mode_i + 1]))
3535 : 680 : == vector_modes[mode_i + 1]))
3536 : : {
3537 : 340 : if (dump_enabled_p ())
3538 : 4 : dump_printf_loc (MSG_NOTE, vect_location,
3539 : : "***** Skipping vector mode %s, which would"
3540 : : " repeat the analysis for %s\n",
3541 : 4 : GET_MODE_NAME (vector_modes[mode_i + 1]),
3542 : 4 : GET_MODE_NAME (autodetected_vector_mode));
3543 : 340 : mode_i += 1;
3544 : : }
3545 : 389463 : mode_i++;
3546 : :
3547 : 389463 : if (!res)
3548 : : {
3549 : 334321 : delete loop_vinfo;
3550 : 334321 : if (fatal)
3551 : 58017 : gcc_checking_assert (orig_loop_vinfo == NULL);
3552 : 334321 : return opt_loop_vec_info::propagate_failure (res);
3553 : : }
3554 : :
3555 : 55142 : return opt_loop_vec_info::success (loop_vinfo);
3556 : : }
3557 : :
3558 : : /* Function vect_analyze_loop.
3559 : :
3560 : : Apply a set of analyses on LOOP, and create a loop_vec_info struct
3561 : : for it. The different analyses will record information in the
3562 : : loop_vec_info struct. */
3563 : : opt_loop_vec_info
3564 : 430889 : vect_analyze_loop (class loop *loop, gimple *loop_vectorized_call,
3565 : : vec_info_shared *shared)
3566 : : {
3567 : 430889 : DUMP_VECT_SCOPE ("analyze_loop_nest");
3568 : :
3569 : 430889 : if (loop_outer (loop)
3570 : 430889 : && loop_vec_info_for_loop (loop_outer (loop))
3571 : 431295 : && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
3572 : 406 : return opt_loop_vec_info::failure_at (vect_location,
3573 : : "outer-loop already vectorized.\n");
3574 : :
3575 : 430483 : if (!find_loop_nest (loop, &shared->loop_nest))
3576 : 19283 : return opt_loop_vec_info::failure_at
3577 : 19283 : (vect_location,
3578 : : "not vectorized: loop nest containing two or more consecutive inner"
3579 : : " loops cannot be vectorized\n");
3580 : :
3581 : : /* Analyze the loop form. */
3582 : 411200 : vect_loop_form_info loop_form_info;
3583 : 411200 : opt_result res = vect_analyze_loop_form (loop, loop_vectorized_call,
3584 : : &loop_form_info);
3585 : 411200 : if (!res)
3586 : : {
3587 : 216734 : if (dump_enabled_p ())
3588 : 1587 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3589 : : "bad loop form.\n");
3590 : 216734 : return opt_loop_vec_info::propagate_failure (res);
3591 : : }
3592 : 194466 : if (!integer_onep (loop_form_info.assumptions))
3593 : : {
3594 : : /* We consider vectorizing this loop by versioning it under
3595 : : some assumptions. In order to do this, we need to clear
3596 : : existing information computed by the scev and niter analyzers. */
3597 : 8595 : scev_reset_htab ();
3598 : 8595 : free_numbers_of_iterations_estimates (loop);
3599 : : /* Also set a flag for this loop so that the following scev and niter
3600 : : analyses are done under the assumptions. */
3601 : 8595 : loop_constraint_set (loop, LOOP_C_FINITE);
3602 : : }
3603 : : else
3604 : : /* Clear the existing niter information to make sure the nonwrapping flag
3605 : : will be calculated and set properly. */
3606 : 185871 : free_numbers_of_iterations_estimates (loop);
3607 : :
3608 : 194466 : auto_vector_modes vector_modes;
3609 : : /* Autodetect first vector size we try. */
3610 : 194466 : vector_modes.safe_push (VOIDmode);
3611 : 194466 : unsigned int autovec_flags
3612 : 388932 : = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3613 : 194466 : loop->simdlen != 0);
3614 : 194466 : bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3615 : 194466 : && !unlimited_cost_model (loop));
3616 : 194466 : machine_mode autodetected_vector_mode = VOIDmode;
3617 : 194466 : opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3618 : 194466 : unsigned int mode_i = 0;
3619 : 194466 : unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3620 : :
3621 : : /* Keep track of the VF for each mode. Initialize all to 0 which indicates
3622 : : a mode has not been analyzed. */
3623 : 194466 : auto_vec<poly_uint64, 8> cached_vf_per_mode;
3624 : 1959132 : for (unsigned i = 0; i < vector_modes.length (); ++i)
3625 : 785100 : cached_vf_per_mode.safe_push (0);
3626 : :
3627 : : /* First determine the main loop vectorization mode, either the first
3628 : : one that works, starting with auto-detecting the vector mode and then
3629 : : following the targets order of preference, or the one with the
3630 : : lowest cost if pick_lowest_cost_p. */
3631 : 530924 : while (1)
3632 : : {
3633 : 362695 : bool fatal;
3634 : 362695 : unsigned int last_mode_i = mode_i;
3635 : : /* Set cached VF to -1 prior to analysis, which indicates a mode has
3636 : : failed. */
3637 : 362695 : cached_vf_per_mode[last_mode_i] = -1;
3638 : 362695 : opt_loop_vec_info loop_vinfo
3639 : 362695 : = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3640 : : NULL, vector_modes, mode_i,
3641 : : autodetected_vector_mode, fatal);
3642 : 362695 : if (fatal)
3643 : : break;
3644 : :
3645 : 304678 : if (loop_vinfo)
3646 : : {
3647 : : /* Analysis has been successful so update the VF value. The
3648 : : VF should always be a multiple of unroll_factor and we want to
3649 : : capture the original VF here. */
3650 : 48773 : cached_vf_per_mode[last_mode_i]
3651 : 48773 : = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3652 : 48773 : loop_vinfo->suggested_unroll_factor);
3653 : : /* Once we hit the desired simdlen for the first time,
3654 : : discard any previous attempts. */
3655 : 48773 : if (simdlen
3656 : 48773 : && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3657 : : {
3658 : 47 : delete first_loop_vinfo;
3659 : : first_loop_vinfo = opt_loop_vec_info::success (NULL);
3660 : : simdlen = 0;
3661 : : }
3662 : 48726 : else if (pick_lowest_cost_p
3663 : 0 : && first_loop_vinfo
3664 : 48726 : && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3665 : : {
3666 : : /* Pick loop_vinfo over first_loop_vinfo. */
3667 : 0 : delete first_loop_vinfo;
3668 : 0 : first_loop_vinfo = opt_loop_vec_info::success (NULL);
3669 : : }
3670 : 48773 : if (first_loop_vinfo == NULL)
3671 : : first_loop_vinfo = loop_vinfo;
3672 : : else
3673 : : {
3674 : 2 : delete loop_vinfo;
3675 : 2 : loop_vinfo = opt_loop_vec_info::success (NULL);
3676 : : }
3677 : :
3678 : : /* Commit to first_loop_vinfo if we have no reason to try
3679 : : alternatives. */
3680 : 48773 : if (!simdlen && !pick_lowest_cost_p)
3681 : : break;
3682 : : }
3683 : 255914 : if (mode_i == vector_modes.length ()
3684 : 255914 : || autodetected_vector_mode == VOIDmode)
3685 : : break;
3686 : :
3687 : : /* Try the next biggest vector size. */
3688 : 168229 : if (dump_enabled_p ())
3689 : 3620 : dump_printf_loc (MSG_NOTE, vect_location,
3690 : : "***** Re-trying analysis with vector mode %s\n",
3691 : 3620 : GET_MODE_NAME (vector_modes[mode_i]));
3692 : 168229 : }
3693 : 194466 : if (!first_loop_vinfo)
3694 : 145700 : return opt_loop_vec_info::propagate_failure (res);
3695 : :
3696 : 48766 : if (dump_enabled_p ())
3697 : 8661 : dump_printf_loc (MSG_NOTE, vect_location,
3698 : : "***** Choosing vector mode %s\n",
3699 : 8661 : GET_MODE_NAME (first_loop_vinfo->vector_mode));
3700 : :
3701 : : /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3702 : : enabled, SIMDUID is not set, it is the innermost loop and we have
3703 : : either already found the loop's SIMDLEN or there was no SIMDLEN to
3704 : : begin with.
3705 : : TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3706 : 48766 : bool vect_epilogues = (!simdlen
3707 : 48764 : && loop->inner == NULL
3708 : 48344 : && param_vect_epilogues_nomask
3709 : 47298 : && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3710 : : /* No code motion support for multiple epilogues so for now
3711 : : not supported when multiple exits. */
3712 : 23337 : && !LOOP_VINFO_EARLY_BREAKS (first_loop_vinfo)
3713 : 22931 : && !loop->simduid
3714 : 70287 : && loop_cost_model (loop) > VECT_COST_MODEL_VERY_CHEAP);
3715 : 48766 : if (!vect_epilogues)
3716 : 37400 : return first_loop_vinfo;
3717 : :
3718 : : /* Now analyze first_loop_vinfo for epilogue vectorization. */
3719 : :
3720 : : /* For epilogues start the analysis from the first mode. The motivation
3721 : : behind starting from the beginning comes from cases where the VECTOR_MODES
3722 : : array may contain length-agnostic and length-specific modes. Their
3723 : : ordering is not guaranteed, so we could end up picking a mode for the main
3724 : : loop that is after the epilogue's optimal mode. */
3725 : 11366 : if (!unlimited_cost_model (loop)
3726 : 11366 : && first_loop_vinfo->vector_costs->suggested_epilogue_mode () != VOIDmode)
3727 : : {
3728 : 3 : vector_modes[0]
3729 : 3 : = first_loop_vinfo->vector_costs->suggested_epilogue_mode ();
3730 : 3 : cached_vf_per_mode[0] = 0;
3731 : : }
3732 : : else
3733 : 11363 : vector_modes[0] = autodetected_vector_mode;
3734 : 11366 : mode_i = 0;
3735 : :
3736 : 11366 : bool supports_partial_vectors =
3737 : 11366 : partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3738 : 11366 : poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3739 : :
3740 : 11366 : loop_vec_info orig_loop_vinfo = first_loop_vinfo;
3741 : 39188 : do
3742 : : {
3743 : 39112 : while (1)
3744 : : {
3745 : : /* If the target does not support partial vectors we can shorten the
3746 : : number of modes to analyze for the epilogue as we know we can't
3747 : : pick a mode that would lead to a VF at least as big as the
3748 : : FIRST_VINFO_VF. */
3749 : 51434 : if (!supports_partial_vectors
3750 : 39112 : && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3751 : : {
3752 : 12344 : mode_i++;
3753 : 24688 : if (mode_i == vector_modes.length ())
3754 : : break;
3755 : 12322 : continue;
3756 : : }
3757 : :
3758 : 26768 : if (dump_enabled_p ())
3759 : 4995 : dump_printf_loc (MSG_NOTE, vect_location,
3760 : : "***** Re-trying epilogue analysis with vector "
3761 : 4995 : "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3762 : :
3763 : 26768 : bool fatal;
3764 : 26768 : opt_loop_vec_info loop_vinfo
3765 : 26768 : = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3766 : : orig_loop_vinfo,
3767 : : vector_modes, mode_i,
3768 : : autodetected_vector_mode, fatal);
3769 : 26768 : if (fatal)
3770 : : break;
3771 : :
3772 : 26768 : if (loop_vinfo)
3773 : : {
3774 : 6369 : if (pick_lowest_cost_p
3775 : 0 : && orig_loop_vinfo->epilogue_vinfo
3776 : 6369 : && vect_joust_loop_vinfos (loop_vinfo,
3777 : : orig_loop_vinfo->epilogue_vinfo))
3778 : : {
3779 : 0 : gcc_assert (vect_epilogues);
3780 : 0 : delete orig_loop_vinfo->epilogue_vinfo;
3781 : 0 : orig_loop_vinfo->epilogue_vinfo = nullptr;
3782 : : }
3783 : 6369 : if (!orig_loop_vinfo->epilogue_vinfo)
3784 : 6369 : orig_loop_vinfo->epilogue_vinfo = loop_vinfo;
3785 : : else
3786 : : {
3787 : 0 : delete loop_vinfo;
3788 : 0 : loop_vinfo = opt_loop_vec_info::success (NULL);
3789 : : }
3790 : :
3791 : : /* For now only allow one epilogue loop, but allow
3792 : : pick_lowest_cost_p to replace it, so commit to the
3793 : : first epilogue if we have no reason to try alternatives. */
3794 : 6369 : if (!pick_lowest_cost_p)
3795 : : break;
3796 : : }
3797 : :
3798 : 40798 : if (mode_i == vector_modes.length ())
3799 : : break;
3800 : : }
3801 : :
3802 : 11442 : orig_loop_vinfo = orig_loop_vinfo->epilogue_vinfo;
3803 : 11442 : if (!orig_loop_vinfo)
3804 : : break;
3805 : :
3806 : : /* Once we have selected a first vectorized epilogue, see if the target
3807 : : suggests having another one. */
3808 : 6369 : if (!unlimited_cost_model (loop)
3809 : 6369 : && (orig_loop_vinfo->vector_costs->suggested_epilogue_mode ()
3810 : : != VOIDmode))
3811 : : {
3812 : 152 : vector_modes[0]
3813 : 76 : = orig_loop_vinfo->vector_costs->suggested_epilogue_mode ();
3814 : 76 : cached_vf_per_mode[0] = 0;
3815 : 76 : mode_i = 0;
3816 : : }
3817 : : else
3818 : : break;
3819 : 76 : }
3820 : : while (1);
3821 : :
3822 : 11366 : if (first_loop_vinfo->epilogue_vinfo)
3823 : : {
3824 : 6293 : poly_uint64 lowest_th
3825 : 6293 : = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3826 : 6293 : loop_vec_info epilog_vinfo = first_loop_vinfo->epilogue_vinfo;
3827 : 6369 : do
3828 : : {
3829 : 6369 : poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (epilog_vinfo);
3830 : 6369 : gcc_assert (!LOOP_REQUIRES_VERSIONING (epilog_vinfo)
3831 : : || maybe_ne (lowest_th, 0U));
3832 : : /* Keep track of the known smallest versioning threshold. */
3833 : 6369 : if (ordered_p (lowest_th, th))
3834 : 6369 : lowest_th = ordered_min (lowest_th, th);
3835 : 6369 : epilog_vinfo = epilog_vinfo->epilogue_vinfo;
3836 : : }
3837 : 6369 : while (epilog_vinfo);
3838 : 6293 : LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3839 : 6293 : if (dump_enabled_p ())
3840 : 1249 : dump_printf_loc (MSG_NOTE, vect_location,
3841 : : "***** Choosing epilogue vector mode %s\n",
3842 : 1249 : GET_MODE_NAME
3843 : : (first_loop_vinfo->epilogue_vinfo->vector_mode));
3844 : : }
3845 : :
3846 : 11366 : return first_loop_vinfo;
3847 : 605666 : }
3848 : :
3849 : : /* Return true if there is an in-order reduction function for CODE, storing
3850 : : it in *REDUC_FN if so. */
3851 : :
3852 : : static bool
3853 : 4600 : fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3854 : : {
3855 : : /* We support MINUS_EXPR by negating the operand. This also preserves an
3856 : : initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
3857 : : (-0.0) = -0.0. */
3858 : 4600 : if (code == PLUS_EXPR || code == MINUS_EXPR)
3859 : : {
3860 : 3916 : *reduc_fn = IFN_FOLD_LEFT_PLUS;
3861 : 0 : return true;
3862 : : }
3863 : : return false;
3864 : : }
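 : :
 : : /* For example, an in-order accumulation written as x -= a[i] can be
 : : handled as IFN_FOLD_LEFT_PLUS over the negated elements, x += -a[i],
 : : which keeps the same evaluation order and rounding for floating-point
 : : types. */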
3865 : :
3866 : : /* Function reduction_fn_for_scalar_code
3867 : :
3868 : : Input:
3869 : : CODE - tree_code of a reduction operations.
3870 : :
3871 : : Output:
3872 : : REDUC_FN - the corresponding internal function to be used to reduce the
3873 : : vector of partial results into a single scalar result, or IFN_LAST
3874 : : if the operation is a supported reduction operation, but does not have
3875 : : such an internal function.
3876 : :
3877 : : Return FALSE if CODE currently cannot be vectorized as reduction. */
3878 : :
3879 : : bool
3880 : 1855697 : reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3881 : : {
3882 : 1855697 : if (code.is_tree_code ())
3883 : 1855643 : switch (tree_code (code))
3884 : : {
3885 : 11504 : case MAX_EXPR:
3886 : 11504 : *reduc_fn = IFN_REDUC_MAX;
3887 : 11504 : return true;
3888 : :
3889 : 44160 : case MIN_EXPR:
3890 : 44160 : *reduc_fn = IFN_REDUC_MIN;
3891 : 44160 : return true;
3892 : :
3893 : 973671 : case PLUS_EXPR:
3894 : 973671 : *reduc_fn = IFN_REDUC_PLUS;
3895 : 973671 : return true;
3896 : :
3897 : 239392 : case BIT_AND_EXPR:
3898 : 239392 : *reduc_fn = IFN_REDUC_AND;
3899 : 239392 : return true;
3900 : :
3901 : 279738 : case BIT_IOR_EXPR:
3902 : 279738 : *reduc_fn = IFN_REDUC_IOR;
3903 : 279738 : return true;
3904 : :
3905 : 34851 : case BIT_XOR_EXPR:
3906 : 34851 : *reduc_fn = IFN_REDUC_XOR;
3907 : 34851 : return true;
3908 : :
3909 : 272327 : case MULT_EXPR:
3910 : 272327 : case MINUS_EXPR:
3911 : 272327 : *reduc_fn = IFN_LAST;
3912 : 272327 : return true;
3913 : :
3914 : : default:
3915 : : return false;
3916 : : }
3917 : : else
3918 : 54 : switch (combined_fn (code))
3919 : : {
3920 : 30 : CASE_CFN_FMAX:
3921 : 30 : *reduc_fn = IFN_REDUC_FMAX;
3922 : 30 : return true;
3923 : :
3924 : 24 : CASE_CFN_FMIN:
3925 : 24 : *reduc_fn = IFN_REDUC_FMIN;
3926 : 24 : return true;
3927 : :
3928 : : default:
3929 : : return false;
3930 : : }
3931 : : }
3932 : :
3933 : : /* If there is a neutral value X such that a reduction would not be affected
3934 : : by the introduction of additional X elements, return that X, otherwise
3935 : : return null. CODE is the code of the reduction and SCALAR_TYPE is type
3936 : : of the scalar elements. If the reduction has just a single initial value
3937 : : then INITIAL_VALUE is that value, otherwise it is null.
3938 : : If AS_INITIAL is TRUE the value is supposed to be used as initial value.
3939 : : In that case no signed zero is returned. */
3940 : :
3941 : : tree
3942 : 77488 : neutral_op_for_reduction (tree scalar_type, code_helper code,
3943 : : tree initial_value, bool as_initial)
3944 : : {
3945 : 77488 : if (code.is_tree_code ())
3946 : 77434 : switch (tree_code (code))
3947 : : {
3948 : 9627 : case DOT_PROD_EXPR:
3949 : 9627 : case SAD_EXPR:
3950 : 9627 : case MINUS_EXPR:
3951 : 9627 : case BIT_IOR_EXPR:
3952 : 9627 : case BIT_XOR_EXPR:
3953 : 9627 : return build_zero_cst (scalar_type);
3954 : 62440 : case WIDEN_SUM_EXPR:
3955 : 62440 : case PLUS_EXPR:
3956 : 62440 : if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
3957 : 19 : return build_real (scalar_type, dconstm0);
3958 : : else
3959 : 62421 : return build_zero_cst (scalar_type);
3960 : :
3961 : 1538 : case MULT_EXPR:
3962 : 1538 : return build_one_cst (scalar_type);
3963 : :
3964 : 1341 : case BIT_AND_EXPR:
3965 : 1341 : return build_all_ones_cst (scalar_type);
3966 : :
3967 : : case MAX_EXPR:
3968 : : case MIN_EXPR:
3969 : : return initial_value;
3970 : :
3971 : 387 : default:
3972 : 387 : return NULL_TREE;
3973 : : }
3974 : : else
3975 : 54 : switch (combined_fn (code))
3976 : : {
3977 : : CASE_CFN_FMIN:
3978 : : CASE_CFN_FMAX:
3979 : : return initial_value;
3980 : :
3981 : 0 : default:
3982 : 0 : return NULL_TREE;
3983 : : }
3984 : : }
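 : :
 : : /* For example, a PLUS_EXPR reduction can be padded with extra zeros, a
 : : MULT_EXPR reduction with extra ones and a BIT_AND_EXPR reduction with
 : : all-ones elements without changing the result, whereas MIN_EXPR and
 : : MAX_EXPR reductions can only be padded with the single initial value. */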
3985 : :
3986 : : /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3987 : : STMT is printed with a message MSG. */
3988 : :
3989 : : static void
3990 : 467 : report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3991 : : {
3992 : 467 : dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3993 : 467 : }
3994 : :
3995 : : /* Return true if we need an in-order (fold-left) reduction for operation
3996 : : CODE on type TYPE, i.e. if reassociating the reduction could change
3997 : : its result or introduce traps. */
3998 : :
3999 : : bool
4000 : 5872847 : needs_fold_left_reduction_p (tree type, code_helper code)
4001 : : {
4002 : : /* CHECKME: check for !flag_finite_math_only too? */
4003 : 5872847 : if (SCALAR_FLOAT_TYPE_P (type))
4004 : : {
4005 : 530097 : if (code.is_tree_code ())
4006 : 530048 : switch (tree_code (code))
4007 : : {
4008 : : case MIN_EXPR:
4009 : : case MAX_EXPR:
4010 : : return false;
4011 : :
4012 : 528597 : default:
4013 : 528597 : return !flag_associative_math;
4014 : : }
4015 : : else
4016 : 49 : switch (combined_fn (code))
4017 : : {
4018 : : CASE_CFN_FMIN:
4019 : : CASE_CFN_FMAX:
4020 : : return false;
4021 : :
4022 : 1 : default:
4023 : 1 : return !flag_associative_math;
4024 : : }
4025 : : }
4026 : :
4027 : 5342750 : if (INTEGRAL_TYPE_P (type))
4028 : 5341955 : return (!code.is_tree_code ()
4029 : 5341955 : || !operation_no_trapping_overflow (type, tree_code (code)));
4030 : :
4031 : 795 : if (SAT_FIXED_POINT_TYPE_P (type))
4032 : : return true;
4033 : :
4034 : : return false;
4035 : : }
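 : :
 : : /* For example, a floating-point accumulation such as
 : :
 : : float s = 0.0f;
 : : for (int i = 0; i < n; ++i)
 : : s += a[i];
 : :
 : : needs an in-order reduction unless -fassociative-math is in effect,
 : : because splitting it into per-lane partial sums reassociates the
 : : additions and can change the rounded result. */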
4036 : :
4037 : : /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
4038 : : has a handled computation expression. Store the main reduction
4039 : : operation in *CODE. */
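 : :
 : : /* For a simple reduction like (a GIMPLE sketch with illustrative names)
 : :
 : : sum_1 = PHI <sum_0 (preheader), sum_2 (latch)>
 : : ...
 : : sum_2 = sum_1 + a[i];
 : :
 : : the walk from the latch definition sum_2 back to the PHI result sum_1
 : : consists of the single PLUS_EXPR statement, so *CODE becomes PLUS_EXPR. */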
4040 : :
4041 : : static bool
4042 : 67944 : check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
4043 : : tree loop_arg, code_helper *code,
4044 : : vec<std::pair<ssa_op_iter, use_operand_p> > &path)
4045 : : {
4046 : 67944 : auto_bitmap visited;
4047 : 67944 : tree lookfor = PHI_RESULT (phi);
4048 : 67944 : ssa_op_iter curri;
4049 : 67944 : use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
4050 : 142536 : while (USE_FROM_PTR (curr) != loop_arg)
4051 : 6648 : curr = op_iter_next_use (&curri);
4052 : 67944 : curri.i = curri.numops;
4053 : 685278 : do
4054 : : {
4055 : 685278 : path.safe_push (std::make_pair (curri, curr));
4056 : 685278 : tree use = USE_FROM_PTR (curr);
4057 : 685278 : if (use == lookfor)
4058 : : break;
4059 : 617497 : gimple *def = SSA_NAME_DEF_STMT (use);
4060 : 617497 : if (gimple_nop_p (def)
4061 : 617497 : || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
4062 : : {
4063 : 528292 : pop:
4064 : 528292 : do
4065 : : {
4066 : 528292 : std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
4067 : 528292 : curri = x.first;
4068 : 528292 : curr = x.second;
4069 : 576207 : do
4070 : 576207 : curr = op_iter_next_use (&curri);
4071 : : /* Skip already visited or non-SSA operands (from iterating
4072 : : over PHI args). */
4073 : : while (curr != NULL_USE_OPERAND_P
4074 : 1152414 : && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
4075 : 198034 : || ! bitmap_set_bit (visited,
4076 : 198034 : SSA_NAME_VERSION
4077 : : (USE_FROM_PTR (curr)))));
4078 : : }
4079 : 1056584 : while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
4080 : 176153 : if (curr == NULL_USE_OPERAND_P)
4081 : : break;
4082 : : }
4083 : : else
4084 : : {
4085 : 514302 : if (gimple_code (def) == GIMPLE_PHI)
4086 : 50170 : curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
4087 : : else
4088 : 464132 : curr = op_iter_init_use (&curri, def, SSA_OP_USE);
4089 : : while (curr != NULL_USE_OPERAND_P
4090 : 613039 : && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
4091 : 537380 : || ! bitmap_set_bit (visited,
4092 : 537380 : SSA_NAME_VERSION
4093 : : (USE_FROM_PTR (curr)))))
4094 : 98737 : curr = op_iter_next_use (&curri);
4095 : 514302 : if (curr == NULL_USE_OPERAND_P)
4096 : 72958 : goto pop;
4097 : : }
4098 : : }
4099 : : while (1);
4100 : 67944 : if (dump_file && (dump_flags & TDF_DETAILS))
4101 : : {
4102 : 3986 : dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
4103 : 3986 : unsigned i;
4104 : 3986 : std::pair<ssa_op_iter, use_operand_p> *x;
4105 : 13602 : FOR_EACH_VEC_ELT (path, i, x)
4106 : 9616 : dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
4107 : 3986 : dump_printf (MSG_NOTE, "\n");
4108 : : }
4109 : :
4110 : : /* Check whether the reduction path detected is valid. */
4111 : 67944 : bool fail = path.length () == 0;
4112 : 67944 : bool neg = false;
4113 : 67944 : int sign = -1;
4114 : 67944 : *code = ERROR_MARK;
4115 : 148363 : for (unsigned i = 1; i < path.length (); ++i)
4116 : : {
4117 : 83332 : gimple *use_stmt = USE_STMT (path[i].second);
4118 : 83332 : gimple_match_op op;
4119 : 83332 : if (!gimple_extract_op (use_stmt, &op))
4120 : : {
4121 : : fail = true;
4122 : 2913 : break;
4123 : : }
4124 : 82795 : unsigned int opi = op.num_ops;
4125 : 82795 : if (gassign *assign = dyn_cast<gassign *> (use_stmt))
4126 : : {
4127 : : /* The following makes sure we can compute the operand index
4128 : : easily; it also mostly disallows chaining via COND_EXPR condition
4129 : : operands. */
4130 : 130766 : for (opi = 0; opi < op.num_ops; ++opi)
4131 : 129831 : if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
4132 : : break;
4133 : : }
4134 : 3080 : else if (gcall *call = dyn_cast<gcall *> (use_stmt))
4135 : : {
4136 : 6155 : for (opi = 0; opi < op.num_ops; ++opi)
4137 : 6155 : if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
4138 : : break;
4139 : : }
4140 : 82795 : if (opi == op.num_ops)
4141 : : {
4142 : : fail = true;
4143 : : break;
4144 : : }
4145 : 81860 : op.code = canonicalize_code (op.code, op.type);
4146 : 81860 : if (op.code == MINUS_EXPR)
4147 : : {
4148 : 5267 : op.code = PLUS_EXPR;
4149 : : /* Track whether we negate the reduction value each iteration. */
4150 : 5267 : if (op.ops[1] == op.ops[opi])
4151 : 36 : neg = ! neg;
4152 : : }
4153 : 76593 : else if (op.code == IFN_COND_SUB)
4154 : : {
4155 : 3 : op.code = IFN_COND_ADD;
4156 : : /* Track whether we negate the reduction value each iteration. */
4157 : 3 : if (op.ops[2] == op.ops[opi])
4158 : 0 : neg = ! neg;
4159 : : }
4160 : 81860 : if (CONVERT_EXPR_CODE_P (op.code)
4161 : 81860 : && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
4162 : : ;
4163 : 77321 : else if (*code == ERROR_MARK)
4164 : : {
4165 : 66420 : *code = op.code;
4166 : 66420 : sign = TYPE_SIGN (op.type);
4167 : : }
4168 : 10901 : else if (op.code != *code)
4169 : : {
4170 : : fail = true;
4171 : : break;
4172 : : }
4173 : 9656 : else if ((op.code == MIN_EXPR
4174 : 9512 : || op.code == MAX_EXPR)
4175 : 9663 : && sign != TYPE_SIGN (op.type))
4176 : : {
4177 : : fail = true;
4178 : : break;
4179 : : }
4180 : : /* Check that the op is only used on a single stmt. For the
4181 : : non-value-changing tail and the last stmt allow out-of-loop uses.
4182 : : ??? We could relax this and handle arbitrary live stmts by
4183 : : forcing a scalar epilogue for example. */
4184 : 80612 : imm_use_iterator imm_iter;
4185 : 80612 : use_operand_p use_p;
4186 : 80612 : gimple *op_use_stmt;
4187 : 80612 : unsigned cnt = 0;
4188 : 83690 : bool cond_fn_p = op.code.is_internal_fn ()
4189 : 3078 : && (conditional_internal_fn_code (internal_fn (op.code))
4190 : 80612 : != ERROR_MARK);
4191 : :
4192 : 189443 : FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
4193 : : {
4194 : : /* In case of a COND_OP (mask, op1, op2, op1) reduction we should
4195 : : have op1 twice (once as definition, once as else) in the same
4196 : : operation. Enforce this. */
4197 : 108831 : if (cond_fn_p && op_use_stmt == use_stmt)
4198 : : {
4199 : 3023 : gcall *call = as_a<gcall *> (use_stmt);
4200 : 3023 : unsigned else_pos
4201 : 3023 : = internal_fn_else_index (internal_fn (op.code));
4202 : 3023 : if (gimple_call_arg (call, else_pos) != op.ops[opi])
4203 : : {
4204 : : fail = true;
4205 : : break;
4206 : : }
4207 : 15115 : for (unsigned int j = 0; j < gimple_call_num_args (call); ++j)
4208 : : {
4209 : 12092 : if (j == else_pos)
4210 : 3023 : continue;
4211 : 9069 : if (gimple_call_arg (call, j) == op.ops[opi])
4212 : 3023 : cnt++;
4213 : : }
4214 : : }
4215 : 105808 : else if (!is_gimple_debug (op_use_stmt)
4216 : 105808 : && (*code != ERROR_MARK
4217 : 2403 : || flow_bb_inside_loop_p (loop,
4218 : 2403 : gimple_bb (op_use_stmt))))
4219 : 155709 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4220 : 77859 : cnt++;
4221 : 80612 : }
4222 : :
4223 : 80612 : if (cnt != 1)
4224 : : {
4225 : : fail = true;
4226 : : break;
4227 : : }
4228 : : }
4229 : 71029 : return ! fail && ! neg && *code != ERROR_MARK;
4230 : 67944 : }
4231 : :
4232 : : bool
4233 : 24 : check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
4234 : : tree loop_arg, enum tree_code code)
4235 : : {
4236 : 24 : auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4237 : 24 : code_helper code_;
4238 : 24 : return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
4239 : 24 : && code_ == code);
4240 : 24 : }
4241 : :
4242 : :
4243 : :
4244 : : /* Function vect_is_simple_reduction
4245 : :
4246 : : (1) Detect a cross-iteration def-use cycle that represents a simple
4247 : : reduction computation. We look for the following pattern:
4248 : :
4249 : : loop_header:
4250 : : a1 = phi < a0, a2 >
4251 : : a3 = ...
4252 : : a2 = operation (a3, a1)
4253 : :
4254 : : or
4255 : :
4256 : : a3 = ...
4257 : : loop_header:
4258 : : a1 = phi < a0, a2 >
4259 : : a2 = operation (a3, a1)
4260 : :
4261 : : such that:
4262 : : 1. operation is commutative and associative and it is safe to
4263 : : change the order of the computation
4264 : : 2. no uses for a2 in the loop (a2 is used out of the loop)
4265 : : 3. no uses of a1 in the loop besides the reduction operation
4266 : : 4. no uses of a1 outside the loop.
4267 : :
4268 : : Conditions 1,4 are tested here.
4269 : : Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
4270 : :
4271 : : (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
4272 : : nested cycles.
4273 : :
4274 : : (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
4275 : : reductions:
4276 : :
4277 : : a1 = phi < a0, a2 >
4278 : : inner loop (def of a3)
4279 : : a2 = phi < a3 >
4280 : :
4281 : : (4) Detect condition expressions, i.e.:
4282 : : for (int i = 0; i < N; i++)
4283 : : if (a[i] < val)
4284 : : ret_val = a[i];
4285 : :
4286 : : */
4287 : :
4288 : : static stmt_vec_info
4289 : 88472 : vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
4290 : : bool *double_reduc, bool *reduc_chain_p, bool slp)
4291 : : {
4292 : 88472 : gphi *phi = as_a <gphi *> (phi_info->stmt);
4293 : 88472 : gimple *phi_use_stmt = NULL;
4294 : 88472 : imm_use_iterator imm_iter;
4295 : 88472 : use_operand_p use_p;
4296 : :
4297 : 88472 : *double_reduc = false;
4298 : 88472 : *reduc_chain_p = false;
4299 : 88472 : STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
4300 : :
4301 : 88472 : tree phi_name = PHI_RESULT (phi);
4302 : : /* ??? If there are no uses of the PHI result the inner loop reduction
4303 : : won't be detected as possibly double-reduction by vectorizable_reduction
4304 : : because that tries to walk the PHI arg from the preheader edge which
4305 : : can be constant. See PR60382. */
4306 : 88472 : if (has_zero_uses (phi_name))
4307 : : return NULL;
4308 : 88376 : class loop *loop = (gimple_bb (phi))->loop_father;
4309 : 88376 : unsigned nphi_def_loop_uses = 0;
4310 : 220038 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
4311 : : {
4312 : 135967 : gimple *use_stmt = USE_STMT (use_p);
4313 : 135967 : if (is_gimple_debug (use_stmt))
4314 : 34707 : continue;
4315 : :
4316 : 101260 : if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4317 : : {
4318 : 4305 : if (dump_enabled_p ())
4319 : 53 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4320 : : "intermediate value used outside loop.\n");
4321 : :
4322 : 4305 : return NULL;
4323 : : }
4324 : :
4325 : : /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4326 : : op1 twice (once as definition, once as else) in the same operation.
4327 : : Only count it as one. */
4328 : 96955 : if (use_stmt != phi_use_stmt)
4329 : : {
4330 : 93571 : nphi_def_loop_uses++;
4331 : 93571 : phi_use_stmt = use_stmt;
4332 : : }
4333 : : }
4334 : :
4335 : 84071 : tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
4336 : 84071 : if (TREE_CODE (latch_def) != SSA_NAME)
4337 : : {
4338 : 556 : if (dump_enabled_p ())
4339 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4340 : : "reduction: not ssa_name: %T\n", latch_def);
4341 : 556 : return NULL;
4342 : : }
4343 : :
4344 : 83515 : stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
4345 : 83515 : if (!def_stmt_info
4346 : 83515 : || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
4347 : 135 : return NULL;
4348 : :
4349 : 83380 : bool nested_in_vect_loop
4350 : 83380 : = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
4351 : 83380 : unsigned nlatch_def_loop_uses = 0;
4352 : 83380 : auto_vec<gphi *, 3> lcphis;
4353 : 83380 : bool inner_loop_of_double_reduc = false;
4354 : 318070 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
4355 : : {
4356 : 234690 : gimple *use_stmt = USE_STMT (use_p);
4357 : 234690 : if (is_gimple_debug (use_stmt))
4358 : 68146 : continue;
4359 : 166544 : if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4360 : 91787 : nlatch_def_loop_uses++;
4361 : : else
4362 : : {
4363 : : /* We can have more than one loop-closed PHI. */
4364 : 74757 : lcphis.safe_push (as_a <gphi *> (use_stmt));
4365 : 74757 : if (nested_in_vect_loop
4366 : 74757 : && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
4367 : : == vect_double_reduction_def))
4368 : : inner_loop_of_double_reduc = true;
4369 : : }
4370 : : }
4371 : :
4372 : : /* If we are vectorizing an inner reduction we are executing that
4373 : : in the original order only in case we are not dealing with a
4374 : : double reduction. */
4375 : 83380 : if (nested_in_vect_loop && !inner_loop_of_double_reduc)
4376 : : {
4377 : 2113 : if (dump_enabled_p ())
4378 : 368 : report_vect_op (MSG_NOTE, def_stmt_info->stmt,
4379 : : "detected nested cycle: ");
4380 : 2113 : return def_stmt_info;
4381 : : }
4382 : :
4383 : : /* When the inner loop of a double reduction ends up with more than
4384 : : one loop-closed PHI we have failed to classify alternate such
4385 : : PHIs as double reduction, leading to wrong code. See PR103237. */
4386 : 81977 : if (inner_loop_of_double_reduc && lcphis.length () != 1)
4387 : : {
4388 : 1 : if (dump_enabled_p ())
4389 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4390 : : "unhandled double reduction\n");
4391 : 1 : return NULL;
4392 : : }
4393 : :
4394 : : /* If this isn't a nested cycle or if the nested cycle reduction value
4395 : : is used outside of the inner loop we cannot handle uses of the reduction
4396 : : value. */
4397 : 81266 : if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
4398 : : {
4399 : 12452 : if (dump_enabled_p ())
4400 : 305 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4401 : : "reduction used in loop.\n");
4402 : 12452 : return NULL;
4403 : : }
4404 : :
4405 : : /* If DEF_STMT is a phi node itself, we expect it to have a single argument
4406 : : defined in the inner loop. */
4407 : 68814 : if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
4408 : : {
4409 : 894 : tree op1 = PHI_ARG_DEF (def_stmt, 0);
4410 : 894 : if (gimple_phi_num_args (def_stmt) != 1
4411 : 894 : || TREE_CODE (op1) != SSA_NAME)
4412 : : {
4413 : 52 : if (dump_enabled_p ())
4414 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4415 : : "unsupported phi node definition.\n");
4416 : :
4417 : 52 : return NULL;
4418 : : }
4419 : :
4420 : : /* Verify there is an inner cycle composed of the PHI phi_use_stmt
4421 : : and the latch definition op1. */
4422 : 842 : gimple *def1 = SSA_NAME_DEF_STMT (op1);
4423 : 842 : if (gimple_bb (def1)
4424 : 842 : && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4425 : 842 : && loop->inner
4426 : 818 : && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
4427 : 818 : && (is_gimple_assign (def1) || is_gimple_call (def1))
4428 : 809 : && is_a <gphi *> (phi_use_stmt)
4429 : 798 : && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
4430 : 798 : && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
4431 : : loop_latch_edge (loop->inner)))
4432 : 1638 : && lcphis.length () == 1)
4433 : : {
4434 : 719 : if (dump_enabled_p ())
4435 : 99 : report_vect_op (MSG_NOTE, def_stmt,
4436 : : "detected double reduction: ");
4437 : :
4438 : 719 : *double_reduc = true;
4439 : 719 : return def_stmt_info;
4440 : : }
4441 : :
4442 : 123 : return NULL;
4443 : : }
4444 : :
4445 : : /* Look for the expression computing latch_def from the loop PHI result. */
4446 : 67920 : auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4447 : 67920 : code_helper code;
4448 : 67920 : if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
4449 : : path))
4450 : : {
4451 : 64835 : STMT_VINFO_REDUC_CODE (phi_info) = code;
4452 : 64835 : if (code == COND_EXPR && !nested_in_vect_loop)
4453 : 3384 : STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
4454 : :
4455 : : /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
4456 : : reduction chain for which the additional restriction is that
4457 : : all operations in the chain are the same. */
4458 : 64835 : auto_vec<stmt_vec_info, 8> reduc_chain;
4459 : 64835 : unsigned i;
4460 : 64835 : bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
4461 : 208495 : for (i = path.length () - 1; i >= 1; --i)
4462 : : {
4463 : 78825 : gimple *stmt = USE_STMT (path[i].second);
4464 : 78825 : stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
4465 : 78825 : gimple_match_op op;
4466 : 78825 : if (!gimple_extract_op (stmt, &op))
4467 : 0 : gcc_unreachable ();
4468 : 78825 : if (gassign *assign = dyn_cast<gassign *> (stmt))
4469 : 75764 : STMT_VINFO_REDUC_IDX (stmt_info)
4470 : 75764 : = path[i].second->use - gimple_assign_rhs1_ptr (assign);
4471 : : else
4472 : : {
4473 : 3061 : gcall *call = as_a<gcall *> (stmt);
4474 : 3061 : STMT_VINFO_REDUC_IDX (stmt_info)
4475 : 3061 : = path[i].second->use - gimple_call_arg_ptr (call, 0);
4476 : : }
4477 : 78825 : bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
4478 : 78825 : && (i == 1 || i == path.length () - 1));
4479 : 9625 : if ((op.code != code && !leading_conversion)
4480 : : /* We can only handle the final value in epilogue
4481 : : generation for reduction chains. */
4482 : 83268 : || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
4483 : : is_slp_reduc = false;
4484 : : /* For reduction chains we support trailing/leading
4485 : : conversions. We do not store those in the actual chain. */
4486 : 78825 : if (leading_conversion)
4487 : 4443 : continue;
4488 : 74382 : reduc_chain.safe_push (stmt_info);
4489 : : }
4490 : 120397 : if (slp && is_slp_reduc && reduc_chain.length () > 1)
4491 : : {
4492 : 3581 : for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
4493 : : {
4494 : 2782 : REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
4495 : 2782 : REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
4496 : : }
4497 : 799 : REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
4498 : 799 : REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
4499 : :
4500 : : /* Save the chain for further analysis in SLP detection. */
4501 : 799 : LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
4502 : 1598 : REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
4503 : :
4504 : 799 : *reduc_chain_p = true;
4505 : 799 : if (dump_enabled_p ())
4506 : 298 : dump_printf_loc (MSG_NOTE, vect_location,
4507 : : "reduction: detected reduction chain\n");
4508 : : }
4509 : 64036 : else if (dump_enabled_p ())
4510 : 3602 : dump_printf_loc (MSG_NOTE, vect_location,
4511 : : "reduction: detected reduction\n");
4512 : :
4513 : 64835 : return def_stmt_info;
4514 : 64835 : }
4515 : :
4516 : 3085 : if (dump_enabled_p ())
4517 : 89 : dump_printf_loc (MSG_NOTE, vect_location,
4518 : : "reduction: unknown pattern\n");
4519 : :
4520 : : return NULL;
4521 : 151300 : }
4522 : :
4523 : : /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
4524 : : PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
4525 : : or -1 if not known. */
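     : : /* Illustrative example (made-up numbers, not from the sources): with a
     : :    known niters of 17, an assumed VF of 4 and a prologue peeling 3
     : :    iterations, the epilogue executes (17 - 3) % 4 = 2 iterations; when
     : :    niters or the prologue count is unknown the routine falls back to
     : :    VF / 2.  */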
4526 : :
4527 : : static int
4528 : 326727 : vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
4529 : : {
4530 : 326727 : int assumed_vf = vect_vf_for_cost (loop_vinfo);
4531 : 326727 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
4532 : : {
4533 : 130895 : if (dump_enabled_p ())
4534 : 2710 : dump_printf_loc (MSG_NOTE, vect_location,
4535 : : "cost model: epilogue peel iters set to vf/2 "
4536 : : "because loop iterations are unknown.\n");
4537 : 130895 : return assumed_vf / 2;
4538 : : }
4539 : : else
4540 : : {
4541 : 195832 : int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
4542 : 195832 : peel_iters_prologue = MIN (niters, peel_iters_prologue);
4543 : 195832 : int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
4544 : : /* If we need to peel for gaps but the computed epilogue peel count is
4545 : : zero, we have to peel VF iterations instead. */
4546 : 195832 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
4547 : 195832 : peel_iters_epilogue = assumed_vf;
4548 : 195832 : return peel_iters_epilogue;
4549 : : }
4550 : : }
4551 : :
4552 : : /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
4553 : : int
4554 : 251819 : vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
4555 : : int *peel_iters_epilogue,
4556 : : stmt_vector_for_cost *scalar_cost_vec,
4557 : : stmt_vector_for_cost *prologue_cost_vec,
4558 : : stmt_vector_for_cost *epilogue_cost_vec)
4559 : : {
4560 : 251819 : int retval = 0;
4561 : :
4562 : 251819 : *peel_iters_epilogue
4563 : 251819 : = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
4564 : :
4565 : 251819 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
4566 : : {
4567 : : /* If peeled iterations are known but the number of scalar loop
4568 : : iterations is unknown, count a taken branch per peeled loop. */
4569 : 86738 : if (peel_iters_prologue > 0)
4570 : 49225 : retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
4571 : : vect_prologue);
4572 : 86738 : if (*peel_iters_epilogue > 0)
4573 : 86658 : retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
4574 : : vect_epilogue);
4575 : : }
4576 : :
4577 : 251819 : stmt_info_for_cost *si;
4578 : 251819 : int j;
4579 : 251819 : if (peel_iters_prologue)
4580 : 591081 : FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4581 : 489429 : retval += record_stmt_cost (prologue_cost_vec,
4582 : 489429 : si->count * peel_iters_prologue,
4583 : : si->kind, si->stmt_info, si->misalign,
4584 : : vect_prologue);
4585 : 251819 : if (*peel_iters_epilogue)
4586 : 973590 : FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4587 : 805476 : retval += record_stmt_cost (epilogue_cost_vec,
4588 : 805476 : si->count * *peel_iters_epilogue,
4589 : : si->kind, si->stmt_info, si->misalign,
4590 : : vect_epilogue);
4591 : :
4592 : 251819 : return retval;
4593 : : }
4594 : :
4595 : : /* Function vect_estimate_min_profitable_iters
4596 : :
4597 : : Return the number of iterations required for the vector version of the
4598 : : loop to be profitable relative to the cost of the scalar version of the
4599 : : loop.
4600 : :
4601 : : *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4602 : : of iterations for vectorization. -1 value means loop vectorization
4603 : : is not profitable. This returned value may be used for dynamic
4604 : : profitability check.
4605 : :
4606 : : *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4607 : : for static check against estimated number of iterations. */
4608 : :
4609 : : static void
4610 : 90791 : vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
4611 : : int *ret_min_profitable_niters,
4612 : : int *ret_min_profitable_estimate,
4613 : : unsigned *suggested_unroll_factor)
4614 : : {
4615 : 90791 : int min_profitable_iters;
4616 : 90791 : int min_profitable_estimate;
4617 : 90791 : int peel_iters_prologue;
4618 : 90791 : int peel_iters_epilogue;
4619 : 90791 : unsigned vec_inside_cost = 0;
4620 : 90791 : int vec_outside_cost = 0;
4621 : 90791 : unsigned vec_prologue_cost = 0;
4622 : 90791 : unsigned vec_epilogue_cost = 0;
4623 : 90791 : int scalar_single_iter_cost = 0;
4624 : 90791 : int scalar_outside_cost = 0;
4625 : 90791 : int assumed_vf = vect_vf_for_cost (loop_vinfo);
4626 : 90791 : int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
4627 : 90791 : vector_costs *target_cost_data = loop_vinfo->vector_costs;
4628 : :
4629 : : /* Cost model disabled. */
4630 : 90791 : if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
4631 : : {
4632 : 15764 : if (dump_enabled_p ())
4633 : 9566 : dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4634 : 15764 : *ret_min_profitable_niters = 0;
4635 : 15764 : *ret_min_profitable_estimate = 0;
4636 : 15764 : return;
4637 : : }
4638 : :
4639 : : /* Requires loop versioning tests to handle misalignment. */
4640 : 75027 : if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4641 : : {
4642 : : /* FIXME: Make cost depend on complexity of individual check. */
4643 : 16 : unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4644 : 16 : (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4645 : 16 : if (dump_enabled_p ())
4646 : 0 : dump_printf (MSG_NOTE,
4647 : : "cost model: Adding cost of checks for loop "
4648 : : "versioning to treat misalignment.\n");
4649 : : }
4650 : :
4651 : : /* Requires loop versioning with alias checks. */
4652 : 75027 : if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4653 : : {
4654 : : /* FIXME: Make cost depend on complexity of individual check. */
4655 : 3874 : unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4656 : 3874 : (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4657 : 3874 : len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4658 : 0 : if (len)
4659 : : /* Count LEN - 1 ANDs and LEN comparisons. */
4660 : 0 : (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4661 : : scalar_stmt, vect_prologue);
4662 : 3874 : len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4663 : 1095 : if (len)
4664 : : {
4665 : : /* Count LEN - 1 ANDs and LEN comparisons. */
4666 : 1095 : unsigned int nstmts = len * 2 - 1;
4667 : : /* +1 for each bias that needs adding. */
4668 : 2190 : for (unsigned int i = 0; i < len; ++i)
4669 : 1095 : if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4670 : 107 : nstmts += 1;
4671 : 1095 : (void) add_stmt_cost (target_cost_data, nstmts,
4672 : : scalar_stmt, vect_prologue);
4673 : : }
4674 : 3874 : if (dump_enabled_p ())
4675 : 14 : dump_printf (MSG_NOTE,
4676 : : "cost model: Adding cost of checks for loop "
4677 : : "versioning aliasing.\n");
4678 : : }
4679 : :
4680 : : /* Requires loop versioning with niter checks. */
4681 : 75027 : if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4682 : : {
4683 : : /* FIXME: Make cost depend on complexity of individual check. */
4684 : 667 : (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4685 : : NULL, NULL, NULL_TREE, 0, vect_prologue);
4686 : 667 : if (dump_enabled_p ())
4687 : 1 : dump_printf (MSG_NOTE,
4688 : : "cost model: Adding cost of checks for loop "
4689 : : "versioning niters.\n");
4690 : : }
4691 : :
4692 : 75027 : if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4693 : 4541 : (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4694 : : vect_prologue);
4695 : :
4696 : : /* Count statements in scalar loop. Using this as scalar cost for a single
4697 : : iteration for now.
4698 : :
4699 : : TODO: Add outer loop support.
4700 : :
4701 : : TODO: Consider assigning different costs to different scalar
4702 : : statements. */
4703 : :
4704 : 75027 : scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4705 : :
4706 : : /* Add additional cost for the peeled instructions in prologue and epilogue
4707 : : loop. (For fully-masked loops there will be no peeling.)
4708 : :
4709 : : FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4710 : : at compile-time - we assume it's vf/2 (the worst would be vf-1).
4711 : :
4712 : : TODO: Build an expression that represents peel_iters for prologue and
4713 : : epilogue to be used in a run-time test. */
4714 : :
4715 : 75027 : bool prologue_need_br_taken_cost = false;
4716 : 75027 : bool prologue_need_br_not_taken_cost = false;
4717 : :
4718 : : /* Calculate peel_iters_prologue. */
4719 : 75027 : if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4720 : : peel_iters_prologue = 0;
4721 : 75027 : else if (npeel < 0)
4722 : : {
4723 : 106 : peel_iters_prologue = assumed_vf / 2;
4724 : 106 : if (dump_enabled_p ())
4725 : 5 : dump_printf (MSG_NOTE, "cost model: "
4726 : : "prologue peel iters set to vf/2.\n");
4727 : :
4728 : : /* If peeled iterations are unknown, count a taken branch and a not taken
4729 : : branch per peeled loop. Even if scalar loop iterations are known,
4730 : : vector iterations are not known since peeled prologue iterations are
4731 : : not known. Hence guards remain the same. */
4732 : : prologue_need_br_taken_cost = true;
4733 : : prologue_need_br_not_taken_cost = true;
4734 : : }
4735 : : else
4736 : : {
4737 : 74921 : peel_iters_prologue = npeel;
4738 : 74921 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4739 : : /* If peeled iterations are known but the number of scalar loop
4740 : : iterations is unknown, count a taken branch per peeled loop. */
4741 : 75027 : prologue_need_br_taken_cost = true;
4742 : : }
4743 : :
4744 : 75027 : bool epilogue_need_br_taken_cost = false;
4745 : 75027 : bool epilogue_need_br_not_taken_cost = false;
4746 : :
4747 : : /* Calculate peel_iters_epilogue. */
4748 : 75027 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4749 : : /* We need to peel exactly one iteration for gaps. */
4750 : 13 : peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4751 : 75014 : else if (npeel < 0)
4752 : : {
4753 : : /* If peeling for alignment is unknown, the loop bound of the main loop
4754 : : becomes unknown. */
4755 : 106 : peel_iters_epilogue = assumed_vf / 2;
4756 : 106 : if (dump_enabled_p ())
4757 : 5 : dump_printf (MSG_NOTE, "cost model: "
4758 : : "epilogue peel iters set to vf/2 because "
4759 : : "peeling for alignment is unknown.\n");
4760 : :
4761 : : /* See the same reason above in peel_iters_prologue calculation. */
4762 : : epilogue_need_br_taken_cost = true;
4763 : : epilogue_need_br_not_taken_cost = true;
4764 : : }
4765 : : else
4766 : : {
4767 : 74908 : peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4768 : 74908 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4769 : : /* If peeled iterations are known but the number of scalar loop
4770 : : iterations is unknown, count a taken branch per peeled loop. */
4771 : 75027 : epilogue_need_br_taken_cost = true;
4772 : : }
4773 : :
4774 : 75027 : stmt_info_for_cost *si;
4775 : 75027 : int j;
4776 : : /* Add costs associated with peel_iters_prologue. */
4777 : 75027 : if (peel_iters_prologue)
4778 : 480 : FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4779 : : {
4780 : 367 : (void) add_stmt_cost (target_cost_data,
4781 : 367 : si->count * peel_iters_prologue, si->kind,
4782 : : si->stmt_info, si->node, si->vectype,
4783 : : si->misalign, vect_prologue);
4784 : : }
4785 : :
4786 : : /* Add costs associated with peel_iters_epilogue. */
4787 : 75027 : if (peel_iters_epilogue)
4788 : 258284 : FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4789 : : {
4790 : 205868 : (void) add_stmt_cost (target_cost_data,
4791 : 205868 : si->count * peel_iters_epilogue, si->kind,
4792 : : si->stmt_info, si->node, si->vectype,
4793 : : si->misalign, vect_epilogue);
4794 : : }
4795 : :
4796 : : /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4797 : :
4798 : 75027 : if (prologue_need_br_taken_cost)
4799 : 107 : (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4800 : : vect_prologue);
4801 : :
4802 : 75027 : if (prologue_need_br_not_taken_cost)
4803 : 106 : (void) add_stmt_cost (target_cost_data, 1,
4804 : : cond_branch_not_taken, vect_prologue);
4805 : :
4806 : 75027 : if (epilogue_need_br_taken_cost)
4807 : 43739 : (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4808 : : vect_epilogue);
4809 : :
4810 : 75027 : if (epilogue_need_br_not_taken_cost)
4811 : 106 : (void) add_stmt_cost (target_cost_data, 1,
4812 : : cond_branch_not_taken, vect_epilogue);
4813 : :
4814 : : /* Take care of special costs for rgroup controls of partial vectors. */
4815 : 13 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4816 : 75040 : && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4817 : : == vect_partial_vectors_avx512))
4818 : : {
4819 : : /* Calculate how many masks we need to generate. */
4820 : 13 : unsigned int num_masks = 0;
4821 : 13 : bool need_saturation = false;
4822 : 54 : for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
4823 : 15 : if (rgm.type)
4824 : : {
4825 : 13 : unsigned nvectors = rgm.factor;
4826 : 13 : num_masks += nvectors;
4827 : 13 : if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
4828 : 13 : < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
4829 : 2 : need_saturation = true;
4830 : : }
4831 : :
4832 : : /* ??? The target isn't able to identify the costs below as
4833 : : producing masks so it cannot penalize cases where we'd run
4834 : : out of mask registers for example. */
4835 : :
4836 : : /* ??? We are also failing to account for smaller vector masks
4837 : : we generate by splitting larger masks in vect_get_loop_mask. */
4838 : :
4839 : : /* In the worst case, we need to generate each mask in the prologue
4840 : : and in the loop body. We need one splat per group and one
4841 : : compare per mask.
4842 : :
4843 : : Sometimes the prologue mask will fold to a constant,
4844 : : so the actual prologue cost might be smaller. However, it's
4845 : : simpler and safer to use the worst-case cost; if this ends up
4846 : : being the tie-breaker between vectorizing or not, then it's
4847 : : probably better not to vectorize. */
4848 : 13 : (void) add_stmt_cost (target_cost_data,
4849 : : num_masks
4850 : 13 : + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4851 : : vector_stmt, NULL, NULL, NULL_TREE, 0,
4852 : : vect_prologue);
4853 : 26 : (void) add_stmt_cost (target_cost_data,
4854 : : num_masks
4855 : 26 : + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4856 : : vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);
4857 : :
4858 : : /* When we need saturation we need it both in the prologue and
4859 : : the epilogue. */
4860 : 13 : if (need_saturation)
4861 : : {
4862 : 2 : (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4863 : : NULL, NULL, NULL_TREE, 0, vect_prologue);
4864 : 2 : (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4865 : : NULL, NULL, NULL_TREE, 0, vect_body);
4866 : : }
4867 : : }
4868 : 0 : else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4869 : 75014 : && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4870 : : == vect_partial_vectors_while_ult))
4871 : : {
4872 : : /* Calculate how many masks we need to generate. */
4873 : : unsigned int num_masks = 0;
4874 : : rgroup_controls *rgm;
4875 : : unsigned int num_vectors_m1;
4876 : 0 : FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
4877 : : num_vectors_m1, rgm)
4878 : 0 : if (rgm->type)
4879 : 0 : num_masks += num_vectors_m1 + 1;
4880 : 0 : gcc_assert (num_masks > 0);
4881 : :
4882 : : /* In the worst case, we need to generate each mask in the prologue
4883 : : and in the loop body. One of the loop body mask instructions
4884 : : replaces the comparison in the scalar loop, and since we don't
4885 : : count the scalar comparison against the scalar body, we shouldn't
4886 : : count that vector instruction against the vector body either.
4887 : :
4888 : : Sometimes we can use unpacks instead of generating prologue
4889 : : masks and sometimes the prologue mask will fold to a constant,
4890 : : so the actual prologue cost might be smaller. However, it's
4891 : : simpler and safer to use the worst-case cost; if this ends up
4892 : : being the tie-breaker between vectorizing or not, then it's
4893 : : probably better not to vectorize. */
4894 : 0 : (void) add_stmt_cost (target_cost_data, num_masks,
4895 : : vector_stmt, NULL, NULL, NULL_TREE, 0,
4896 : : vect_prologue);
4897 : 0 : (void) add_stmt_cost (target_cost_data, num_masks - 1,
4898 : : vector_stmt, NULL, NULL, NULL_TREE, 0,
4899 : : vect_body);
4900 : : }
4901 : 75014 : else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4902 : : {
4903 : : /* Referring to the functions vect_set_loop_condition_partial_vectors
4904 : : and vect_set_loop_controls_directly, we need to generate each
4905 : : length in the prologue and in the loop body if required. Although
4906 : : there are some possible optimizations, we consider the worst case
4907 : : here. */
4908 : :
4909 : 0 : bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4910 : 0 : signed char partial_load_store_bias
4911 : : = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4912 : 0 : bool need_iterate_p
4913 : 0 : = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4914 : 0 : && !vect_known_niters_smaller_than_vf (loop_vinfo));
4915 : :
4916 : : /* Calculate how many statements to be added. */
4917 : 0 : unsigned int prologue_stmts = 0;
4918 : 0 : unsigned int body_stmts = 0;
4919 : :
4920 : 0 : rgroup_controls *rgc;
4921 : 0 : unsigned int num_vectors_m1;
4922 : 0 : FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4923 : 0 : if (rgc->type)
4924 : : {
4925 : : /* May need one SHIFT for nitems_total computation. */
4926 : 0 : unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4927 : 0 : if (nitems != 1 && !niters_known_p)
4928 : 0 : prologue_stmts += 1;
4929 : :
4930 : : /* May need one MAX and one MINUS for wrap around. */
4931 : 0 : if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4932 : 0 : prologue_stmts += 2;
4933 : :
4934 : : /* Need one MAX and one MINUS for each batch limit except for
4935 : : the first one. */
4936 : 0 : prologue_stmts += num_vectors_m1 * 2;
4937 : :
4938 : 0 : unsigned int num_vectors = num_vectors_m1 + 1;
4939 : :
4940 : : /* Need to set up lengths in prologue, only one MIN required
4941 : : for each since start index is zero. */
4942 : 0 : prologue_stmts += num_vectors;
4943 : :
4944 : : /* If we have a non-zero partial load bias, we need one PLUS
4945 : : to adjust the load length. */
4946 : 0 : if (partial_load_store_bias != 0)
4947 : 0 : body_stmts += 1;
4948 : :
4949 : 0 : unsigned int length_update_cost = 0;
4950 : 0 : if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
4951 : : /* For the decrement IV style, each length needs only a single
4952 : : SELECT_VL or MIN to calculate the number of elements to be
4953 : : processed in the current iteration. */
4954 : : length_update_cost = 1;
4955 : : else
4956 : : /* For the increment IV style, each may need two MINs and one MINUS
4957 : : to update the lengths in the body for the next iteration. */
4958 : 0 : length_update_cost = 3;
4959 : :
4960 : 0 : if (need_iterate_p)
4961 : 0 : body_stmts += length_update_cost * num_vectors;
4962 : : }
4963 : :
4964 : 0 : (void) add_stmt_cost (target_cost_data, prologue_stmts,
4965 : : scalar_stmt, vect_prologue);
4966 : 0 : (void) add_stmt_cost (target_cost_data, body_stmts,
4967 : : scalar_stmt, vect_body);
4968 : : }
4969 : :
4970 : : /* FORNOW: The scalar outside cost is incremented in one of the
4971 : : following ways:
4972 : :
4973 : : 1. The vectorizer checks for alignment and aliasing and generates
4974 : : a condition that allows dynamic vectorization. A cost model
4975 : : check is ANDED with the versioning condition. Hence scalar code
4976 : : path now has the added cost of the versioning check.
4977 : :
4978 : : if (cost > th & versioning_check)
4979 : : jmp to vector code
4980 : :
4981 : : Hence run-time scalar is incremented by not-taken branch cost.
4982 : :
4983 : : 2. The vectorizer then checks if a prologue is required. If the
4984 : : cost model check was not done before during versioning, it has to
4985 : : be done before the prologue check.
4986 : :
4987 : : if (cost <= th)
4988 : : prologue = scalar_iters
4989 : : if (prologue == 0)
4990 : : jmp to vector code
4991 : : else
4992 : : execute prologue
4993 : : if (prologue == num_iters)
4994 : : go to exit
4995 : :
4996 : : Hence the run-time scalar cost is incremented by a taken branch,
4997 : : plus a not-taken branch, plus a taken branch cost.
4998 : :
4999 : : 3. The vectorizer then checks if an epilogue is required. If the
5000 : : cost model check was not done before during prologue check, it
5001 : : has to be done with the epilogue check.
5002 : :
5003 : : if (prologue == 0)
5004 : : jmp to vector code
5005 : : else
5006 : : execute prologue
5007 : : if (prologue == num_iters)
5008 : : go to exit
5009 : : vector code:
5010 : : if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
5011 : : jmp to epilogue
5012 : :
5013 : : Hence the run-time scalar cost should be incremented by 2 taken
5014 : : branches.
5015 : :
5016 : : TODO: The back end may reorder the BBS's differently and reverse
5017 : : conditions/branch directions. Change the estimates below to
5018 : : something more reasonable. */
5019 : :
5020 : : /* If the number of iterations is known and we do not do versioning, we can
5021 : : decide whether to vectorize at compile time. Hence the scalar version
5022 : : does not carry cost model guard costs. */
5023 : 30767 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5024 : 105794 : || LOOP_REQUIRES_VERSIONING (loop_vinfo))
5025 : : {
5026 : : /* Cost model check occurs at versioning. */
5027 : 44807 : if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
5028 : 4541 : scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
5029 : : else
5030 : : {
5031 : : /* Cost model check occurs at prologue generation. */
5032 : 40266 : if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
5033 : 28 : scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
5034 : 28 : + vect_get_stmt_cost (cond_branch_not_taken);
5035 : : /* Cost model check occurs at epilogue generation. */
5036 : : else
5037 : 40238 : scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
5038 : : }
5039 : : }
5040 : :
5041 : : /* Complete the target-specific cost calculations. */
5042 : 75027 : loop_vinfo->vector_costs->finish_cost (loop_vinfo->scalar_costs);
5043 : 75027 : vec_prologue_cost = loop_vinfo->vector_costs->prologue_cost ();
5044 : 75027 : vec_inside_cost = loop_vinfo->vector_costs->body_cost ();
5045 : 75027 : vec_epilogue_cost = loop_vinfo->vector_costs->epilogue_cost ();
5046 : 75027 : if (suggested_unroll_factor)
5047 : 75027 : *suggested_unroll_factor
5048 : 75027 : = loop_vinfo->vector_costs->suggested_unroll_factor ();
5049 : :
5050 : 75027 : if (suggested_unroll_factor && *suggested_unroll_factor > 1
5051 : 0 : && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
5052 : 0 : && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
5053 : : *suggested_unroll_factor,
5054 : : LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
5055 : : {
5056 : 0 : if (dump_enabled_p ())
5057 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5058 : : "can't unroll as unrolled vectorization factor larger"
5059 : : " than maximum vectorization factor: "
5060 : : HOST_WIDE_INT_PRINT_UNSIGNED "\n",
5061 : : LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
5062 : 0 : *suggested_unroll_factor = 1;
5063 : : }
5064 : :
5065 : 75027 : vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
5066 : :
5067 : 75027 : if (dump_enabled_p ())
5068 : : {
5069 : 567 : dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
5070 : 567 : dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
5071 : : vec_inside_cost);
5072 : 567 : dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
5073 : : vec_prologue_cost);
5074 : 567 : dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
5075 : : vec_epilogue_cost);
5076 : 567 : dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
5077 : : scalar_single_iter_cost);
5078 : 567 : dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
5079 : : scalar_outside_cost);
5080 : 567 : dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
5081 : : vec_outside_cost);
5082 : 567 : dump_printf (MSG_NOTE, " prologue iterations: %d\n",
5083 : : peel_iters_prologue);
5084 : 567 : dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
5085 : : peel_iters_epilogue);
5086 : : }
5087 : :
5088 : : /* Calculate number of iterations required to make the vector version
5089 : : profitable, relative to the loop bodies only. The following condition
5090 : : must hold true:
5091 : : SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
5092 : : where
5093 : : SIC = scalar iteration cost, VIC = vector iteration cost,
5094 : : VOC = vector outside cost, VF = vectorization factor,
5095 : : NPEEL = prologue iterations + epilogue iterations,
5096 : : SOC = scalar outside cost for run time cost model check. */
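     : : /* A worked example with made-up costs (illustration only): with SIC = 4,
     : :    VIC = 10, VF = 4, NPEEL = 0, VOC = 20 and SOC = 0 the scalar cost
     : :    4 * niters only exceeds the vector cost 10 * (niters / 4) + 20 once
     : :    niters reaches 14, which is the threshold computed below.  */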
5097 : :
5098 : 75027 : int saving_per_viter = (scalar_single_iter_cost * assumed_vf
5099 : 75027 : - vec_inside_cost);
5100 : 75027 : if (saving_per_viter <= 0)
5101 : : {
5102 : 23792 : if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
5103 : 0 : warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
5104 : : "vectorization did not happen for a simd loop");
5105 : :
5106 : 23792 : if (dump_enabled_p ())
5107 : 16 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5108 : : "cost model: the vector iteration cost = %d "
5109 : : "divided by the scalar iteration cost = %d "
5110 : : "is greater or equal to the vectorization factor = %d"
5111 : : ".\n",
5112 : : vec_inside_cost, scalar_single_iter_cost, assumed_vf);
5113 : 23792 : *ret_min_profitable_niters = -1;
5114 : 23792 : *ret_min_profitable_estimate = -1;
5115 : 23792 : return;
5116 : : }
5117 : :
5118 : : /* ??? The "if" arm is written to handle all cases; see below for what
5119 : : we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
5120 : 51235 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5121 : : {
5122 : : /* Rewriting the condition above in terms of the number of
5123 : : vector iterations (vniters) rather than the number of
5124 : : scalar iterations (niters) gives:
5125 : :
5126 : : SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
5127 : :
5128 : : <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
5129 : :
5130 : : For integer N, X and Y when X > 0:
5131 : :
5132 : : N * X > Y <==> N >= (Y /[floor] X) + 1. */
5133 : 7 : int outside_overhead = (vec_outside_cost
5134 : 7 : - scalar_single_iter_cost * peel_iters_prologue
5135 : 7 : - scalar_single_iter_cost * peel_iters_epilogue
5136 : : - scalar_outside_cost);
5137 : : /* We're only interested in cases that require at least one
5138 : : vector iteration. */
5139 : 7 : int min_vec_niters = 1;
5140 : 7 : if (outside_overhead > 0)
5141 : 6 : min_vec_niters = outside_overhead / saving_per_viter + 1;
5142 : :
5143 : 7 : if (dump_enabled_p ())
5144 : 0 : dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
5145 : : min_vec_niters);
5146 : :
5147 : 7 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5148 : : {
5149 : : /* Now that we know the minimum number of vector iterations,
5150 : : find the minimum niters for which the scalar cost is larger:
5151 : :
5152 : : SIC * niters > VIC * vniters + VOC - SOC
5153 : :
5154 : : We know that the minimum niters is no more than
5155 : : vniters * VF + NPEEL, but it might be (and often is) less
5156 : : than that if a partial vector iteration is cheaper than the
5157 : : equivalent scalar code. */
5158 : 7 : int threshold = (vec_inside_cost * min_vec_niters
5159 : 7 : + vec_outside_cost
5160 : 7 : - scalar_outside_cost);
5161 : 7 : if (threshold <= 0)
5162 : : min_profitable_iters = 1;
5163 : : else
5164 : 7 : min_profitable_iters = threshold / scalar_single_iter_cost + 1;
5165 : : }
5166 : : else
5167 : : /* Convert the number of vector iterations into a number of
5168 : : scalar iterations. */
5169 : 0 : min_profitable_iters = (min_vec_niters * assumed_vf
5170 : 0 : + peel_iters_prologue
5171 : : + peel_iters_epilogue);
5172 : : }
5173 : : else
5174 : : {
5175 : 51228 : min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
5176 : 51228 : * assumed_vf
5177 : 51228 : - vec_inside_cost * peel_iters_prologue
5178 : 51228 : - vec_inside_cost * peel_iters_epilogue);
5179 : 51228 : if (min_profitable_iters <= 0)
5180 : : min_profitable_iters = 0;
5181 : : else
5182 : : {
5183 : 43281 : min_profitable_iters /= saving_per_viter;
5184 : :
5185 : 43281 : if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
5186 : 43281 : <= (((int) vec_inside_cost * min_profitable_iters)
5187 : 43281 : + (((int) vec_outside_cost - scalar_outside_cost)
5188 : : * assumed_vf)))
5189 : 43281 : min_profitable_iters++;
5190 : : }
5191 : : }
5192 : :
5193 : 51235 : if (dump_enabled_p ())
5194 : 551 : dump_printf (MSG_NOTE,
5195 : : " Calculated minimum iters for profitability: %d\n",
5196 : : min_profitable_iters);
5197 : :
5198 : 51235 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
5199 : 51228 : && min_profitable_iters < (assumed_vf + peel_iters_prologue))
5200 : : /* We want the vectorized loop to execute at least once. */
5201 : : min_profitable_iters = assumed_vf + peel_iters_prologue;
5202 : 8759 : else if (min_profitable_iters < peel_iters_prologue)
5203 : : /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
5204 : : vectorized loop executes at least once. */
5205 : : min_profitable_iters = peel_iters_prologue;
5206 : :
5207 : 51235 : if (dump_enabled_p ())
5208 : 551 : dump_printf_loc (MSG_NOTE, vect_location,
5209 : : " Runtime profitability threshold = %d\n",
5210 : : min_profitable_iters);
5211 : :
5212 : 51235 : *ret_min_profitable_niters = min_profitable_iters;
5213 : :
5214 : : /* Calculate number of iterations required to make the vector version
5215 : : profitable, relative to the loop bodies only.
5216 : :
5217 : : Non-vectorized variant is SIC * niters and it must win over vector
5218 : : variant on the expected loop trip count. The following condition must hold true:
5219 : : SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
5220 : :
5221 : 51235 : if (vec_outside_cost <= 0)
5222 : : min_profitable_estimate = 0;
5223 : : /* ??? This "else if" arm is written to handle all cases; see below for
5224 : : what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
5225 : 46186 : else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5226 : : {
5227 : : /* This is a repeat of the code above, but with + SOC rather
5228 : : than - SOC. */
5229 : 7 : int outside_overhead = (vec_outside_cost
5230 : 7 : - scalar_single_iter_cost * peel_iters_prologue
5231 : 7 : - scalar_single_iter_cost * peel_iters_epilogue
5232 : : + scalar_outside_cost);
5233 : 7 : int min_vec_niters = 1;
5234 : 7 : if (outside_overhead > 0)
5235 : 7 : min_vec_niters = outside_overhead / saving_per_viter + 1;
5236 : :
5237 : 7 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5238 : : {
5239 : 7 : int threshold = (vec_inside_cost * min_vec_niters
5240 : 7 : + vec_outside_cost
5241 : 7 : + scalar_outside_cost);
5242 : 7 : min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
5243 : : }
5244 : : else
5245 : : min_profitable_estimate = (min_vec_niters * assumed_vf
5246 : : + peel_iters_prologue
5247 : : + peel_iters_epilogue);
5248 : : }
5249 : : else
5250 : : {
5251 : 46179 : min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
5252 : 46179 : * assumed_vf
5253 : 46179 : - vec_inside_cost * peel_iters_prologue
5254 : 46179 : - vec_inside_cost * peel_iters_epilogue)
5255 : 46179 : / ((scalar_single_iter_cost * assumed_vf)
5256 : : - vec_inside_cost);
5257 : : }
5258 : 51235 : min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
5259 : 51235 : if (dump_enabled_p ())
5260 : 551 : dump_printf_loc (MSG_NOTE, vect_location,
5261 : : " Static estimate profitability threshold = %d\n",
5262 : : min_profitable_estimate);
5263 : :
5264 : 51235 : *ret_min_profitable_estimate = min_profitable_estimate;
5265 : : }
5266 : :
5267 : : /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
5268 : : vector elements (not bits) for a vector with NELT elements. */
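     : : /* E.g. (illustration only) for NELT = 8 and OFFSET = 2 the encoded
     : :    selector expands to { 2, 3, 4, 5, 6, 7, 8, 9 }, i.e. shift the vector
     : :    down by two elements, with any wrap-around handled by
     : :    vec_perm_indices.  */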
5269 : : static void
5270 : 2003 : calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
5271 : : vec_perm_builder *sel)
5272 : : {
5273 : : /* The encoding is a single stepped pattern. Any wrap-around is handled
5274 : : by vec_perm_indices. */
5275 : 2003 : sel->new_vector (nelt, 1, 3);
5276 : 8012 : for (unsigned int i = 0; i < 3; i++)
5277 : 6009 : sel->quick_push (i + offset);
5278 : 2003 : }
5279 : :
5280 : : /* Checks whether the target supports whole-vector shifts for vectors of mode
5281 : : MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
5282 : : it supports vec_perm_const with masks for all necessary shift amounts. */
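     : : /* E.g. (illustration only) for an 8-element vector mode without
     : :    vec_shr_optab support this checks permute masks for element shifts
     : :    by 4, 2 and 1.  */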
5283 : : static bool
5284 : 6315 : have_whole_vector_shift (machine_mode mode)
5285 : : {
5286 : 6315 : if (can_implement_p (vec_shr_optab, mode))
5287 : : return true;
5288 : :
5289 : : /* Variable-length vectors should be handled via the optab. */
5290 : 57 : unsigned int nelt;
5291 : 114 : if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5292 : : return false;
5293 : :
5294 : 57 : vec_perm_builder sel;
5295 : 57 : vec_perm_indices indices;
5296 : 290 : for (unsigned int i = nelt / 2; i >= 1; i /= 2)
5297 : : {
5298 : 233 : calc_vec_perm_mask_for_shift (i, nelt, &sel);
5299 : 233 : indices.new_vector (sel, 2, nelt);
5300 : 233 : if (!can_vec_perm_const_p (mode, mode, indices, false))
5301 : : return false;
5302 : : }
5303 : : return true;
5304 : 57 : }
5305 : :
5306 : : /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
5307 : : multiplication operands have differing signs and (b) we intend
5308 : : to emulate the operation using a series of signed DOT_PROD_EXPRs.
5309 : : See vect_emulate_mixed_dot_prod for the actual sequence used. */
5310 : :
5311 : : static bool
5312 : 1704 : vect_is_emulated_mixed_dot_prod (stmt_vec_info stmt_info)
5313 : : {
5314 : 1704 : gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
5315 : 1466 : if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
5316 : : return false;
5317 : :
5318 : 528 : tree rhs1 = gimple_assign_rhs1 (assign);
5319 : 528 : tree rhs2 = gimple_assign_rhs2 (assign);
5320 : 528 : if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
5321 : : return false;
5322 : :
5323 : 120 : gcc_assert (STMT_VINFO_REDUC_VECTYPE_IN (stmt_info));
5324 : 120 : return !directly_supported_p (DOT_PROD_EXPR,
5325 : : STMT_VINFO_VECTYPE (stmt_info),
5326 : : STMT_VINFO_REDUC_VECTYPE_IN (stmt_info),
5327 : 120 : optab_vector_mixed_sign);
5328 : : }
5329 : :
5330 : : /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
5331 : : functions. Design better to avoid maintenance issues. */
5332 : :
5333 : : /* Function vect_model_reduction_cost.
5334 : :
5335 : : Models cost for a reduction operation, including the vector ops
5336 : : generated within the strip-mine loop in some cases, the initial
5337 : : definition before the loop, and the epilogue code that must be generated. */
5338 : :
5339 : : static void
5340 : 48777 : vect_model_reduction_cost (loop_vec_info loop_vinfo,
5341 : : stmt_vec_info stmt_info, internal_fn reduc_fn,
5342 : : vect_reduction_type reduction_type,
5343 : : int ncopies, stmt_vector_for_cost *cost_vec)
5344 : : {
5345 : 48777 : int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
5346 : 48777 : tree vectype;
5347 : 48777 : machine_mode mode;
5348 : 48777 : class loop *loop = NULL;
5349 : :
5350 : 48777 : if (loop_vinfo)
5351 : 48777 : loop = LOOP_VINFO_LOOP (loop_vinfo);
5352 : :
5353 : : /* Condition reductions generate two reductions in the loop. */
5354 : 48777 : if (reduction_type == COND_REDUCTION)
5355 : 224 : ncopies *= 2;
5356 : :
5357 : 48777 : vectype = STMT_VINFO_VECTYPE (stmt_info);
5358 : 48777 : mode = TYPE_MODE (vectype);
5359 : 48777 : stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5360 : :
5361 : 48777 : gimple_match_op op;
5362 : 48777 : if (!gimple_extract_op (orig_stmt_info->stmt, &op))
5363 : 0 : gcc_unreachable ();
5364 : :
5365 : 48777 : if (reduction_type == EXTRACT_LAST_REDUCTION)
5366 : : /* No extra instructions are needed in the prologue. The loop body
5367 : : operations are costed in vectorizable_condition. */
5368 : : inside_cost = 0;
5369 : 48777 : else if (reduction_type == FOLD_LEFT_REDUCTION)
5370 : : {
5371 : : /* No extra instructions needed in the prologue. */
5372 : 3791 : prologue_cost = 0;
5373 : :
5374 : 3791 : if (reduc_fn != IFN_LAST)
5375 : : /* Count one reduction-like operation per vector. */
5376 : 0 : inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
5377 : : stmt_info, 0, vect_body);
5378 : : else
5379 : : {
5380 : : /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
5381 : 3791 : unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
5382 : 3791 : inside_cost = record_stmt_cost (cost_vec, nelements,
5383 : : vec_to_scalar, stmt_info, 0,
5384 : : vect_body);
5385 : 3791 : inside_cost += record_stmt_cost (cost_vec, nelements,
5386 : : scalar_stmt, stmt_info, 0,
5387 : : vect_body);
5388 : : }
5389 : : }
5390 : : else
5391 : : {
5392 : : /* Add in the cost of the initial definitions. */
5393 : 44986 : int prologue_stmts;
5394 : 44986 : if (reduction_type == COND_REDUCTION)
5395 : : /* For cond reductions we have four vectors: initial index, step,
5396 : : initial result of the data reduction, initial value of the index
5397 : : reduction. */
5398 : : prologue_stmts = 4;
5399 : : else
5400 : : /* We need the initial reduction value. */
5401 : 44762 : prologue_stmts = 1;
5402 : 44986 : prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
5403 : : scalar_to_vec, stmt_info, 0,
5404 : : vect_prologue);
5405 : : }
5406 : :
5407 : : /* Determine cost of epilogue code.
5408 : :
5409 : : We have a reduction operator that will reduce the vector in one statement.
5410 : : Also requires scalar extract. */
5411 : :
5412 : 48777 : if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
5413 : : {
5414 : 48628 : if (reduc_fn != IFN_LAST)
5415 : : {
5416 : 37133 : if (reduction_type == COND_REDUCTION)
5417 : : {
5418 : : /* An EQ stmt and a COND_EXPR stmt. */
5419 : 9 : epilogue_cost += record_stmt_cost (cost_vec, 2,
5420 : : vector_stmt, stmt_info, 0,
5421 : : vect_epilogue);
5422 : : /* Reduction of the max index and a reduction of the found
5423 : : values. */
5424 : 9 : epilogue_cost += record_stmt_cost (cost_vec, 2,
5425 : : vec_to_scalar, stmt_info, 0,
5426 : : vect_epilogue);
5427 : : /* A broadcast of the max value. */
5428 : 9 : epilogue_cost += record_stmt_cost (cost_vec, 1,
5429 : : scalar_to_vec, stmt_info, 0,
5430 : : vect_epilogue);
5431 : : }
5432 : : else
5433 : : {
5434 : 37124 : epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
5435 : : stmt_info, 0, vect_epilogue);
5436 : 37124 : epilogue_cost += record_stmt_cost (cost_vec, 1,
5437 : : vec_to_scalar, stmt_info, 0,
5438 : : vect_epilogue);
5439 : : }
5440 : : }
5441 : 11495 : else if (reduction_type == COND_REDUCTION)
5442 : : {
5443 : 215 : unsigned estimated_nunits = vect_nunits_for_cost (vectype);
5444 : : /* Extraction of scalar elements. */
5445 : 430 : epilogue_cost += record_stmt_cost (cost_vec,
5446 : 215 : 2 * estimated_nunits,
5447 : : vec_to_scalar, stmt_info, 0,
5448 : : vect_epilogue);
5449 : : /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
5450 : 215 : epilogue_cost += record_stmt_cost (cost_vec,
5451 : 215 : 2 * estimated_nunits - 3,
5452 : : scalar_stmt, stmt_info, 0,
5453 : : vect_epilogue);
5454 : : }
5455 : 11280 : else if (reduction_type == EXTRACT_LAST_REDUCTION
5456 : 11280 : || reduction_type == FOLD_LEFT_REDUCTION)
5457 : : /* No extra instructions need in the epilogue. */
5458 : : ;
5459 : : else
5460 : : {
5461 : 7489 : int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5462 : 7489 : tree bitsize = TYPE_SIZE (op.type);
5463 : 7489 : int element_bitsize = tree_to_uhwi (bitsize);
5464 : 7489 : int nelements = vec_size_in_bits / element_bitsize;
5465 : :
5466 : 7489 : if (op.code == COND_EXPR)
5467 : 21 : op.code = MAX_EXPR;
5468 : :
5469 : : /* We have a whole vector shift available. */
5470 : 784 : if (VECTOR_MODE_P (mode)
5471 : 7489 : && directly_supported_p (op.code, vectype)
5472 : 12129 : && have_whole_vector_shift (mode))
5473 : : {
5474 : : /* Final reduction via vector shifts and the reduction operator.
5475 : : Also requires scalar extract. */
5476 : 13920 : epilogue_cost += record_stmt_cost (cost_vec,
5477 : 9280 : exact_log2 (nelements) * 2,
5478 : : vector_stmt, stmt_info, 0,
5479 : : vect_epilogue);
5480 : 4640 : epilogue_cost += record_stmt_cost (cost_vec, 1,
5481 : : vec_to_scalar, stmt_info, 0,
5482 : : vect_epilogue);
5483 : : }
5484 : : else
5485 : : /* Use extracts and reduction op for final reduction. For N
5486 : : elements, we have N extracts and N-1 reduction ops. */
5487 : 2849 : epilogue_cost += record_stmt_cost (cost_vec,
5488 : 2849 : nelements + nelements - 1,
5489 : : vector_stmt, stmt_info, 0,
5490 : : vect_epilogue);
5491 : : }
5492 : : }
5493 : :
5494 : 48777 : if (dump_enabled_p ())
5495 : 2945 : dump_printf (MSG_NOTE,
5496 : : "vect_model_reduction_cost: inside_cost = %d, "
5497 : : "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
5498 : : prologue_cost, epilogue_cost);
5499 : 48777 : }
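As a quick check on the two epilogue-cost formulas recorded above, here is a small editorial sketch (not part of the GCC sources; the eight-element vector width is an assumption) that evaluates both branches for one reduction:

    #include <stdio.h>

    int main (void)
    {
      int nelements = 8;                          /* e.g. eight shorts in a 128-bit vector */
      /* Whole-vector shift available: exact_log2 (nelements) shift/op pairs
         plus one scalar extract.  */
      int shift_epilogue = 3 /* exact_log2 (8) */ * 2 + 1;
      /* Fallback: nelements extracts plus nelements - 1 reduction ops.  */
      int extract_epilogue = nelements + (nelements - 1);
      printf ("shift: %d stmts, extract: %d stmts\n",
              shift_epilogue, extract_epilogue);  /* shift: 7, extract: 15 */
      return 0;
    }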
5500 : :
5501 : : /* SEQ is a sequence of instructions that initialize the reduction
5502 : : described by REDUC_INFO. Emit them in the appropriate place. */
5503 : :
5504 : : static void
5505 : 371 : vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
5506 : : stmt_vec_info reduc_info, gimple *seq)
5507 : : {
5508 : 371 : if (reduc_info->reused_accumulator)
5509 : : {
5510 : : /* When reusing an accumulator from the main loop, we only need
5511 : : initialization instructions if the main loop can be skipped.
5512 : : In that case, emit the initialization instructions at the end
5513 : : of the guard block that does the skip. */
5514 : 16 : edge skip_edge = loop_vinfo->skip_main_loop_edge;
5515 : 16 : gcc_assert (skip_edge);
5516 : 16 : gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
5517 : 16 : gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5518 : : }
5519 : : else
5520 : : {
5521 : : /* The normal case: emit the initialization instructions on the
5522 : : preheader edge. */
5523 : 355 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5524 : 355 : gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
5525 : : }
5526 : 371 : }
5527 : :
5528 : : /* Function get_initial_def_for_reduction
5529 : :
5530 : : Input:
5531 : : REDUC_INFO - the info_for_reduction
5532 : : INIT_VAL - the initial value of the reduction variable
5533 : : NEUTRAL_OP - a value that has no effect on the reduction, as per
5534 : : neutral_op_for_reduction
5535 : :
5536 : : Output:
5537 : : Return a vector variable, initialized according to the operation that
5538 : : STMT_VINFO performs. This vector will be used as the initial value
5539 : : of the vector of partial results.
5540 : :
5541 : : The value we need is a vector in which element 0 has value INIT_VAL
5542 : : and every other element has value NEUTRAL_OP. */
5543 : :
5544 : : static tree
5545 : 0 : get_initial_def_for_reduction (loop_vec_info loop_vinfo,
5546 : : stmt_vec_info reduc_info,
5547 : : tree init_val, tree neutral_op)
5548 : : {
5549 : 0 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5550 : 0 : tree scalar_type = TREE_TYPE (init_val);
5551 : 0 : tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
5552 : 0 : tree init_def;
5553 : 0 : gimple_seq stmts = NULL;
5554 : :
5555 : 0 : gcc_assert (vectype);
5556 : :
5557 : 0 : gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
5558 : : || SCALAR_FLOAT_TYPE_P (scalar_type));
5559 : :
5560 : 0 : gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
5561 : : || loop == (gimple_bb (reduc_info->stmt))->loop_father);
5562 : :
5563 : 0 : if (operand_equal_p (init_val, neutral_op))
5564 : : {
5565 : : /* If both elements are equal then the vector described above is
5566 : : just a splat. */
5567 : 0 : neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5568 : 0 : init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
5569 : : }
5570 : : else
5571 : : {
5572 : 0 : neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5573 : 0 : init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
5574 : 0 : if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
5575 : : {
5576 : : /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
5577 : : element 0. */
5578 : : init_def = gimple_build_vector_from_val (&stmts, vectype,
5579 : : neutral_op);
5580 : : init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
5581 : : vectype, init_def, init_val);
5582 : : }
5583 : : else
5584 : : {
5585 : : /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */
5586 : 0 : tree_vector_builder elts (vectype, 1, 2);
5587 : 0 : elts.quick_push (init_val);
5588 : 0 : elts.quick_push (neutral_op);
5589 : 0 : init_def = gimple_build_vector (&stmts, &elts);
5590 : 0 : }
5591 : : }
5592 : :
5593 : 0 : if (stmts)
5594 : 0 : vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
5595 : 0 : return init_def;
5596 : : }
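A minimal scalar model of the vector built above, assuming a four-lane PLUS reduction whose neutral value is 0 (an editorial sketch; the array contents are hypothetical): lane 0 carries INIT_VAL and the other lanes carry NEUTRAL_OP, so reducing the final vector of partial results reproduces the scalar answer.

    #include <stdio.h>

    int main (void)
    {
      int a[8] = {1, 2, 3, 4, 5, 6, 7, 8};
      int init_val = 100;                     /* scalar initial value of the reduction */
      int partial[4] = {init_val, 0, 0, 0};   /* {INIT_VAL, NEUTRAL_OP, ...} for PLUS */

      for (int i = 0; i < 8; i += 4)          /* vectorized loop, one lane per slot */
        for (int lane = 0; lane < 4; ++lane)
          partial[lane] += a[i + lane];

      int result = partial[0] + partial[1] + partial[2] + partial[3];
      printf ("%d\n", result);                /* 136, same as the scalar loop */
      return 0;
    }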
5597 : :
5598 : : /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
5599 : : which performs a reduction involving GROUP_SIZE scalar statements.
5600 : : NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
5601 : : is nonnull, introducing extra elements of that value will not change the
5602 : : result. */
5603 : :
5604 : : static void
5605 : 20563 : get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
5606 : : stmt_vec_info reduc_info,
5607 : : vec<tree> *vec_oprnds,
5608 : : unsigned int number_of_vectors,
5609 : : unsigned int group_size, tree neutral_op)
5610 : : {
5611 : 20563 : vec<tree> &initial_values = reduc_info->reduc_initial_values;
5612 : 20563 : unsigned HOST_WIDE_INT nunits;
5613 : 20563 : unsigned j, number_of_places_left_in_vector;
5614 : 20563 : tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
5615 : 20563 : unsigned int i;
5616 : :
5617 : 41126 : gcc_assert (group_size == initial_values.length () || neutral_op);
5618 : :
5619 : : /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5620 : : created vectors. It is greater than 1 if unrolling is performed.
5621 : :
5622 : : For example, we have two scalar operands, s1 and s2 (e.g., group of
5623 : : strided accesses of size two), while NUNITS is four (i.e., four scalars
5624 : : of this type can be packed in a vector). The output vector will contain
5625 : : two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
5626 : : will be 2).
5627 : :
5628 : : If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
5629 : : vectors containing the operands.
5630 : :
5631 : : For example, NUNITS is four as before, and the group size is 8
5632 : : (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
5633 : : {s5, s6, s7, s8}. */
5634 : :
5635 : 20563 : if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5636 : : nunits = group_size;
5637 : :
5638 : 20563 : number_of_places_left_in_vector = nunits;
5639 : 20563 : bool constant_p = true;
5640 : 20563 : tree_vector_builder elts (vector_type, nunits, 1);
5641 : 20563 : elts.quick_grow (nunits);
5642 : 20563 : gimple_seq ctor_seq = NULL;
5643 : 20563 : if (neutral_op
5644 : 41039 : && !useless_type_conversion_p (TREE_TYPE (vector_type),
5645 : 20476 : TREE_TYPE (neutral_op)))
5646 : 1 : neutral_op = gimple_convert (&ctor_seq,
5647 : 1 : TREE_TYPE (vector_type),
5648 : : neutral_op);
5649 : 195930 : for (j = 0; j < nunits * number_of_vectors; ++j)
5650 : : {
5651 : 175367 : tree op;
5652 : 175367 : i = j % group_size;
5653 : :
5654 : : /* Get the def before the loop. In a reduction chain we have only
5655 : : one initial value; otherwise we have as many as there are PHIs in the group. */
5656 : 175367 : if (i >= initial_values.length () || (j > i && neutral_op))
5657 : : op = neutral_op;
5658 : : else
5659 : : {
5660 : 42392 : if (!useless_type_conversion_p (TREE_TYPE (vector_type),
5661 : 21196 : TREE_TYPE (initial_values[i])))
5662 : 6 : initial_values[i] = gimple_convert (&ctor_seq,
5663 : 3 : TREE_TYPE (vector_type),
5664 : 3 : initial_values[i]);
5665 : 21196 : op = initial_values[i];
5666 : : }
5667 : :
5668 : : /* Create 'vect_ = {op0,op1,...,opn}'. */
5669 : 175367 : number_of_places_left_in_vector--;
5670 : 175367 : elts[nunits - number_of_places_left_in_vector - 1] = op;
5671 : 175367 : if (!CONSTANT_CLASS_P (op))
5672 : 2144 : constant_p = false;
5673 : :
5674 : 175367 : if (number_of_places_left_in_vector == 0)
5675 : : {
5676 : 21554 : tree init;
5677 : 43108 : if (constant_p && !neutral_op
5678 : 43057 : ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5679 : 21554 : : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5680 : : /* Build the vector directly from ELTS. */
5681 : 21554 : init = gimple_build_vector (&ctor_seq, &elts);
5682 : 0 : else if (neutral_op)
5683 : : {
5684 : : /* Build a vector of the neutral value and shift the
5685 : : other elements into place. */
5686 : 0 : init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5687 : : neutral_op);
5688 : 0 : int k = nunits;
5689 : 0 : while (k > 0 && operand_equal_p (elts[k - 1], neutral_op))
5690 : : k -= 1;
5691 : 0 : while (k > 0)
5692 : : {
5693 : 0 : k -= 1;
5694 : 0 : init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5695 : 0 : vector_type, init, elts[k]);
5696 : : }
5697 : : }
5698 : : else
5699 : : {
5700 : : /* First time round, duplicate ELTS to fill the
5701 : : required number of vectors. */
5702 : 0 : duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5703 : : elts, number_of_vectors, *vec_oprnds);
5704 : 0 : break;
5705 : : }
5706 : 21554 : vec_oprnds->quick_push (init);
5707 : :
5708 : 21554 : number_of_places_left_in_vector = nunits;
5709 : 21554 : elts.new_vector (vector_type, nunits, 1);
5710 : 21554 : elts.quick_grow (nunits);
5711 : 21554 : constant_p = true;
5712 : : }
5713 : : }
5714 : 20563 : if (ctor_seq != NULL)
5715 : 371 : vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
5716 : 20563 : }
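For the grouped case handled above, a rough sketch under the assumption of two interleaved sum reductions, four lanes and a neutral value of 0: only the first occurrence of each initial value is kept, and the remaining lanes receive the neutral value.

    #include <stdio.h>

    int main (void)
    {
      int a[8] = {1, 2, 3, 4, 5, 6, 7, 8};    /* interleaved inputs for s1 and s2 */
      int vec[4] = {10, 20, 0, 0};            /* {s1_init, s2_init, neutral, neutral} */

      for (int i = 0; i < 8; i += 4)
        for (int lane = 0; lane < 4; ++lane)
          vec[lane] += a[i + lane];

      int s1 = vec[0] + vec[2];               /* lanes 0 and 2 belong to s1: 26 */
      int s2 = vec[1] + vec[3];               /* lanes 1 and 3 belong to s2: 40 */
      printf ("%d %d\n", s1, s2);
      return 0;
    }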
5717 : :
5718 : : /* For a statement STMT_INFO taking part in a reduction operation return
5719 : : the stmt_vec_info the meta information is stored on. */
5720 : :
5721 : : stmt_vec_info
5722 : 127458 : info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
5723 : : {
5724 : 127458 : stmt_info = vect_orig_stmt (stmt_info);
5725 : 127458 : gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
5726 : 127458 : if (!is_a <gphi *> (stmt_info->stmt)
5727 : 127458 : || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
5728 : 52001 : stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5729 : 127458 : gphi *phi = as_a <gphi *> (stmt_info->stmt);
5730 : 127458 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5731 : : {
5732 : 580 : if (gimple_phi_num_args (phi) == 1)
5733 : 241 : stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5734 : : }
5735 : 126878 : else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5736 : : {
5737 : 2556 : stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
5738 : 2556 : if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
5739 : 127458 : stmt_info = info;
5740 : : }
5741 : 127458 : return stmt_info;
5742 : : }
5743 : :
5744 : : /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5745 : : REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
5746 : : return false. */
5747 : :
5748 : : static bool
5749 : 20369 : vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5750 : : stmt_vec_info reduc_info)
5751 : : {
5752 : 20369 : loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5753 : 20369 : if (!main_loop_vinfo)
5754 : : return false;
5755 : :
5756 : 4621 : if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5757 : : return false;
5758 : :
5759 : 4603 : unsigned int num_phis = reduc_info->reduc_initial_values.length ();
5760 : 4603 : auto_vec<tree, 16> main_loop_results (num_phis);
5761 : 4603 : auto_vec<tree, 16> initial_values (num_phis);
5762 : 4603 : if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5763 : : {
5764 : : /* The epilogue loop can be entered either from the main loop or
5765 : : from an earlier guard block. */
5766 : 4430 : edge skip_edge = loop_vinfo->skip_main_loop_edge;
5767 : 17736 : for (tree incoming_value : reduc_info->reduc_initial_values)
5768 : : {
5769 : : /* Look for:
5770 : :
5771 : : INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5772 : : INITIAL_VALUE(guard block)>. */
5773 : 4446 : gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5774 : :
5775 : 4446 : gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5776 : 4446 : gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5777 : :
5778 : 4446 : tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5779 : 4446 : tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5780 : :
5781 : 4446 : main_loop_results.quick_push (from_main_loop);
5782 : 4446 : initial_values.quick_push (from_skip);
5783 : : }
5784 : : }
5785 : : else
5786 : : /* The main loop dominates the epilogue loop. */
5787 : 173 : main_loop_results.splice (reduc_info->reduc_initial_values);
5788 : :
5789 : : /* See if the main loop has the kind of accumulator we need. */
5790 : 4603 : vect_reusable_accumulator *accumulator
5791 : 4603 : = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5792 : 4603 : if (!accumulator
5793 : 9192 : || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
5794 : 13795 : || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5795 : : accumulator->reduc_info->reduc_scalar_results.begin ()))
5796 : : return false;
5797 : :
5798 : : /* Handle the case where we can reduce wider vectors to narrower ones. */
5799 : 4590 : tree vectype = STMT_VINFO_VECTYPE (reduc_info);
5800 : 4590 : tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5801 : 4590 : unsigned HOST_WIDE_INT m;
5802 : 4590 : if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5803 : 4590 : TYPE_VECTOR_SUBPARTS (vectype), &m))
5804 : 0 : return false;
5805 : : /* Check the intermediate vector types and operations are available. */
5806 : 4590 : tree prev_vectype = old_vectype;
5807 : 4590 : poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5808 : 13252 : while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5809 : : {
5810 : 4592 : intermediate_nunits = exact_div (intermediate_nunits, 2);
5811 : 4592 : tree intermediate_vectype = get_related_vectype_for_scalar_type
5812 : 4592 : (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5813 : 4592 : if (!intermediate_vectype
5814 : 4592 : || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
5815 : : intermediate_vectype)
5816 : 8664 : || !can_vec_extract (TYPE_MODE (prev_vectype),
5817 : 4072 : TYPE_MODE (intermediate_vectype)))
5818 : : return false;
5819 : : prev_vectype = intermediate_vectype;
5820 : : }
5821 : :
5822 : : /* Non-SLP reductions might apply an adjustment after the reduction
5823 : : operation, in order to simplify the initialization of the accumulator.
5824 : : If the epilogue loop carries on from where the main loop left off,
5825 : : it should apply the same adjustment to the final reduction result.
5826 : :
5827 : : If the epilogue loop can also be entered directly (rather than via
5828 : : the main loop), we need to be able to handle that case in the same way,
5829 : : with the same adjustment. (In principle we could add a PHI node
5830 : : to select the correct adjustment, but in practice that shouldn't be
5831 : : necessary.) */
5832 : 4070 : tree main_adjustment
5833 : 4070 : = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5834 : 4070 : if (loop_vinfo->main_loop_edge && main_adjustment)
5835 : : {
5836 : 3582 : gcc_assert (num_phis == 1);
5837 : 3582 : tree initial_value = initial_values[0];
5838 : : /* Check that we can use INITIAL_VALUE as the adjustment and
5839 : : initialize the accumulator with a neutral value instead. */
5840 : 3582 : if (!operand_equal_p (initial_value, main_adjustment))
5841 : 101 : return false;
5842 : 3481 : code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5843 : 3481 : initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5844 : : code, initial_value);
5845 : : }
5846 : 3969 : STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5847 : 3969 : reduc_info->reduc_initial_values.truncate (0);
5848 : 3969 : reduc_info->reduc_initial_values.splice (initial_values);
5849 : 3969 : reduc_info->reused_accumulator = accumulator;
5850 : 3969 : return true;
5851 : 4603 : }
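A scalar sketch of the accumulator reuse this function enables (editorial illustration; the vectorization factors are assumptions): the epilogue loop keeps accumulating into the main loop's vector of partial sums, narrowed to its own width, so one final reduction covers both loops. The narrowing step corresponds to the halving done by vect_create_partial_epilog below.

    #include <stdio.h>

    int main (void)
    {
      int a[11] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
      int n = 11, i = 0;

      int acc[4] = {0, 0, 0, 0};              /* main loop accumulator, VF = 4 */
      for (; i + 4 <= n; i += 4)
        for (int lane = 0; lane < 4; ++lane)
          acc[lane] += a[i + lane];

      int acc2[2] = {acc[0] + acc[2],         /* reuse: narrow the accumulator */
                     acc[1] + acc[3]};        /* to the epilogue's VF = 2      */
      for (; i + 2 <= n; i += 2)              /* epilogue loop keeps accumulating */
        for (int lane = 0; lane < 2; ++lane)
          acc2[lane] += a[i + lane];

      int sum = acc2[0] + acc2[1];            /* one final reduction for both loops */
      for (; i < n; ++i)
        sum += a[i];                          /* scalar tail */
      printf ("%d\n", sum);                   /* 66 */
      return 0;
    }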
5852 : :
5853 : : /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5854 : : CODE, appending the generated stmts to SEQ. Returns a vector def of VECTYPE. */
5855 : :
5856 : : static tree
5857 : 5644 : vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5858 : : gimple_seq *seq)
5859 : : {
5860 : 5644 : unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5861 : 5644 : unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5862 : 5644 : tree stype = TREE_TYPE (vectype);
5863 : 5644 : tree new_temp = vec_def;
5864 : 9650 : while (nunits > nunits1)
5865 : : {
5866 : 4006 : nunits /= 2;
5867 : 4006 : tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5868 : : stype, nunits);
5869 : 4006 : unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5870 : :
5871 : : /* The target has to make sure we support lowpart/highpart
5872 : : extraction, either via direct vector extract or through
5873 : : integer mode punning. */
5874 : 4006 : tree dst1, dst2;
5875 : 4006 : gimple *epilog_stmt;
5876 : 4006 : if (convert_optab_handler (vec_extract_optab,
5877 : 4006 : TYPE_MODE (TREE_TYPE (new_temp)),
5878 : 4006 : TYPE_MODE (vectype1))
5879 : : != CODE_FOR_nothing)
5880 : : {
5881 : : /* Extract sub-vectors directly once vec_extract becomes
5882 : : a conversion optab. */
5883 : 2605 : dst1 = make_ssa_name (vectype1);
5884 : 2605 : epilog_stmt
5885 : 5210 : = gimple_build_assign (dst1, BIT_FIELD_REF,
5886 : : build3 (BIT_FIELD_REF, vectype1,
5887 : 2605 : new_temp, TYPE_SIZE (vectype1),
5888 : 2605 : bitsize_int (0)));
5889 : 2605 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5890 : 2605 : dst2 = make_ssa_name (vectype1);
5891 : 2605 : epilog_stmt
5892 : 2605 : = gimple_build_assign (dst2, BIT_FIELD_REF,
5893 : : build3 (BIT_FIELD_REF, vectype1,
5894 : 2605 : new_temp, TYPE_SIZE (vectype1),
5895 : 2605 : bitsize_int (bitsize)));
5896 : 2605 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5897 : : }
5898 : : else
5899 : : {
5900 : : /* Extract via punning to appropriately sized integer mode
5901 : : vector. */
5902 : 1401 : tree eltype = build_nonstandard_integer_type (bitsize, 1);
5903 : 1401 : tree etype = build_vector_type (eltype, 2);
5904 : 2802 : gcc_assert (convert_optab_handler (vec_extract_optab,
5905 : : TYPE_MODE (etype),
5906 : : TYPE_MODE (eltype))
5907 : : != CODE_FOR_nothing);
5908 : 1401 : tree tem = make_ssa_name (etype);
5909 : 1401 : epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5910 : : build1 (VIEW_CONVERT_EXPR,
5911 : : etype, new_temp));
5912 : 1401 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5913 : 1401 : new_temp = tem;
5914 : 1401 : tem = make_ssa_name (eltype);
5915 : 1401 : epilog_stmt
5916 : 2802 : = gimple_build_assign (tem, BIT_FIELD_REF,
5917 : : build3 (BIT_FIELD_REF, eltype,
5918 : 1401 : new_temp, TYPE_SIZE (eltype),
5919 : 1401 : bitsize_int (0)));
5920 : 1401 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5921 : 1401 : dst1 = make_ssa_name (vectype1);
5922 : 1401 : epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5923 : : build1 (VIEW_CONVERT_EXPR,
5924 : : vectype1, tem));
5925 : 1401 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5926 : 1401 : tem = make_ssa_name (eltype);
5927 : 1401 : epilog_stmt
5928 : 1401 : = gimple_build_assign (tem, BIT_FIELD_REF,
5929 : : build3 (BIT_FIELD_REF, eltype,
5930 : 1401 : new_temp, TYPE_SIZE (eltype),
5931 : 1401 : bitsize_int (bitsize)));
5932 : 1401 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5933 : 1401 : dst2 = make_ssa_name (vectype1);
5934 : 1401 : epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5935 : : build1 (VIEW_CONVERT_EXPR,
5936 : : vectype1, tem));
5937 : 1401 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5938 : : }
5939 : :
5940 : 4006 : new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5941 : : }
5942 : :
5943 : 5644 : return new_temp;
5944 : : }
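The repeated halving performed above can be modeled in scalar code as follows (an editorial sketch, assuming an eight-lane input narrowed to two lanes with a PLUS reduction); each step combines the low and high halves with the reduction operation.

    #include <stdio.h>

    int main (void)
    {
      int v8[8] = {1, 2, 3, 4, 5, 6, 7, 8};   /* wide vector of partial results */
      int v4[4], v2[2];
      for (int i = 0; i < 4; ++i)
        v4[i] = v8[i] + v8[i + 4];            /* 8 lanes -> 4 lanes */
      for (int i = 0; i < 2; ++i)
        v2[i] = v4[i] + v4[i + 2];            /* 4 lanes -> 2 lanes */
      printf ("%d %d\n", v2[0], v2[1]);       /* 16 20 */
      return 0;
    }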
5945 : :
5946 : : /* Function vect_create_epilog_for_reduction
5947 : :
5948 : : Create code at the loop-epilog to finalize the result of a reduction
5949 : : computation.
5950 : :
5951 : : STMT_INFO is the scalar reduction stmt that is being vectorized.
5952 : : SLP_NODE is an SLP node containing a group of reduction statements. The
5953 : : first one in this group is STMT_INFO.
5954 : : SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5955 : : REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5956 : : (counting from 0)
5957 : : LOOP_EXIT is the edge to update in the merge block. In the case of a single
5958 : : exit this edge is always the main loop exit.
5959 : :
5960 : : This function:
5961 : : 1. Completes the reduction def-use cycles.
5962 : : 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5963 : : by calling the function specified by REDUC_FN if available, or by
5964 : : other means (whole-vector shifts or a scalar loop).
5965 : : The function also creates a new phi node at the loop exit to preserve
5966 : : loop-closed form, as illustrated below.
5967 : :
5968 : : The flow at the entry to this function:
5969 : :
5970 : : loop:
5971 : : vec_def = phi <vec_init, null> # REDUCTION_PHI
5972 : : VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5973 : : s_loop = scalar_stmt # (scalar) STMT_INFO
5974 : : loop_exit:
5975 : : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5976 : : use <s_out0>
5977 : : use <s_out0>
5978 : :
5979 : : The above is transformed by this function into:
5980 : :
5981 : : loop:
5982 : : vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5983 : : VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5984 : : s_loop = scalar_stmt # (scalar) STMT_INFO
5985 : : loop_exit:
5986 : : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5987 : : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5988 : : v_out2 = reduce <v_out1>
5989 : : s_out3 = extract_field <v_out2, 0>
5990 : : s_out4 = adjust_result <s_out3>
5991 : : use <s_out4>
5992 : : use <s_out4>
5993 : : */
5994 : :
5995 : : static void
5996 : 20874 : vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5997 : : stmt_vec_info stmt_info,
5998 : : slp_tree slp_node,
5999 : : slp_instance slp_node_instance,
6000 : : edge loop_exit)
6001 : : {
6002 : 20874 : stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
6003 : 20874 : gcc_assert (reduc_info->is_reduc_info);
6004 : : /* For double reductions we need to get at the inner loop reduction
6005 : : stmt which has the meta info attached. Our stmt_info is that of the
6006 : : loop-closed PHI of the inner loop which we remember as
6007 : : def for the reduction PHI generation. */
6008 : 20874 : bool double_reduc = false;
6009 : 20874 : stmt_vec_info rdef_info = stmt_info;
6010 : 20874 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
6011 : : {
6012 : 66 : double_reduc = true;
6013 : 66 : stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
6014 : 66 : (stmt_info->stmt, 0));
6015 : 66 : stmt_info = vect_stmt_to_vectorize (stmt_info);
6016 : : }
6017 : 20874 : code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
6018 : 20874 : internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
6019 : 20874 : tree vectype;
6020 : 20874 : machine_mode mode;
6021 : 20874 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
6022 : 20874 : basic_block exit_bb;
6023 : 20874 : tree scalar_dest;
6024 : 20874 : tree scalar_type;
6025 : 20874 : gimple *new_phi = NULL, *phi = NULL;
6026 : 20874 : gimple_stmt_iterator exit_gsi;
6027 : 20874 : tree new_temp = NULL_TREE, new_name, new_scalar_dest;
6028 : 20874 : gimple *epilog_stmt = NULL;
6029 : 20874 : gimple *exit_phi;
6030 : 20874 : tree bitsize;
6031 : 20874 : tree def;
6032 : 20874 : tree orig_name, scalar_result;
6033 : 20874 : imm_use_iterator imm_iter, phi_imm_iter;
6034 : 20874 : use_operand_p use_p, phi_use_p;
6035 : 20874 : gimple *use_stmt;
6036 : 20874 : auto_vec<tree> reduc_inputs;
6037 : 20874 : int j, i;
6038 : 20874 : vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
6039 : 20874 : unsigned int group_size = 1, k;
6040 : : /* SLP reduction without reduction chain, e.g.,
6041 : : # a1 = phi <a2, a0>
6042 : : # b1 = phi <b2, b0>
6043 : : a2 = operation (a1)
6044 : : b2 = operation (b1) */
6045 : 20874 : bool slp_reduc
6046 : : = (slp_node
6047 : 20874 : && !REDUC_GROUP_FIRST_ELEMENT (STMT_VINFO_REDUC_DEF (reduc_info)));
6048 : 20874 : bool direct_slp_reduc;
6049 : 41748 : tree induction_index = NULL_TREE;
6050 : :
6051 : 20874 : if (slp_node)
6052 : 20874 : group_size = SLP_TREE_LANES (slp_node);
6053 : :
6054 : 20874 : if (nested_in_vect_loop_p (loop, stmt_info))
6055 : : {
6056 : 66 : outer_loop = loop;
6057 : 66 : loop = loop->inner;
6058 : 66 : gcc_assert (double_reduc);
6059 : : }
6060 : :
6061 : 20874 : vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
6062 : 20874 : gcc_assert (vectype);
6063 : 20874 : mode = TYPE_MODE (vectype);
6064 : :
6065 : 20874 : tree induc_val = NULL_TREE;
6066 : 20874 : tree adjustment_def = NULL;
6067 : : /* Optimize: for induction condition reduction, if we can't use zero
6068 : : for induc_val, use initial_def. */
6069 : 20874 : if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6070 : 66 : induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
6071 : 20808 : else if (double_reduc)
6072 : : ;
6073 : : else
6074 : 20742 : adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
6075 : :
6076 : 20874 : stmt_vec_info single_live_out_stmt[] = { stmt_info };
6077 : 20874 : array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
6078 : 20874 : if (slp_reduc)
6079 : : /* All statements produce live-out values. */
6080 : 41414 : live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
6081 : :
6082 : 20874 : unsigned vec_num;
6083 : 20874 : int ncopies;
6084 : 20874 : if (slp_node)
6085 : : {
6086 : 20874 : vec_num = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length ();
6087 : : ncopies = 1;
6088 : : }
6089 : : else
6090 : : {
6091 : 0 : vec_num = 1;
6092 : 0 : ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
6093 : : }
6094 : :
6095 : : /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
6096 : : which is updated with the current index of the loop for every match of
6097 : : the original loop's cond_expr (VEC_STMT). This results in a vector
6098 : : containing the last time the condition passed for that vector lane.
6099 : : The first match will be a 1 to allow 0 to be used for non-matching
6100 : : indexes. If there are no matches at all then the vector will be all
6101 : : zeroes.
6102 : :
6103 : : PR92772: This algorithm is broken for architectures that support
6104 : : masked vectors, but do not provide fold_extract_last. */
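  /* Editorial illustration (not part of the original source): with a 4-lane
     vector and a condition that matches only at scalar iterations 2 and 6,
     the induction index evolves as

       indexes for iterations 0-3: {1, 2, 3, 4}  -> phi becomes {0, 0, 3, 0}
       indexes for iterations 4-7: {5, 6, 7, 8}  -> phi becomes {0, 0, 7, 0}

     so IFN_REDUC_MAX over the final vector yields 7, the 1-based index of
     the last match, while an all-zero vector means there was no match.  */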
6105 : 20874 : if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
6106 : : {
6107 : 73 : auto_vec<std::pair<tree, bool>, 2> ccompares;
6108 : 73 : if (slp_node)
6109 : : {
6110 : 73 : slp_tree cond_node = slp_node_instance->root;
6111 : 167 : while (cond_node != slp_node_instance->reduc_phis)
6112 : : {
6113 : 94 : stmt_vec_info cond_info = SLP_TREE_REPRESENTATIVE (cond_node);
6114 : 94 : int slp_reduc_idx;
6115 : 94 : if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
6116 : : {
6117 : 82 : gimple *vec_stmt
6118 : 82 : = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (cond_node)[0]);
6119 : 82 : gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
6120 : 82 : ccompares.safe_push
6121 : 82 : (std::make_pair (gimple_assign_rhs1 (vec_stmt),
6122 : 82 : STMT_VINFO_REDUC_IDX (cond_info) == 2));
6123 : : /* ??? We probably want to have REDUC_IDX on the SLP node?
6124 : : COND_EXPR nodes have either three or four children,
6125 : : depending on whether the comparison is still embedded
6126 : : as GENERIC. So work backwards. */
6127 : 82 : slp_reduc_idx = (SLP_TREE_CHILDREN (cond_node).length () - 3
6128 : 82 : + STMT_VINFO_REDUC_IDX (cond_info));
6129 : : }
6130 : : else
6131 : 12 : slp_reduc_idx = STMT_VINFO_REDUC_IDX (cond_info);
6132 : 94 : cond_node = SLP_TREE_CHILDREN (cond_node)[slp_reduc_idx];
6133 : : }
6134 : : }
6135 : : else
6136 : : {
6137 : 0 : stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
6138 : 0 : cond_info = vect_stmt_to_vectorize (cond_info);
6139 : 0 : while (cond_info != reduc_info)
6140 : : {
6141 : 0 : if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
6142 : : {
6143 : 0 : gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
6144 : 0 : gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
6145 : 0 : ccompares.safe_push
6146 : 0 : (std::make_pair (gimple_assign_rhs1 (vec_stmt),
6147 : 0 : STMT_VINFO_REDUC_IDX (cond_info) == 2));
6148 : : }
6149 : 0 : cond_info
6150 : 0 : = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
6151 : 0 : 1 + STMT_VINFO_REDUC_IDX
6152 : : (cond_info)));
6153 : 0 : cond_info = vect_stmt_to_vectorize (cond_info);
6154 : : }
6155 : : }
6156 : 73 : gcc_assert (ccompares.length () != 0);
6157 : :
6158 : 73 : tree indx_before_incr, indx_after_incr;
6159 : 73 : poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
6160 : 73 : int scalar_precision
6161 : 73 : = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
6162 : 73 : tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
6163 : 73 : tree cr_index_vector_type = get_related_vectype_for_scalar_type
6164 : 73 : (TYPE_MODE (vectype), cr_index_scalar_type,
6165 : : TYPE_VECTOR_SUBPARTS (vectype));
6166 : :
6167 : : /* First we create a simple vector induction variable which starts
6168 : : with the values {1,2,3,...} (SERIES_VECT) and increments by the
6169 : : vector size (STEP). */
6170 : :
6171 : : /* Create a {1,2,3,...} vector. */
6172 : 73 : tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
6173 : :
6174 : : /* Create a vector of the step value. */
6175 : 73 : tree step = build_int_cst (cr_index_scalar_type, nunits_out);
6176 : 73 : tree vec_step = build_vector_from_val (cr_index_vector_type, step);
6177 : :
6178 : : /* Create an induction variable. */
6179 : 73 : gimple_stmt_iterator incr_gsi;
6180 : 73 : bool insert_after;
6181 : 73 : vect_iv_increment_position (loop_exit, &incr_gsi, &insert_after);
6182 : 73 : create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
6183 : : insert_after, &indx_before_incr, &indx_after_incr);
6184 : :
6185 : : /* Next create a new phi node vector (NEW_PHI_TREE) which starts
6186 : : filled with zeros (VEC_ZERO). */
6187 : :
6188 : : /* Create a vector of 0s. */
6189 : 73 : tree zero = build_zero_cst (cr_index_scalar_type);
6190 : 73 : tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
6191 : :
6192 : : /* Create a vector phi node. */
6193 : 73 : tree new_phi_tree = make_ssa_name (cr_index_vector_type);
6194 : 73 : new_phi = create_phi_node (new_phi_tree, loop->header);
6195 : 73 : add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
6196 : : loop_preheader_edge (loop), UNKNOWN_LOCATION);
6197 : :
6198 : : /* Now take the condition from the loop's original cond_exprs
6199 : : and produce a new cond_exprs (INDEX_COND_EXPR) which for
6200 : : every match uses values from the induction variable
6201 : : (INDEX_BEFORE_INCR) otherwise uses values from the phi node
6202 : : (NEW_PHI_TREE).
6203 : : Finally, we update the phi (NEW_PHI_TREE) to take the value of
6204 : : the new cond_expr (INDEX_COND_EXPR). */
6205 : 73 : gimple_seq stmts = NULL;
6206 : 228 : for (int i = ccompares.length () - 1; i != -1; --i)
6207 : : {
6208 : 82 : tree ccompare = ccompares[i].first;
6209 : 82 : if (ccompares[i].second)
6210 : 69 : new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6211 : : cr_index_vector_type,
6212 : : ccompare,
6213 : : indx_before_incr, new_phi_tree);
6214 : : else
6215 : 13 : new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6216 : : cr_index_vector_type,
6217 : : ccompare,
6218 : : new_phi_tree, indx_before_incr);
6219 : : }
6220 : 73 : gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
6221 : :
6222 : : /* Update the phi with the vec cond. */
6223 : 73 : induction_index = new_phi_tree;
6224 : 73 : add_phi_arg (as_a <gphi *> (new_phi), induction_index,
6225 : : loop_latch_edge (loop), UNKNOWN_LOCATION);
6226 : 73 : }
6227 : :
6228 : : /* 2. Create epilog code.
6229 : : The reduction epilog code operates across the elements of the vector
6230 : : of partial results computed by the vectorized loop.
6231 : : The reduction epilog code consists of:
6232 : :
6233 : : step 1: compute the scalar result in a vector (v_out2)
6234 : : step 2: extract the scalar result (s_out3) from the vector (v_out2)
6235 : : step 3: adjust the scalar result (s_out3) if needed.
6236 : :
6237 : : Step 1 can be accomplished using one of the following three schemes:
6238 : : (scheme 1) using reduc_fn, if available.
6239 : : (scheme 2) using whole-vector shifts, if available.
6240 : : (scheme 3) using a scalar loop. In this case steps 1+2 above are
6241 : : combined.
6242 : :
6243 : : The overall epilog code looks like this:
6244 : :
6245 : : s_out0 = phi <s_loop> # original EXIT_PHI
6246 : : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6247 : : v_out2 = reduce <v_out1> # step 1
6248 : : s_out3 = extract_field <v_out2, 0> # step 2
6249 : : s_out4 = adjust_result <s_out3> # step 3
6250 : :
6251 : : (step 3 is optional, and steps 1 and 2 may be combined).
6252 : : Lastly, the uses of s_out0 are replaced by s_out4. */
6253 : :
6254 : :
6255 : : /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
6256 : : v_out1 = phi <VECT_DEF>
6257 : : Store them in NEW_PHIS. */
6258 : 20874 : if (double_reduc)
6259 : 66 : loop = outer_loop;
6260 : : /* We need to reduce values in all exits. */
6261 : 20874 : exit_bb = loop_exit->dest;
6262 : 20874 : exit_gsi = gsi_after_labels (exit_bb);
6263 : 20874 : reduc_inputs.create (slp_node ? vec_num : ncopies);
6264 : 42743 : for (unsigned i = 0; i < vec_num; i++)
6265 : : {
6266 : 21869 : gimple_seq stmts = NULL;
6267 : 21869 : if (slp_node)
6268 : 21869 : def = vect_get_slp_vect_def (slp_node, i);
6269 : : else
6270 : 0 : def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
6271 : 43738 : for (j = 0; j < ncopies; j++)
6272 : : {
6273 : 21869 : tree new_def = copy_ssa_name (def);
6274 : 21869 : phi = create_phi_node (new_def, exit_bb);
6275 : 21869 : if (j)
6276 : 0 : def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
6277 : 21869 : if (LOOP_VINFO_IV_EXIT (loop_vinfo) == loop_exit)
6278 : 21844 : SET_PHI_ARG_DEF (phi, loop_exit->dest_idx, def);
6279 : : else
6280 : : {
6281 : 53 : for (unsigned k = 0; k < gimple_phi_num_args (phi); k++)
6282 : 28 : SET_PHI_ARG_DEF (phi, k, def);
6283 : : }
6284 : 21869 : new_def = gimple_convert (&stmts, vectype, new_def);
6285 : 21869 : reduc_inputs.quick_push (new_def);
6286 : : }
6287 : 21869 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6288 : : }
6289 : :
6290 : : /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
6291 : : (i.e. when reduc_fn is not available) and in the final adjustment
6292 : : code (if needed). Also get the original scalar reduction variable as
6293 : : defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
6294 : : represents a reduction pattern), the tree-code and scalar-def are
6295 : : taken from the original stmt that the pattern-stmt (STMT) replaces.
6296 : : Otherwise (it is a regular reduction) - the tree-code and scalar-def
6297 : : are taken from STMT. */
6298 : :
6299 : 20874 : stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6300 : 20874 : if (orig_stmt_info != stmt_info)
6301 : : {
6302 : : /* Reduction pattern */
6303 : 585 : gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6304 : 585 : gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
6305 : : }
6306 : :
6307 : 20874 : scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
6308 : 20874 : scalar_type = TREE_TYPE (scalar_dest);
6309 : 20874 : scalar_results.truncate (0);
6310 : 20874 : scalar_results.reserve_exact (group_size);
6311 : 20874 : new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
6312 : 20874 : bitsize = TYPE_SIZE (scalar_type);
6313 : :
6314 : : /* True if we should implement SLP_REDUC using native reduction operations
6315 : : instead of scalar operations. */
6316 : 41748 : direct_slp_reduc = (reduc_fn != IFN_LAST
6317 : 20874 : && slp_reduc
6318 : 20874 : && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
6319 : :
6320 : : /* In case of reduction chain, e.g.,
6321 : : # a1 = phi <a3, a0>
6322 : : a2 = operation (a1)
6323 : : a3 = operation (a2),
6324 : :
6325 : : we may end up with more than one vector result. Here we reduce them
6326 : : to one vector.
6327 : :
6328 : : The same is true for a SLP reduction, e.g.,
6329 : : # a1 = phi <a2, a0>
6330 : : # b1 = phi <b2, b0>
6331 : : a2 = operation (a1)
6332 : : b2 = operation (b1),
6333 : :
6334 : : where we can end up with more than one vector as well. We can
6335 : : easily accumulate vectors when the number of vector elements is
6336 : : a multiple of the SLP group size.
6337 : :
6338 : : The same is true if we couldn't use a single defuse cycle. */
6339 : 20874 : if (REDUC_GROUP_FIRST_ELEMENT (STMT_VINFO_REDUC_DEF (reduc_info))
6340 : : || direct_slp_reduc
6341 : 20707 : || (slp_reduc
6342 : 20732 : && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size))
6343 : 20899 : || ncopies > 1)
6344 : : {
6345 : 20849 : gimple_seq stmts = NULL;
6346 : 20849 : tree single_input = reduc_inputs[0];
6347 : 21801 : for (k = 1; k < reduc_inputs.length (); k++)
6348 : 1904 : single_input = gimple_build (&stmts, code, vectype,
6349 : 952 : single_input, reduc_inputs[k]);
6350 : 20849 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6351 : :
6352 : 20849 : reduc_inputs.truncate (0);
6353 : 20849 : reduc_inputs.safe_push (single_input);
6354 : : }
6355 : :
6356 : 20874 : tree orig_reduc_input = reduc_inputs[0];
6357 : :
6358 : : /* If this loop is an epilogue loop that can be skipped after the
6359 : : main loop, we can only share a reduction operation between the
6360 : : main loop and the epilogue if we put it at the target of the
6361 : : skip edge.
6362 : :
6363 : : We can still reuse accumulators if this check fails. Doing so has
6364 : : the minor(?) benefit of making the epilogue loop's scalar result
6365 : : independent of the main loop's scalar result. */
6366 : 20874 : bool unify_with_main_loop_p = false;
6367 : 20874 : if (reduc_info->reused_accumulator
6368 : 3969 : && loop_vinfo->skip_this_loop_edge
6369 : 3783 : && single_succ_p (exit_bb)
6370 : 20883 : && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
6371 : : {
6372 : 9 : unify_with_main_loop_p = true;
6373 : :
6374 : 9 : basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
6375 : 9 : reduc_inputs[0] = make_ssa_name (vectype);
6376 : 9 : gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
6377 : 9 : add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
6378 : : UNKNOWN_LOCATION);
6379 : 9 : add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
6380 : : loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
6381 : 9 : exit_gsi = gsi_after_labels (reduc_block);
6382 : : }
6383 : :
6384 : : /* Shouldn't be used beyond this point. */
6385 : 20874 : exit_bb = nullptr;
6386 : :
6387 : 20874 : if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6388 : 73 : && reduc_fn != IFN_LAST)
6389 : : {
6390 : : /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
6391 : : various data values where the condition matched and another vector
6392 : : (INDUCTION_INDEX) containing all the indexes of those matches. We
6393 : : need to extract the last matching index (which will be the index with
6394 : : highest value) and use this to index into the data vector.
6395 : : For the case where there were no matches, the data vector will contain
6396 : : all default values and the index vector will be all zeros. */
6397 : :
6398 : : /* Get various versions of the type of the vector of indexes. */
6399 : 4 : tree index_vec_type = TREE_TYPE (induction_index);
6400 : 4 : gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
6401 : 4 : tree index_scalar_type = TREE_TYPE (index_vec_type);
6402 : 4 : tree index_vec_cmp_type = truth_type_for (index_vec_type);
6403 : :
6404 : : /* Get an unsigned integer version of the type of the data vector. */
6405 : 4 : int scalar_precision
6406 : 4 : = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6407 : 4 : tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
6408 : 4 : tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
6409 : : vectype);
6410 : :
6411 : : /* First we need to create a vector (ZERO_VEC) of zeros and another
6412 : : vector (MAX_INDEX_VEC) filled with the last matching index, which we
6413 : : can create using a MAX reduction and then expanding.
6414 : : In the case where the loop never made any matches, the max index will
6415 : : be zero. */
6416 : :
6417 : : /* Vector of {0, 0, 0,...}. */
6418 : 4 : tree zero_vec = build_zero_cst (vectype);
6419 : :
6420 : : /* Find maximum value from the vector of found indexes. */
6421 : 4 : tree max_index = make_ssa_name (index_scalar_type);
6422 : 4 : gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6423 : : 1, induction_index);
6424 : 4 : gimple_call_set_lhs (max_index_stmt, max_index);
6425 : 4 : gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
6426 : :
6427 : : /* Vector of {max_index, max_index, max_index,...}. */
6428 : 4 : tree max_index_vec = make_ssa_name (index_vec_type);
6429 : 4 : tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
6430 : : max_index);
6431 : 4 : gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
6432 : : max_index_vec_rhs);
6433 : 4 : gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
6434 : :
6435 : : /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
6436 : : with the vector (INDUCTION_INDEX) of found indexes, choosing values
6437 : : from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
6438 : : otherwise. Only one value should match, resulting in a vector
6439 : : (VEC_COND) with one data value and the rest zeros.
6440 : : In the case where the loop never made any matches, every index will
6441 : : match, resulting in a vector with all data values (which will all be
6442 : : the default value). */
6443 : :
6444 : : /* Compare the max index vector to the vector of found indexes to find
6445 : : the position of the max value. */
6446 : 4 : tree vec_compare = make_ssa_name (index_vec_cmp_type);
6447 : 4 : gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
6448 : : induction_index,
6449 : : max_index_vec);
6450 : 4 : gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
6451 : :
6452 : : /* Use the compare to choose either values from the data vector or
6453 : : zero. */
6454 : 4 : tree vec_cond = make_ssa_name (vectype);
6455 : 4 : gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
6456 : : vec_compare,
6457 : 4 : reduc_inputs[0],
6458 : : zero_vec);
6459 : 4 : gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
6460 : :
6461 : : /* Finally we need to extract the data value from the vector (VEC_COND)
6462 : : into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
6463 : : reduction, but because this doesn't exist, we can use a MAX reduction
6464 : : instead. The data value might be signed or a float so we need to cast
6465 : : it first.
6466 : : In the case where the loop never made any matches, the data values are
6467 : : all identical, and so will reduce down correctly. */
6468 : :
6469 : : /* Make the matched data values unsigned. */
6470 : 4 : tree vec_cond_cast = make_ssa_name (vectype_unsigned);
6471 : 4 : tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
6472 : : vec_cond);
6473 : 4 : gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
6474 : : VIEW_CONVERT_EXPR,
6475 : : vec_cond_cast_rhs);
6476 : 4 : gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
6477 : :
6478 : : /* Reduce down to a scalar value. */
6479 : 4 : tree data_reduc = make_ssa_name (scalar_type_unsigned);
6480 : 4 : gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6481 : : 1, vec_cond_cast);
6482 : 4 : gimple_call_set_lhs (data_reduc_stmt, data_reduc);
6483 : 4 : gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
6484 : :
6485 : : /* Convert the reduced value back to the result type and set as the
6486 : : result. */
6487 : 4 : gimple_seq stmts = NULL;
6488 : 4 : new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
6489 : : data_reduc);
6490 : 4 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6491 : 4 : scalar_results.safe_push (new_temp);
6492 : 4 : }
6493 : 20870 : else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6494 : 69 : && reduc_fn == IFN_LAST)
6495 : : {
6496 : : /* Condition reduction without supported IFN_REDUC_MAX. Generate
6497 : : idx = 0;
6498 : : idx_val = induction_index[0];
6499 : : val = data_reduc[0];
6500 : : for (idx = 0, val = init, i = 0; i < nelts; ++i)
6501 : : if (induction_index[i] > idx_val)
6502 : : val = data_reduc[i], idx_val = induction_index[i];
6503 : : return val; */
6504 : :
6505 : 69 : tree data_eltype = TREE_TYPE (vectype);
6506 : 69 : tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
6507 : 69 : unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
6508 : 69 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
6509 : : /* Enforced by vectorizable_reduction, which ensures we have target
6510 : : support before allowing a conditional reduction on variable-length
6511 : : vectors. */
6512 : 69 : unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
6513 : 69 : tree idx_val = NULL_TREE, val = NULL_TREE;
6514 : 461 : for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
6515 : : {
6516 : 392 : tree old_idx_val = idx_val;
6517 : 392 : tree old_val = val;
6518 : 392 : idx_val = make_ssa_name (idx_eltype);
6519 : 392 : epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
6520 : : build3 (BIT_FIELD_REF, idx_eltype,
6521 : : induction_index,
6522 : 392 : bitsize_int (el_size),
6523 : 392 : bitsize_int (off)));
6524 : 392 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6525 : 392 : val = make_ssa_name (data_eltype);
6526 : 784 : epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
6527 : : build3 (BIT_FIELD_REF,
6528 : : data_eltype,
6529 : 392 : reduc_inputs[0],
6530 : 392 : bitsize_int (el_size),
6531 : 392 : bitsize_int (off)));
6532 : 392 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6533 : 392 : if (off != 0)
6534 : : {
6535 : 323 : tree new_idx_val = idx_val;
6536 : 323 : if (off != v_size - el_size)
6537 : : {
6538 : 254 : new_idx_val = make_ssa_name (idx_eltype);
6539 : 254 : epilog_stmt = gimple_build_assign (new_idx_val,
6540 : : MAX_EXPR, idx_val,
6541 : : old_idx_val);
6542 : 254 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6543 : : }
6544 : 323 : tree cond = make_ssa_name (boolean_type_node);
6545 : 323 : epilog_stmt = gimple_build_assign (cond, GT_EXPR,
6546 : : idx_val, old_idx_val);
6547 : 323 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6548 : 323 : tree new_val = make_ssa_name (data_eltype);
6549 : 323 : epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
6550 : : cond, val, old_val);
6551 : 323 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6552 : 323 : idx_val = new_idx_val;
6553 : 323 : val = new_val;
6554 : : }
6555 : : }
6556 : : /* Convert the reduced value back to the result type and set as the
6557 : : result. */
6558 : 69 : gimple_seq stmts = NULL;
6559 : 69 : val = gimple_convert (&stmts, scalar_type, val);
6560 : 69 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6561 : 69 : scalar_results.safe_push (val);
6562 : 69 : }
6563 : :
6564 : : /* 2.3 Create the reduction code, using one of the three schemes described
6565 : : above. In SLP we simply need to extract all the elements from the
6566 : : vector (without reducing them), so we use scalar shifts. */
6567 : 20801 : else if (reduc_fn != IFN_LAST && (!slp_reduc || group_size == 1))
6568 : : {
6569 : 19126 : tree tmp;
6570 : 19126 : tree vec_elem_type;
6571 : :
6572 : : /* Case 1: Create:
6573 : : v_out2 = reduc_expr <v_out1> */
6574 : :
6575 : 19126 : if (dump_enabled_p ())
6576 : 1210 : dump_printf_loc (MSG_NOTE, vect_location,
6577 : : "Reduce using direct vector reduction.\n");
6578 : :
6579 : 19126 : gimple_seq stmts = NULL;
6580 : 19126 : vec_elem_type = TREE_TYPE (vectype);
6581 : 19126 : new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
6582 : 19126 : vec_elem_type, reduc_inputs[0]);
6583 : 19126 : new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6584 : 19126 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6585 : :
6586 : 19126 : if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6587 : 66 : && induc_val)
6588 : : {
6589 : : /* Earlier we set the initial value to be a vector of induc_val
6590 : : values. Check the result and if it is induc_val then replace
6591 : : with the original initial value, unless induc_val is
6592 : : the same as initial_def already. */
6593 : 63 : tree zcompare = make_ssa_name (boolean_type_node);
6594 : 63 : epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6595 : : new_temp, induc_val);
6596 : 63 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6597 : 63 : tree initial_def = reduc_info->reduc_initial_values[0];
6598 : 63 : tmp = make_ssa_name (new_scalar_dest);
6599 : 63 : epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6600 : : initial_def, new_temp);
6601 : 63 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6602 : 63 : new_temp = tmp;
6603 : : }
6604 : :
6605 : 19126 : scalar_results.safe_push (new_temp);
6606 : 19126 : }
6607 : 1547 : else if (direct_slp_reduc)
6608 : : {
6609 : : /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
6610 : : with the elements for other SLP statements replaced with the
6611 : : neutral value. We can then do a normal reduction on each vector. */
6612 : :
6613 : : /* Enforced by vectorizable_reduction. */
6614 : : gcc_assert (reduc_inputs.length () == 1);
6615 : : gcc_assert (pow2p_hwi (group_size));
6616 : :
6617 : : gimple_seq seq = NULL;
6618 : :
6619 : : /* Build a vector {0, 1, 2, ...}, with the same number of elements
6620 : : and the same element size as VECTYPE. */
6621 : : tree index = build_index_vector (vectype, 0, 1);
6622 : : tree index_type = TREE_TYPE (index);
6623 : : tree index_elt_type = TREE_TYPE (index_type);
6624 : : tree mask_type = truth_type_for (index_type);
6625 : :
6626 : : /* Create a vector that, for each element, identifies which of
6627 : : the REDUC_GROUP_SIZE results should use it. */
6628 : : tree index_mask = build_int_cst (index_elt_type, group_size - 1);
6629 : : index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
6630 : : build_vector_from_val (index_type, index_mask));
6631 : :
6632 : : /* Get a neutral vector value. This is simply a splat of the neutral
6633 : : scalar value if we have one, otherwise the initial scalar value
6634 : : is itself a neutral value. */
6635 : : tree vector_identity = NULL_TREE;
6636 : : tree neutral_op = NULL_TREE;
6637 : : if (slp_node)
6638 : : {
6639 : : tree initial_value = NULL_TREE;
6640 : : if (REDUC_GROUP_FIRST_ELEMENT (STMT_VINFO_REDUC_DEF (reduc_info)))
6641 : : initial_value = reduc_info->reduc_initial_values[0];
6642 : : neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
6643 : : initial_value, false);
6644 : : }
6645 : : if (neutral_op)
6646 : : vector_identity = gimple_build_vector_from_val (&seq, vectype,
6647 : : neutral_op);
6648 : : for (unsigned int i = 0; i < group_size; ++i)
6649 : : {
6650 : : /* If there's no universal neutral value, we can use the
6651 : : initial scalar value from the original PHI. This is used
6652 : : for MIN and MAX reduction, for example. */
6653 : : if (!neutral_op)
6654 : : {
6655 : : tree scalar_value = reduc_info->reduc_initial_values[i];
6656 : : scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
6657 : : scalar_value);
6658 : : vector_identity = gimple_build_vector_from_val (&seq, vectype,
6659 : : scalar_value);
6660 : : }
6661 : :
6662 : : /* Calculate the equivalent of:
6663 : :
6664 : : sel[j] = (index[j] == i);
6665 : :
6666 : : which selects the elements of REDUC_INPUTS[0] that should
6667 : : be included in the result. */
6668 : : tree compare_val = build_int_cst (index_elt_type, i);
6669 : : compare_val = build_vector_from_val (index_type, compare_val);
6670 : : tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
6671 : : index, compare_val);
6672 : :
6673 : : /* Calculate the equivalent of:
6674 : :
6675 : : vec = sel ? reduc_inputs[0] : vector_identity;
6676 : :
6677 : : VEC is now suitable for a full vector reduction. */
6678 : : tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
6679 : : sel, reduc_inputs[0], vector_identity);
6680 : :
6681 : : /* Do the reduction and convert it to the appropriate type. */
6682 : : tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
6683 : : TREE_TYPE (vectype), vec);
6684 : : scalar = gimple_convert (&seq, scalar_type, scalar);
6685 : : scalar_results.safe_push (scalar);
6686 : : }
6687 : : gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
6688 : : }
6689 : : else
6690 : : {
6691 : 1547 : bool reduce_with_shift;
6692 : 1547 : tree vec_temp;
6693 : :
6694 : 1547 : gcc_assert (slp_reduc || reduc_inputs.length () == 1);
6695 : :
6696 : : /* See if the target wants to do the final (shift) reduction
6697 : : in a vector mode of smaller size and first reduce upper/lower
6698 : : halves against each other. */
6699 : 1675 : enum machine_mode mode1 = mode;
6700 : 1675 : tree stype = TREE_TYPE (vectype);
6701 : 1675 : unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6702 : 1675 : unsigned nunits1 = nunits;
6703 : 1675 : if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
6704 : 1675 : && reduc_inputs.length () == 1)
6705 : : {
6706 : 37 : nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6707 : : /* For SLP reductions we have to make sure lanes match up, but
6708 : : since we're doing an individual-element final reduction, reducing
6709 : : the vector width here is even more important.
6710 : : ??? We could also separate lanes with permutes; for the common
6711 : : case of a power-of-two group size, odd/even extracts would work. */
6712 : 37 : if (slp_reduc && nunits != nunits1)
6713 : : {
6714 : 37 : nunits1 = least_common_multiple (nunits1, group_size);
6715 : 74 : gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6716 : : }
6717 : : }
6718 : 1675 : if (!slp_reduc
6719 : 1675 : && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6720 : 0 : nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6721 : :
6722 : 1675 : tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6723 : : stype, nunits1);
6724 : 1675 : reduce_with_shift = have_whole_vector_shift (mode1);
6725 : 661 : if (!VECTOR_MODE_P (mode1)
6726 : 2336 : || !directly_supported_p (code, vectype1))
6727 : : reduce_with_shift = false;
6728 : :
6729 : : /* First reduce the vector to the vector size we want to do the
6730 : : shift reduction on, by combining upper and lower halves. */
6731 : 1675 : gimple_seq stmts = NULL;
6732 : 1675 : new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6733 : : code, &stmts);
6734 : 1675 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6735 : 1675 : reduc_inputs[0] = new_temp;
6736 : :
6737 : 1675 : if (reduce_with_shift && (!slp_reduc || group_size == 1))
6738 : : {
6739 : 1504 : int element_bitsize = tree_to_uhwi (bitsize);
6740 : : /* Enforced by vectorizable_reduction, which disallows SLP reductions
6741 : : for variable-length vectors and also requires direct target support
6742 : : for loop reductions. */
6743 : 1504 : int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6744 : 1504 : int nelements = vec_size_in_bits / element_bitsize;
6745 : 1504 : vec_perm_builder sel;
6746 : 1504 : vec_perm_indices indices;
6747 : :
6748 : 1504 : int elt_offset;
6749 : :
6750 : 1504 : tree zero_vec = build_zero_cst (vectype1);
6751 : : /* Case 2: Create:
6752 : : for (offset = nelements/2; offset >= 1; offset/=2)
6753 : : {
6754 : : Create: va' = vec_shift <va, offset>
6755 : : Create: va = vop <va, va'>
6756 : : } */
6757 : :
6758 : 1504 : tree rhs;
6759 : :
6760 : 1504 : if (dump_enabled_p ())
6761 : 303 : dump_printf_loc (MSG_NOTE, vect_location,
6762 : : "Reduce using vector shifts\n");
6763 : :
6764 : 1504 : gimple_seq stmts = NULL;
6765 : 1504 : new_temp = gimple_convert (&stmts, vectype1, new_temp);
6766 : 1504 : for (elt_offset = nelements / 2;
6767 : 3274 : elt_offset >= 1;
6768 : 1770 : elt_offset /= 2)
6769 : : {
6770 : 1770 : calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6771 : 1770 : indices.new_vector (sel, 2, nelements);
6772 : 1770 : tree mask = vect_gen_perm_mask_any (vectype1, indices);
6773 : 1770 : new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6774 : : new_temp, zero_vec, mask);
6775 : 1770 : new_temp = gimple_build (&stmts, code,
6776 : : vectype1, new_name, new_temp);
6777 : : }
6778 : 1504 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6779 : :
6780 : : /* 2.4 Extract the final scalar result. Create:
6781 : : s_out3 = extract_field <v_out2, bitpos> */
6782 : :
6783 : 1504 : if (dump_enabled_p ())
6784 : 303 : dump_printf_loc (MSG_NOTE, vect_location,
6785 : : "extract scalar result\n");
6786 : :
6787 : 1504 : rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6788 : : bitsize, bitsize_zero_node);
6789 : 1504 : epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
6790 : 1504 : new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
6791 : 1504 : gimple_assign_set_lhs (epilog_stmt, new_temp);
6792 : 1504 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6793 : 1504 : scalar_results.safe_push (new_temp);
6794 : 1504 : }
6795 : : else
6796 : : {
6797 : : /* Case 3: Create:
6798 : : s = extract_field <v_out2, 0>
6799 : : for (offset = element_size;
6800 : : offset < vector_size;
6801 : : offset += element_size)
6802 : : {
6803 : : Create: s' = extract_field <v_out2, offset>
6804 : : Create: s = op <s, s'> // For non-SLP cases
6805 : : } */
6806 : :
6807 : 171 : if (dump_enabled_p ())
6808 : 106 : dump_printf_loc (MSG_NOTE, vect_location,
6809 : : "Reduce using scalar code.\n");
6810 : :
6811 : 171 : int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6812 : 171 : int element_bitsize = tree_to_uhwi (bitsize);
6813 : 171 : tree compute_type = TREE_TYPE (vectype);
6814 : 171 : gimple_seq stmts = NULL;
6815 : 385 : FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6816 : : {
6817 : 214 : int bit_offset;
6818 : 214 : new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6819 : : vec_temp, bitsize, bitsize_zero_node);
6820 : :
6821 : : /* In SLP we don't need to apply the reduction operation, so we
6822 : : just collect the s' values in SCALAR_RESULTS. */
6823 : 214 : if (slp_reduc)
6824 : 204 : scalar_results.safe_push (new_temp);
6825 : :
6826 : 487 : for (bit_offset = element_bitsize;
6827 : 701 : bit_offset < vec_size_in_bits;
6828 : 487 : bit_offset += element_bitsize)
6829 : : {
6830 : 487 : tree bitpos = bitsize_int (bit_offset);
6831 : 487 : new_name = gimple_build (&stmts, BIT_FIELD_REF,
6832 : : compute_type, vec_temp,
6833 : : bitsize, bitpos);
6834 : 487 : if (slp_reduc)
6835 : : {
6836 : : /* In SLP we don't need to apply the reduction operation,
6837 : : so we just collect the s' values in SCALAR_RESULTS. */
6838 : 477 : new_temp = new_name;
6839 : 477 : scalar_results.safe_push (new_name);
6840 : : }
6841 : : else
6842 : 10 : new_temp = gimple_build (&stmts, code, compute_type,
6843 : : new_name, new_temp);
6844 : : }
6845 : : }
6846 : :
6847 : : /* The only case where we need to reduce scalar results in SLP is
6848 : : unrolling. If the size of SCALAR_RESULTS is greater than
6849 : : REDUC_GROUP_SIZE, we reduce them by combining elements modulo
6850 : : REDUC_GROUP_SIZE. */
6851 : 171 : if (slp_reduc)
6852 : : {
6853 : 161 : tree res, first_res, new_res;
6854 : :
6855 : : /* Reduce multiple scalar results in case of SLP unrolling. */
6856 : 410 : for (j = group_size; scalar_results.iterate (j, &res);
6857 : : j++)
6858 : : {
6859 : 249 : first_res = scalar_results[j % group_size];
6860 : 249 : new_res = gimple_build (&stmts, code, compute_type,
6861 : : first_res, res);
6862 : 249 : scalar_results[j % group_size] = new_res;
6863 : : }
6864 : 161 : scalar_results.truncate (group_size);
6865 : 754 : for (k = 0; k < group_size; k++)
6866 : 864 : scalar_results[k] = gimple_convert (&stmts, scalar_type,
6867 : 432 : scalar_results[k]);
6868 : : }
6869 : : else
6870 : : {
6871 : : /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
6872 : 10 : new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6873 : 10 : scalar_results.safe_push (new_temp);
6874 : : }
6875 : :
6876 : 171 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6877 : : }
6878 : :
6879 : 1675 : if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6880 : 0 : && induc_val)
6881 : : {
6882 : : /* Earlier we set the initial value to be a vector of induc_val
6883 : : values. Check the result, and if it is induc_val then replace it
6884 : : with the original initial value, unless induc_val is already
6885 : : the same as initial_def. */
6886 : 0 : tree zcompare = make_ssa_name (boolean_type_node);
6887 : 0 : epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6888 : 0 : scalar_results[0], induc_val);
6889 : 0 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6890 : 0 : tree initial_def = reduc_info->reduc_initial_values[0];
6891 : 0 : tree tmp = make_ssa_name (new_scalar_dest);
6892 : 0 : epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6893 : 0 : initial_def, scalar_results[0]);
6894 : 0 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6895 : 0 : scalar_results[0] = tmp;
6896 : : }
6897 : : }
6898 : :
6899 : : /* 2.5 Adjust the final result by the initial value of the reduction
6900 : : variable. (When no such adjustment is needed, 'adjustment_def'
6901 : : is zero.) For example, if code is PLUS, we create:
6902 : : new_temp = loop_exit_def + adjustment_def */
6903 : :
6904 : 20874 : if (adjustment_def)
6905 : : {
6906 : 15685 : gcc_assert (!slp_reduc || group_size == 1);
6907 : 15685 : gimple_seq stmts = NULL;
6908 : 15685 : if (double_reduc)
6909 : : {
6910 : 0 : gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6911 : 0 : adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6912 : 0 : new_temp = gimple_build (&stmts, code, vectype,
6913 : 0 : reduc_inputs[0], adjustment_def);
6914 : : }
6915 : : else
6916 : : {
6917 : 15685 : new_temp = scalar_results[0];
6918 : 15685 : gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6919 : 15685 : adjustment_def = gimple_convert (&stmts, TREE_TYPE (vectype),
6920 : : adjustment_def);
6921 : 15685 : new_temp = gimple_convert (&stmts, TREE_TYPE (vectype), new_temp);
6922 : 15685 : new_temp = gimple_build (&stmts, code, TREE_TYPE (vectype),
6923 : : new_temp, adjustment_def);
6924 : 15685 : new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6925 : : }
6926 : :
6927 : 15685 : epilog_stmt = gimple_seq_last_stmt (stmts);
6928 : 15685 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6929 : 15685 : scalar_results[0] = new_temp;
6930 : : }
6931 : :
6932 : : /* Record this operation if it could be reused by the epilogue loop. */
6933 : 20874 : if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
6934 : 20874 : && reduc_inputs.length () == 1)
6935 : 20701 : loop_vinfo->reusable_accumulators.put (scalar_results[0],
6936 : : { orig_reduc_input, reduc_info });
6937 : :
6938 : 20874 : if (double_reduc)
6939 : 66 : loop = outer_loop;
6940 : :
6941 : : /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6942 : : phis with new adjusted scalar results, i.e., replace use <s_out0>
6943 : : with use <s_out4>.
6944 : :
6945 : : Transform:
6946 : : loop_exit:
6947 : : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6948 : : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6949 : : v_out2 = reduce <v_out1>
6950 : : s_out3 = extract_field <v_out2, 0>
6951 : : s_out4 = adjust_result <s_out3>
6952 : : use <s_out0>
6953 : : use <s_out0>
6954 : :
6955 : : into:
6956 : :
6957 : : loop_exit:
6958 : : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6959 : : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6960 : : v_out2 = reduce <v_out1>
6961 : : s_out3 = extract_field <v_out2, 0>
6962 : : s_out4 = adjust_result <s_out3>
6963 : : use <s_out4>
6964 : : use <s_out4> */
6965 : :
6966 : 41748 : gcc_assert (live_out_stmts.size () == scalar_results.length ());
6967 : 20874 : auto_vec<gimple *> phis;
6968 : 42019 : for (k = 0; k < live_out_stmts.size (); k++)
6969 : : {
6970 : 21145 : stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6971 : 21145 : scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6972 : :
6973 : : /* Find the loop-closed-use at the loop exit of the original scalar
6974 : : result. (The reduction result is expected to have two immediate uses,
6975 : : one at the latch block, and one at the loop exit). For double
6976 : : reductions we are looking for exit phis of the outer loop. */
6977 : 87398 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6978 : : {
6979 : 66253 : if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6980 : : {
6981 : 21140 : if (!is_gimple_debug (USE_STMT (use_p))
6982 : 21140 : && gimple_bb (USE_STMT (use_p)) == loop_exit->dest)
6983 : 21132 : phis.safe_push (USE_STMT (use_p));
6984 : : }
6985 : : else
6986 : : {
6987 : 45113 : if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6988 : : {
6989 : 66 : tree phi_res = PHI_RESULT (USE_STMT (use_p));
6990 : :
6991 : 132 : FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6992 : : {
6993 : 66 : if (!flow_bb_inside_loop_p (loop,
6994 : 66 : gimple_bb (USE_STMT (phi_use_p)))
6995 : 66 : && !is_gimple_debug (USE_STMT (phi_use_p)))
6996 : 0 : phis.safe_push (USE_STMT (phi_use_p));
6997 : : }
6998 : : }
6999 : : }
7000 : : }
7001 : :
7002 : 42277 : FOR_EACH_VEC_ELT (phis, i, exit_phi)
7003 : : {
7004 : : /* Replace the uses: */
7005 : 21132 : orig_name = PHI_RESULT (exit_phi);
7006 : :
7007 : : /* Look for a single use at the target of the skip edge. */
7008 : 21132 : if (unify_with_main_loop_p)
7009 : : {
7010 : 21 : use_operand_p use_p;
7011 : 21 : gimple *user;
7012 : 21 : if (!single_imm_use (orig_name, &use_p, &user))
7013 : 0 : gcc_unreachable ();
7014 : 21 : orig_name = gimple_get_lhs (user);
7015 : : }
7016 : :
7017 : 21132 : scalar_result = scalar_results[k];
7018 : 57767 : FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
7019 : : {
7020 : 109917 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
7021 : 36641 : SET_USE (use_p, scalar_result);
7022 : 36635 : update_stmt (use_stmt);
7023 : 21132 : }
7024 : : }
7025 : :
7026 : 21145 : phis.truncate (0);
7027 : : }
7028 : 20874 : }
7029 : :
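/* Editorial sketch, not part of the listed source: a plain scalar model
   of the "reduce using vector shifts" epilogue built above.  Each step
   folds the upper half of the remaining live lanes into the lower half,
   so an N-lane reduction takes log2(N) combine steps, and only element 0
   of the final vector is meaningful (the real code extracts it with
   BIT_FIELD_REF).  The function name and the fixed lane count are
   illustrative assumptions.  */
static int
shift_reduce_sum_sketch (const int lanes[8])
{
  int v[8];
  __builtin_memcpy (v, lanes, sizeof v);
  /* for (offset = nelements/2; offset >= 1; offset /= 2)  */
  for (int offset = 8 / 2; offset >= 1; offset /= 2)
    for (int i = 0; i < offset; i++)
      v[i] = v[i] + v[i + offset];   /* va = vop <va, va'>  */
  return v[0];                       /* s = extract_field <va, 0>  */
}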
7030 : : /* Return a vector of type VECTYPE that is equal to the vector select
7031 : : operation "MASK ? VEC : IDENTITY". Insert the select statements
7032 : : before GSI. */
7033 : :
7034 : : static tree
7035 : 0 : merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
7036 : : tree vec, tree identity)
7037 : : {
7038 : 0 : tree cond = make_temp_ssa_name (vectype, NULL, "cond");
7039 : 0 : gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
7040 : : mask, vec, identity);
7041 : 0 : gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
7042 : 0 : return cond;
7043 : : }
7044 : :
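/* Editorial sketch, not part of the listed source: the effect of
   merging with the identity before a reduction.  Lanes switched off by
   the mask are replaced with the operation's neutral value (0 for PLUS,
   1 for MULT), so an unconditional whole-vector reduction afterwards
   still computes the result over the active lanes only.  The names and
   the fixed lane count below are illustrative assumptions.  */
static int
masked_plus_reduce_sketch (const int vec[4], const bool mask[4])
{
  int sum = 0;
  for (int i = 0; i < 4; i++)
    sum += mask[i] ? vec[i] : 0;   /* mask ? vec : identity, then reduce.  */
  return sum;
}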
7045 : : /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
7046 : : order, starting with LHS. Insert the extraction statements before GSI and
7047 : : associate the new scalar SSA names with variable SCALAR_DEST.
7048 : : If MASK is nonzero mask the input and then operate on it unconditionally.
7049 : : If MASK is nonzero, mask the input and then operate on it unconditionally.
7050 : :
7051 : : static tree
7052 : 1125 : vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
7053 : : tree_code code, tree lhs, tree vector_rhs,
7054 : : tree mask)
7055 : : {
7056 : 1125 : tree vectype = TREE_TYPE (vector_rhs);
7057 : 1125 : tree scalar_type = TREE_TYPE (vectype);
7058 : 1125 : tree bitsize = TYPE_SIZE (scalar_type);
7059 : 1125 : unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
7060 : 1125 : unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
7061 : :
7062 : : /* Re-create a VEC_COND_EXPR to mask the input here in order to be able
7063 : : to perform an unconditional element-wise reduction of it. */
7064 : 1125 : if (mask)
7065 : : {
7066 : 11 : tree masked_vector_rhs = make_temp_ssa_name (vectype, NULL,
7067 : : "masked_vector_rhs");
7068 : 11 : tree neutral_op = neutral_op_for_reduction (scalar_type, code, NULL_TREE,
7069 : : false);
7070 : 11 : tree vector_identity = build_vector_from_val (vectype, neutral_op);
7071 : 11 : gassign *select = gimple_build_assign (masked_vector_rhs, VEC_COND_EXPR,
7072 : : mask, vector_rhs, vector_identity);
7073 : 11 : gsi_insert_before (gsi, select, GSI_SAME_STMT);
7074 : 11 : vector_rhs = masked_vector_rhs;
7075 : : }
7076 : :
7077 : 1125 : for (unsigned HOST_WIDE_INT bit_offset = 0;
7078 : 4625 : bit_offset < vec_size_in_bits;
7079 : 3500 : bit_offset += element_bitsize)
7080 : : {
7081 : 3500 : tree bitpos = bitsize_int (bit_offset);
7082 : 3500 : tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
7083 : : bitsize, bitpos);
7084 : :
7085 : 3500 : gassign *stmt = gimple_build_assign (scalar_dest, rhs);
7086 : 3500 : rhs = make_ssa_name (scalar_dest, stmt);
7087 : 3500 : gimple_assign_set_lhs (stmt, rhs);
7088 : 3500 : gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
7089 : :
7090 : 3500 : stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
7091 : 3500 : tree new_name = make_ssa_name (scalar_dest, stmt);
7092 : 3500 : gimple_assign_set_lhs (stmt, new_name);
7093 : 3500 : gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
7094 : 3500 : lhs = new_name;
7095 : : }
7096 : 1125 : return lhs;
7097 : : }
7098 : :
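/* Editorial sketch, not part of the listed source: the strictly
   left-to-right evaluation order that vect_expand_fold_left open-codes.
   Every lane is folded into the accumulator one at a time, preserving
   the scalar evaluation order that in-order (FOLD_LEFT) reductions need
   when reassociation of the operation is not permitted.  The name and
   the fixed lane count are illustrative assumptions.  */
static double
fold_left_plus_sketch (double lhs, const double vec[4])
{
  for (int i = 0; i < 4; i++)
    lhs = lhs + vec[i];   /* s = op <s, s'>, one lane per step.  */
  return lhs;
}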
7099 : : /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
7100 : : type of the vector input. */
7101 : :
7102 : : static internal_fn
7103 : 892 : get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
7104 : : {
7105 : 892 : internal_fn mask_reduc_fn;
7106 : 892 : internal_fn mask_len_reduc_fn;
7107 : :
7108 : 892 : switch (reduc_fn)
7109 : : {
7110 : 0 : case IFN_FOLD_LEFT_PLUS:
7111 : 0 : mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
7112 : 0 : mask_len_reduc_fn = IFN_MASK_LEN_FOLD_LEFT_PLUS;
7113 : 0 : break;
7114 : :
7115 : : default:
7116 : : return IFN_LAST;
7117 : : }
7118 : :
7119 : 0 : if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
7120 : : OPTIMIZE_FOR_SPEED))
7121 : : return mask_reduc_fn;
7122 : 0 : if (direct_internal_fn_supported_p (mask_len_reduc_fn, vectype_in,
7123 : : OPTIMIZE_FOR_SPEED))
7124 : : return mask_len_reduc_fn;
7125 : : return IFN_LAST;
7126 : : }
7127 : :
7128 : : /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
7129 : : statement that sets the live-out value. REDUC_DEF_STMT is the phi
7130 : : statement. CODE is the operation performed by STMT_INFO and OPS are
7131 : : its scalar operands. REDUC_INDEX is the index of the operand in
7132 : : OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
7133 : : implements in-order reduction, or IFN_LAST if we should open-code it.
7134 : : VECTYPE_IN is the type of the vector input. MASKS specifies the masks
7135 : : that should be used to control the operation in a fully-masked loop. */
7136 : :
7137 : : static bool
7138 |