Branch data Line data Source code
1 : : /* Loop Vectorization
2 : : Copyright (C) 2003-2024 Free Software Foundation, Inc.
3 : : Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 : : Ira Rosen <irar@il.ibm.com>
5 : :
6 : : This file is part of GCC.
7 : :
8 : : GCC is free software; you can redistribute it and/or modify it under
9 : : the terms of the GNU General Public License as published by the Free
10 : : Software Foundation; either version 3, or (at your option) any later
11 : : version.
12 : :
13 : : GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 : : WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 : : FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 : : for more details.
17 : :
18 : : You should have received a copy of the GNU General Public License
19 : : along with GCC; see the file COPYING3. If not see
20 : : <http://www.gnu.org/licenses/>. */
21 : :
22 : : #define INCLUDE_ALGORITHM
23 : : #include "config.h"
24 : : #include "system.h"
25 : : #include "coretypes.h"
26 : : #include "backend.h"
27 : : #include "target.h"
28 : : #include "rtl.h"
29 : : #include "tree.h"
30 : : #include "gimple.h"
31 : : #include "cfghooks.h"
32 : : #include "tree-pass.h"
33 : : #include "ssa.h"
34 : : #include "optabs-tree.h"
35 : : #include "memmodel.h"
36 : : #include "optabs.h"
37 : : #include "diagnostic-core.h"
38 : : #include "fold-const.h"
39 : : #include "stor-layout.h"
40 : : #include "cfganal.h"
41 : : #include "gimplify.h"
42 : : #include "gimple-iterator.h"
43 : : #include "gimplify-me.h"
44 : : #include "tree-ssa-loop-ivopts.h"
45 : : #include "tree-ssa-loop-manip.h"
46 : : #include "tree-ssa-loop-niter.h"
47 : : #include "tree-ssa-loop.h"
48 : : #include "cfgloop.h"
49 : : #include "tree-scalar-evolution.h"
50 : : #include "tree-vectorizer.h"
51 : : #include "gimple-fold.h"
52 : : #include "cgraph.h"
53 : : #include "tree-cfg.h"
54 : : #include "tree-if-conv.h"
55 : : #include "internal-fn.h"
56 : : #include "tree-vector-builder.h"
57 : : #include "vec-perm-indices.h"
58 : : #include "tree-eh.h"
59 : : #include "case-cfn-macros.h"
60 : : #include "langhooks.h"
61 : :
62 : : /* Loop Vectorization Pass.
63 : :
64 : : This pass tries to vectorize loops.
65 : :
66 : : For example, the vectorizer transforms the following simple loop:
67 : :
68 : : short a[N]; short b[N]; short c[N]; int i;
69 : :
70 : : for (i=0; i<N; i++){
71 : : a[i] = b[i] + c[i];
72 : : }
73 : :
74 : : as if it was manually vectorized by rewriting the source code into:
75 : :
76 : : typedef int __attribute__((mode(V8HI))) v8hi;
77 : : short a[N]; short b[N]; short c[N]; int i;
78 : : v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
79 : : v8hi va, vb, vc;
80 : :
81 : : for (i=0; i<N/8; i++){
82 : : vb = pb[i];
83 : : vc = pc[i];
84 : : va = vb + vc;
85 : : pa[i] = va;
86 : : }
87 : :
88 : : The main entry to this pass is vectorize_loops(), in which
89 : : the vectorizer applies a set of analyses on a given set of loops,
90 : : followed by the actual vectorization transformation for the loops that
91 : : had successfully passed the analysis phase.
92 : : Throughout this pass we make a distinction between two types of
93 : : data: scalars (which are represented by SSA_NAMES), and memory references
94 : : ("data-refs"). These two types of data require different handling both
95 : : during analysis and transformation. The types of data-refs that the
96 : : vectorizer currently supports are ARRAY_REFS which base is an array DECL
97 : : (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
98 : : accesses are required to have a simple (consecutive) access pattern.
99 : :
100 : : Analysis phase:
101 : : ===============
102 : : The driver for the analysis phase is vect_analyze_loop().
103 : : It applies a set of analyses, some of which rely on the scalar evolution
104 : : analyzer (scev) developed by Sebastian Pop.
105 : :
106 : : During the analysis phase the vectorizer records some information
107 : : per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
108 : : loop, as well as general information about the loop as a whole, which is
109 : : recorded in a "loop_vec_info" struct attached to each loop.
110 : :
111 : : Transformation phase:
112 : : =====================
113 : : The loop transformation phase scans all the stmts in the loop, and
114 : : creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
115 : : the loop that needs to be vectorized. It inserts the vector code sequence
116 : : just before the scalar stmt S, and records a pointer to the vector code
117 : : in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
118 : : attached to S). This pointer will be used for the vectorization of following
119 : : stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
120 : : otherwise, we rely on dead code elimination for removing it.
121 : :
122 : : For example, say stmt S1 was vectorized into stmt VS1:
123 : :
124 : : VS1: vb = px[i];
125 : : S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
126 : : S2: a = b;
127 : :
128 : : To vectorize stmt S2, the vectorizer first finds the stmt that defines
129 : : the operand 'b' (S1), and gets the relevant vector def 'vb' from the
130 : : vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
131 : : resulting sequence would be:
132 : :
133 : : VS1: vb = px[i];
134 : : S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
135 : : VS2: va = vb;
136 : : S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
137 : :
138 : : Operands that are not SSA_NAMEs, are data-refs that appear in
139 : : load/store operations (like 'x[i]' in S1), and are handled differently.
140 : :
141 : : Target modeling:
142 : : =================
143 : : Currently the only target specific information that is used is the
144 : : size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
145 : : Targets that can support different sizes of vectors, for now will need
146 : : to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
147 : : flexibility will be added in the future.
148 : :
149 : : Since we only vectorize operations which vector form can be
150 : : expressed using existing tree codes, to verify that an operation is
151 : : supported, the vectorizer checks the relevant optab at the relevant
152 : : machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
153 : : the value found is CODE_FOR_nothing, then there's no target support, and
154 : : we can't vectorize the stmt.
155 : :
156 : : For additional information on this project see:
157 : : http://gcc.gnu.org/projects/tree-ssa/vectorization.html
158 : : */
159 : :
160 : : static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
161 : : unsigned *);
162 : : static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
163 : : bool *, bool *, bool);
164 : :
165 : : /* Subroutine of vect_determine_vf_for_stmt that handles only one
166 : : statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
167 : : may already be set for general statements (not just data refs). */
168 : :
169 : : static opt_result
170 : 2271113 : vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
171 : : bool vectype_maybe_set_p,
172 : : poly_uint64 *vf)
173 : : {
174 : 2271113 : gimple *stmt = stmt_info->stmt;
175 : :
176 : 2271113 : if ((!STMT_VINFO_RELEVANT_P (stmt_info)
177 : 1126375 : && !STMT_VINFO_LIVE_P (stmt_info))
178 : 2271203 : || gimple_clobber_p (stmt))
179 : : {
180 : 1126285 : if (dump_enabled_p ())
181 : 113813 : dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
182 : 1126285 : return opt_result::success ();
183 : : }
184 : :
185 : 1144828 : tree stmt_vectype, nunits_vectype;
186 : 1144828 : opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
187 : : &stmt_vectype,
188 : : &nunits_vectype);
189 : 1144828 : if (!res)
190 : 2334 : return res;
191 : :
192 : 1142494 : if (stmt_vectype)
193 : : {
194 : 1142480 : if (STMT_VINFO_VECTYPE (stmt_info))
195 : : /* The only case when a vectype had been already set is for stmts
196 : : that contain a data ref, or for "pattern-stmts" (stmts generated
197 : : by the vectorizer to represent/replace a certain idiom). */
198 : 641953 : gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
199 : : || vectype_maybe_set_p)
200 : : && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
201 : : else
202 : 500527 : STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
203 : : }
204 : :
205 : 1142494 : if (nunits_vectype)
206 : 1142480 : vect_update_max_nunits (vf, nunits_vectype);
207 : :
208 : 1142494 : return opt_result::success ();
209 : : }
210 : :
211 : : /* Subroutine of vect_determine_vectorization_factor. Set the vector
212 : : types of STMT_INFO and all attached pattern statements and update
213 : : the vectorization factor VF accordingly. Return true on success
214 : : or false if something prevented vectorization. */
215 : :
216 : : static opt_result
217 : 1991397 : vect_determine_vf_for_stmt (vec_info *vinfo,
218 : : stmt_vec_info stmt_info, poly_uint64 *vf)
219 : : {
220 : 1991397 : if (dump_enabled_p ())
221 : 203148 : dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
222 : : stmt_info->stmt);
223 : 1991397 : opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
224 : 1991397 : if (!res)
225 : 2334 : return res;
226 : :
227 : 1989063 : if (STMT_VINFO_IN_PATTERN_P (stmt_info)
228 : 134663 : && STMT_VINFO_RELATED_STMT (stmt_info))
229 : : {
230 : 134663 : gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
231 : 134663 : stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
232 : :
233 : : /* If a pattern statement has def stmts, analyze them too. */
234 : 134663 : for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
235 : 279716 : !gsi_end_p (si); gsi_next (&si))
236 : : {
237 : 145053 : stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
238 : 145053 : if (dump_enabled_p ())
239 : 18956 : dump_printf_loc (MSG_NOTE, vect_location,
240 : : "==> examining pattern def stmt: %G",
241 : : def_stmt_info->stmt);
242 : 145053 : res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
243 : 145053 : if (!res)
244 : 0 : return res;
245 : : }
246 : :
247 : 134663 : if (dump_enabled_p ())
248 : 15570 : dump_printf_loc (MSG_NOTE, vect_location,
249 : : "==> examining pattern statement: %G",
250 : : stmt_info->stmt);
251 : 134663 : res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
252 : 134663 : if (!res)
253 : 0 : return res;
254 : : }
255 : :
256 : 1989063 : return opt_result::success ();
257 : : }
258 : :
259 : : /* Function vect_determine_vectorization_factor
260 : :
261 : : Determine the vectorization factor (VF). VF is the number of data elements
262 : : that are operated upon in parallel in a single iteration of the vectorized
263 : : loop. For example, when vectorizing a loop that operates on 4byte elements,
264 : : on a target with vector size (VS) 16byte, the VF is set to 4, since 4
265 : : elements can fit in a single vector register.
266 : :
267 : : We currently support vectorization of loops in which all types operated upon
268 : : are of the same size. Therefore this function currently sets VF according to
269 : : the size of the types operated upon, and fails if there are multiple sizes
270 : : in the loop.
271 : :
272 : : VF is also the factor by which the loop iterations are strip-mined, e.g.:
273 : : original loop:
274 : : for (i=0; i<N; i++){
275 : : a[i] = b[i] + c[i];
276 : : }
277 : :
278 : : vectorized loop:
279 : : for (i=0; i<N; i+=VF){
280 : : a[i:VF] = b[i:VF] + c[i:VF];
281 : : }
282 : : */
283 : :
284 : : static opt_result
285 : 204462 : vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
286 : : {
287 : 204462 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
288 : 204462 : basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
289 : 204462 : unsigned nbbs = loop->num_nodes;
290 : 204462 : poly_uint64 vectorization_factor = 1;
291 : 204462 : tree scalar_type = NULL_TREE;
292 : 204462 : gphi *phi;
293 : 204462 : tree vectype;
294 : 204462 : stmt_vec_info stmt_info;
295 : 204462 : unsigned i;
296 : :
297 : 204462 : DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
298 : :
299 : 641647 : for (i = 0; i < nbbs; i++)
300 : : {
301 : 444154 : basic_block bb = bbs[i];
302 : :
303 : 994060 : for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
304 : 549906 : gsi_next (&si))
305 : : {
306 : 554541 : phi = si.phi ();
307 : 554541 : stmt_info = loop_vinfo->lookup_stmt (phi);
308 : 554541 : if (dump_enabled_p ())
309 : 51582 : dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
310 : : (gimple *) phi);
311 : :
312 : 554541 : gcc_assert (stmt_info);
313 : :
314 : 554541 : if (STMT_VINFO_RELEVANT_P (stmt_info)
315 : 375817 : || STMT_VINFO_LIVE_P (stmt_info))
316 : : {
317 : 178724 : gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
318 : 178724 : scalar_type = TREE_TYPE (PHI_RESULT (phi));
319 : :
320 : 178724 : if (dump_enabled_p ())
321 : 10195 : dump_printf_loc (MSG_NOTE, vect_location,
322 : : "get vectype for scalar type: %T\n",
323 : : scalar_type);
324 : :
325 : 178724 : vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
326 : 178724 : if (!vectype)
327 : 4635 : return opt_result::failure_at (phi,
328 : : "not vectorized: unsupported "
329 : : "data-type %T\n",
330 : : scalar_type);
331 : 174089 : STMT_VINFO_VECTYPE (stmt_info) = vectype;
332 : :
333 : 174089 : if (dump_enabled_p ())
334 : 10137 : dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
335 : : vectype);
336 : :
337 : 174089 : if (dump_enabled_p ())
338 : : {
339 : 10137 : dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
340 : 10137 : dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
341 : 10137 : dump_printf (MSG_NOTE, "\n");
342 : : }
343 : :
344 : 174089 : vect_update_max_nunits (&vectorization_factor, vectype);
345 : : }
346 : : }
347 : :
348 : 3468283 : for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
349 : 2589245 : gsi_next (&si))
350 : : {
351 : 2591579 : if (is_gimple_debug (gsi_stmt (si)))
352 : 600182 : continue;
353 : 1991397 : stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
354 : 1991397 : opt_result res
355 : 1991397 : = vect_determine_vf_for_stmt (loop_vinfo,
356 : : stmt_info, &vectorization_factor);
357 : 1991397 : if (!res)
358 : 2334 : return res;
359 : : }
360 : : }
361 : :
362 : : /* TODO: Analyze cost. Decide if worth while to vectorize. */
363 : 197493 : if (dump_enabled_p ())
364 : : {
365 : 16871 : dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
366 : 16871 : dump_dec (MSG_NOTE, vectorization_factor);
367 : 16871 : dump_printf (MSG_NOTE, "\n");
368 : : }
369 : :
370 : 197493 : if (known_le (vectorization_factor, 1U))
371 : 26842 : return opt_result::failure_at (vect_location,
372 : : "not vectorized: unsupported data-type\n");
373 : 170651 : LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
374 : 170651 : return opt_result::success ();
375 : : }
376 : :
377 : :
378 : : /* Function vect_is_simple_iv_evolution.
379 : :
380 : : FORNOW: A simple evolution of an induction variables in the loop is
381 : : considered a polynomial evolution. */
382 : :
383 : : static bool
384 : 583470 : vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
385 : : tree * step)
386 : : {
387 : 583470 : tree init_expr;
388 : 583470 : tree step_expr;
389 : 583470 : tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
390 : 583470 : basic_block bb;
391 : :
392 : : /* When there is no evolution in this loop, the evolution function
393 : : is not "simple". */
394 : 583470 : if (evolution_part == NULL_TREE)
395 : : return false;
396 : :
397 : : /* When the evolution is a polynomial of degree >= 2
398 : : the evolution function is not "simple". */
399 : 630338 : if (tree_is_chrec (evolution_part))
400 : : return false;
401 : :
402 : 534634 : step_expr = evolution_part;
403 : 534634 : init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
404 : :
405 : 534634 : if (dump_enabled_p ())
406 : 39012 : dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
407 : : step_expr, init_expr);
408 : :
409 : 534634 : *init = init_expr;
410 : 534634 : *step = step_expr;
411 : :
412 : 534634 : if (TREE_CODE (step_expr) != INTEGER_CST
413 : 52082 : && (TREE_CODE (step_expr) != SSA_NAME
414 : 48506 : || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
415 : 48274 : && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
416 : 5236 : || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
417 : 117 : && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
418 : 117 : || !flag_associative_math)))
419 : 581567 : && (TREE_CODE (step_expr) != REAL_CST
420 : 567 : || !flag_associative_math))
421 : : {
422 : 46868 : if (dump_enabled_p ())
423 : 2927 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
424 : : "step unknown.\n");
425 : 46868 : return false;
426 : : }
427 : :
428 : : return true;
429 : : }
430 : :
431 : : /* Function vect_is_nonlinear_iv_evolution
432 : :
433 : : Only support nonlinear induction for integer type
434 : : 1. neg
435 : : 2. mul by constant
436 : : 3. lshift/rshift by constant.
437 : :
438 : : For neg induction, return a fake step as integer -1. */
439 : : static bool
440 : 93854 : vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
441 : : gphi* loop_phi_node, tree *init, tree *step)
442 : : {
443 : 93854 : tree init_expr, ev_expr, result, op1, op2;
444 : 93854 : gimple* def;
445 : :
446 : 93854 : if (gimple_phi_num_args (loop_phi_node) != 2)
447 : : return false;
448 : :
449 : 93854 : init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
450 : 93854 : ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
451 : :
452 : : /* Support nonlinear induction only for integer type. */
453 : 93854 : if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
454 : : return false;
455 : :
456 : 69662 : *init = init_expr;
457 : 69662 : result = PHI_RESULT (loop_phi_node);
458 : :
459 : 69662 : if (TREE_CODE (ev_expr) != SSA_NAME
460 : 66615 : || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
461 : 69662 : || !is_gimple_assign (def))
462 : : return false;
463 : :
464 : 62507 : enum tree_code t_code = gimple_assign_rhs_code (def);
465 : 62507 : switch (t_code)
466 : : {
467 : 1541 : case NEGATE_EXPR:
468 : 1541 : if (gimple_assign_rhs1 (def) != result)
469 : : return false;
470 : 1541 : *step = build_int_cst (TREE_TYPE (init_expr), -1);
471 : 1541 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
472 : 1541 : break;
473 : :
474 : 7775 : case RSHIFT_EXPR:
475 : 7775 : case LSHIFT_EXPR:
476 : 7775 : case MULT_EXPR:
477 : 7775 : op1 = gimple_assign_rhs1 (def);
478 : 7775 : op2 = gimple_assign_rhs2 (def);
479 : 7775 : if (TREE_CODE (op2) != INTEGER_CST
480 : 4473 : || op1 != result)
481 : : return false;
482 : 4359 : *step = op2;
483 : 4359 : if (t_code == LSHIFT_EXPR)
484 : 160 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
485 : 4199 : else if (t_code == RSHIFT_EXPR)
486 : 3537 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
487 : : /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
488 : : else
489 : 662 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
490 : : break;
491 : :
492 : : default:
493 : : return false;
494 : : }
495 : :
496 : 5900 : STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
497 : 5900 : STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
498 : :
499 : 5900 : return true;
500 : : }
501 : :
502 : : /* Return true if PHI, described by STMT_INFO, is the inner PHI in
503 : : what we are assuming is a double reduction. For example, given
504 : : a structure like this:
505 : :
506 : : outer1:
507 : : x_1 = PHI <x_4(outer2), ...>;
508 : : ...
509 : :
510 : : inner:
511 : : x_2 = PHI <x_1(outer1), ...>;
512 : : ...
513 : : x_3 = ...;
514 : : ...
515 : :
516 : : outer2:
517 : : x_4 = PHI <x_3(inner)>;
518 : : ...
519 : :
520 : : outer loop analysis would treat x_1 as a double reduction phi and
521 : : this function would then return true for x_2. */
522 : :
523 : : static bool
524 : 584011 : vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
525 : : {
526 : 584011 : use_operand_p use_p;
527 : 584011 : ssa_op_iter op_iter;
528 : 1751454 : FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
529 : 1167984 : if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
530 : 582082 : if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
531 : : return true;
532 : : return false;
533 : : }
534 : :
535 : : /* Returns true if Phi is a first-order recurrence. A first-order
536 : : recurrence is a non-reduction recurrence relation in which the value of
537 : : the recurrence in the current loop iteration equals a value defined in
538 : : the previous iteration. */
539 : :
540 : : static bool
541 : 20105 : vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
542 : : gphi *phi)
543 : : {
544 : : /* A nested cycle isn't vectorizable as first order recurrence. */
545 : 20105 : if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
546 : : return false;
547 : :
548 : : /* Ensure the loop latch definition is from within the loop. */
549 : 19979 : edge latch = loop_latch_edge (loop);
550 : 19979 : tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
551 : 19979 : if (TREE_CODE (ldef) != SSA_NAME
552 : 16664 : || SSA_NAME_IS_DEFAULT_DEF (ldef)
553 : 16636 : || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
554 : 35564 : || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
555 : 4707 : return false;
556 : :
557 : 15272 : tree def = gimple_phi_result (phi);
558 : :
559 : : /* Ensure every use_stmt of the phi node is dominated by the latch
560 : : definition. */
561 : 15272 : imm_use_iterator imm_iter;
562 : 15272 : use_operand_p use_p;
563 : 16603 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
564 : 16285 : if (!is_gimple_debug (USE_STMT (use_p))
565 : 32378 : && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
566 : 10565 : || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
567 : : USE_STMT (use_p))))
568 : 14954 : return false;
569 : :
570 : : /* First-order recurrence autovectorization needs shuffle vector. */
571 : 318 : tree scalar_type = TREE_TYPE (def);
572 : 318 : tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
573 : 318 : if (!vectype)
574 : : return false;
575 : :
576 : : return true;
577 : : }
578 : :
579 : : /* Function vect_analyze_scalar_cycles_1.
580 : :
581 : : Examine the cross iteration def-use cycles of scalar variables
582 : : in LOOP. LOOP_VINFO represents the loop that is now being
583 : : considered for vectorization (can be LOOP, or an outer-loop
584 : : enclosing LOOP). SLP indicates there will be some subsequent
585 : : slp analyses or not. */
586 : :
587 : : static void
588 : 288859 : vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
589 : : bool slp)
590 : : {
591 : 288859 : basic_block bb = loop->header;
592 : 288859 : tree init, step;
593 : 288859 : auto_vec<stmt_vec_info, 64> worklist;
594 : 288859 : gphi_iterator gsi;
595 : 288859 : bool double_reduc, reduc_chain;
596 : :
597 : 288859 : DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
598 : :
599 : : /* First - identify all inductions. Reduction detection assumes that all the
600 : : inductions have been identified, therefore, this order must not be
601 : : changed. */
602 : 1040020 : for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
603 : : {
604 : 751161 : gphi *phi = gsi.phi ();
605 : 751161 : tree access_fn = NULL;
606 : 751161 : tree def = PHI_RESULT (phi);
607 : 751161 : stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
608 : :
609 : 751161 : if (dump_enabled_p ())
610 : 55787 : dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
611 : : (gimple *) phi);
612 : :
613 : : /* Skip virtual phi's. The data dependences that are associated with
614 : : virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
615 : 1502322 : if (virtual_operand_p (def))
616 : 257506 : continue;
617 : :
618 : 584011 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
619 : :
620 : : /* Analyze the evolution function. */
621 : 584011 : access_fn = analyze_scalar_evolution (loop, def);
622 : 584011 : if (access_fn)
623 : : {
624 : 584011 : STRIP_NOPS (access_fn);
625 : 584011 : if (dump_enabled_p ())
626 : 40946 : dump_printf_loc (MSG_NOTE, vect_location,
627 : : "Access function of PHI: %T\n", access_fn);
628 : 584011 : STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
629 : 584011 : = initial_condition_in_loop_num (access_fn, loop->num);
630 : 584011 : STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
631 : 584011 : = evolution_part_in_loop_num (access_fn, loop->num);
632 : : }
633 : :
634 : 674367 : if ((!access_fn
635 : 584011 : || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
636 : 583470 : || !vect_is_simple_iv_evolution (loop->num, access_fn,
637 : : &init, &step)
638 : 487766 : || (LOOP_VINFO_LOOP (loop_vinfo) != loop
639 : 8277 : && TREE_CODE (step) != INTEGER_CST))
640 : : /* Only handle nonlinear iv for same loop. */
641 : 680267 : && (LOOP_VINFO_LOOP (loop_vinfo) != loop
642 : 93854 : || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
643 : : phi, &init, &step)))
644 : : {
645 : 90356 : worklist.safe_push (stmt_vinfo);
646 : 90356 : continue;
647 : : }
648 : :
649 : 493655 : gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
650 : : != NULL_TREE);
651 : 493655 : gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
652 : :
653 : 493655 : if (dump_enabled_p ())
654 : 36202 : dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
655 : 493655 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
656 : : }
657 : :
658 : :
659 : : /* Second - identify all reductions and nested cycles. */
660 : 379215 : while (worklist.length () > 0)
661 : : {
662 : 90356 : stmt_vec_info stmt_vinfo = worklist.pop ();
663 : 90356 : gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
664 : 90356 : tree def = PHI_RESULT (phi);
665 : :
666 : 90356 : if (dump_enabled_p ())
667 : 4744 : dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
668 : : (gimple *) phi);
669 : :
670 : 180712 : gcc_assert (!virtual_operand_p (def)
671 : : && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
672 : :
673 : 90356 : stmt_vec_info reduc_stmt_info
674 : 90356 : = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
675 : 90356 : &reduc_chain, slp);
676 : 90356 : if (reduc_stmt_info)
677 : : {
678 : 70251 : STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
679 : 70251 : STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
680 : 70251 : if (double_reduc)
681 : : {
682 : 541 : if (dump_enabled_p ())
683 : 126 : dump_printf_loc (MSG_NOTE, vect_location,
684 : : "Detected double reduction.\n");
685 : :
686 : 541 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
687 : 541 : STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
688 : : }
689 : : else
690 : : {
691 : 69710 : if (loop != LOOP_VINFO_LOOP (loop_vinfo))
692 : : {
693 : 2276 : if (dump_enabled_p ())
694 : 461 : dump_printf_loc (MSG_NOTE, vect_location,
695 : : "Detected vectorizable nested cycle.\n");
696 : :
697 : 2276 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
698 : : }
699 : : else
700 : : {
701 : 67434 : if (dump_enabled_p ())
702 : 3668 : dump_printf_loc (MSG_NOTE, vect_location,
703 : : "Detected reduction.\n");
704 : :
705 : 67434 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
706 : 67434 : STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
707 : : /* Store the reduction cycles for possible vectorization in
708 : : loop-aware SLP if it was not detected as reduction
709 : : chain. */
710 : 67434 : if (! reduc_chain)
711 : 66640 : LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
712 : 66640 : (reduc_stmt_info);
713 : : }
714 : : }
715 : : }
716 : 20105 : else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
717 : 312 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
718 : : else
719 : 19793 : if (dump_enabled_p ())
720 : 426 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
721 : : "Unknown def-use cycle pattern.\n");
722 : : }
723 : 288859 : }
724 : :
725 : :
726 : : /* Function vect_analyze_scalar_cycles.
727 : :
728 : : Examine the cross iteration def-use cycles of scalar variables, by
729 : : analyzing the loop-header PHIs of scalar variables. Classify each
730 : : cycle as one of the following: invariant, induction, reduction, unknown.
731 : : We do that for the loop represented by LOOP_VINFO, and also to its
732 : : inner-loop, if exists.
733 : : Examples for scalar cycles:
734 : :
735 : : Example1: reduction:
736 : :
737 : : loop1:
738 : : for (i=0; i<N; i++)
739 : : sum += a[i];
740 : :
741 : : Example2: induction:
742 : :
743 : : loop2:
744 : : for (i=0; i<N; i++)
745 : : a[i] = i; */
746 : :
747 : : static void
748 : 284647 : vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp)
749 : : {
750 : 284647 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
751 : :
752 : 284647 : vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp);
753 : :
754 : : /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
755 : : Reductions in such inner-loop therefore have different properties than
756 : : the reductions in the nest that gets vectorized:
757 : : 1. When vectorized, they are executed in the same order as in the original
758 : : scalar loop, so we can't change the order of computation when
759 : : vectorizing them.
760 : : 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
761 : : current checks are too strict. */
762 : :
763 : 284647 : if (loop->inner)
764 : 4212 : vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner, slp);
765 : 284647 : }
766 : :
767 : : /* Transfer group and reduction information from STMT_INFO to its
768 : : pattern stmt. */
769 : :
770 : : static void
771 : 17 : vect_fixup_reduc_chain (stmt_vec_info stmt_info)
772 : : {
773 : 17 : stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
774 : 17 : stmt_vec_info stmtp;
775 : 17 : gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
776 : : && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
777 : 17 : REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
778 : 150 : do
779 : : {
780 : 150 : stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
781 : 150 : gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
782 : : == STMT_VINFO_DEF_TYPE (stmt_info));
783 : 150 : REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
784 : 150 : stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
785 : 150 : if (stmt_info)
786 : 133 : REDUC_GROUP_NEXT_ELEMENT (stmtp)
787 : 133 : = STMT_VINFO_RELATED_STMT (stmt_info);
788 : : }
789 : 150 : while (stmt_info);
790 : 17 : }
791 : :
792 : : /* Fixup scalar cycles that now have their stmts detected as patterns. */
793 : :
794 : : static void
795 : 284647 : vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
796 : : {
797 : 284647 : stmt_vec_info first;
798 : 284647 : unsigned i;
799 : :
800 : 285441 : FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
801 : : {
802 : 794 : stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
803 : 3459 : while (next)
804 : : {
805 : 2682 : if ((STMT_VINFO_IN_PATTERN_P (next)
806 : 2682 : != STMT_VINFO_IN_PATTERN_P (first))
807 : 5347 : || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
808 : : break;
809 : 2665 : next = REDUC_GROUP_NEXT_ELEMENT (next);
810 : : }
811 : : /* If all reduction chain members are well-formed patterns adjust
812 : : the group to group the pattern stmts instead. */
813 : 794 : if (! next
814 : 811 : && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
815 : : {
816 : 777 : if (STMT_VINFO_IN_PATTERN_P (first))
817 : : {
818 : 17 : vect_fixup_reduc_chain (first);
819 : 34 : LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
820 : 17 : = STMT_VINFO_RELATED_STMT (first);
821 : : }
822 : : }
823 : : /* If not all stmt in the chain are patterns or if we failed
824 : : to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
825 : : it as regular reduction instead. */
826 : : else
827 : : {
828 : : stmt_vec_info vinfo = first;
829 : : stmt_vec_info last = NULL;
830 : 67 : while (vinfo)
831 : : {
832 : 50 : next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
833 : 50 : REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
834 : 50 : REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
835 : 50 : last = vinfo;
836 : 50 : vinfo = next;
837 : : }
838 : 17 : STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
839 : 17 : = vect_internal_def;
840 : 18 : loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
841 : 17 : LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
842 : 17 : --i;
843 : : }
844 : : }
845 : 284647 : }
846 : :
847 : : /* Function vect_get_loop_niters.
848 : :
849 : : Determine how many iterations the loop is executed and place it
850 : : in NUMBER_OF_ITERATIONS. Place the number of latch iterations
851 : : in NUMBER_OF_ITERATIONSM1. Place the condition under which the
852 : : niter information holds in ASSUMPTIONS.
853 : :
854 : : Return the loop exit conditions. */
855 : :
856 : :
857 : : static vec<gcond *>
858 : 218678 : vect_get_loop_niters (class loop *loop, const_edge main_exit, tree *assumptions,
859 : : tree *number_of_iterations, tree *number_of_iterationsm1)
860 : : {
861 : 218678 : auto_vec<edge> exits = get_loop_exit_edges (loop);
862 : 218678 : vec<gcond *> conds;
863 : 437356 : conds.create (exits.length ());
864 : 218678 : class tree_niter_desc niter_desc;
865 : 218678 : tree niter_assumptions, niter, may_be_zero;
866 : :
867 : 218678 : *assumptions = boolean_true_node;
868 : 218678 : *number_of_iterationsm1 = chrec_dont_know;
869 : 218678 : *number_of_iterations = chrec_dont_know;
870 : :
871 : 218678 : DUMP_VECT_SCOPE ("get_loop_niters");
872 : :
873 : 218678 : if (exits.is_empty ())
874 : 0 : return conds;
875 : :
876 : 218678 : if (dump_enabled_p ())
877 : 13271 : dump_printf_loc (MSG_NOTE, vect_location, "Loop has %d exits.\n",
878 : : exits.length ());
879 : :
880 : : edge exit;
881 : : unsigned int i;
882 : 522705 : FOR_EACH_VEC_ELT (exits, i, exit)
883 : : {
884 : 304027 : gcond *cond = get_loop_exit_condition (exit);
885 : 304027 : if (cond)
886 : 295104 : conds.safe_push (cond);
887 : :
888 : 304027 : if (dump_enabled_p ())
889 : 14177 : dump_printf_loc (MSG_NOTE, vect_location, "Analyzing exit %d...\n", i);
890 : :
891 : 304027 : if (exit != main_exit)
892 : 118290 : continue;
893 : :
894 : 218678 : may_be_zero = NULL_TREE;
895 : 218678 : if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
896 : 218678 : || chrec_contains_undetermined (niter_desc.niter))
897 : 32941 : continue;
898 : :
899 : 185737 : niter_assumptions = niter_desc.assumptions;
900 : 185737 : may_be_zero = niter_desc.may_be_zero;
901 : 185737 : niter = niter_desc.niter;
902 : :
903 : 185737 : if (may_be_zero && integer_zerop (may_be_zero))
904 : : may_be_zero = NULL_TREE;
905 : :
906 : 11915 : if (may_be_zero)
907 : : {
908 : 11915 : if (COMPARISON_CLASS_P (may_be_zero))
909 : : {
910 : : /* Try to combine may_be_zero with assumptions, this can simplify
911 : : computation of niter expression. */
912 : 11915 : if (niter_assumptions && !integer_nonzerop (niter_assumptions))
913 : 989 : niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
914 : : niter_assumptions,
915 : : fold_build1 (TRUTH_NOT_EXPR,
916 : : boolean_type_node,
917 : : may_be_zero));
918 : : else
919 : 10926 : niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
920 : : build_int_cst (TREE_TYPE (niter), 0),
921 : : rewrite_to_non_trapping_overflow (niter));
922 : :
923 : 185737 : may_be_zero = NULL_TREE;
924 : : }
925 : 0 : else if (integer_nonzerop (may_be_zero))
926 : : {
927 : 0 : *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
928 : 0 : *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
929 : 0 : continue;
930 : : }
931 : : else
932 : 0 : continue;
933 : : }
934 : :
935 : : /* Loop assumptions are based off the normal exit. */
936 : 185737 : *assumptions = niter_assumptions;
937 : 185737 : *number_of_iterationsm1 = niter;
938 : :
939 : : /* We want the number of loop header executions which is the number
940 : : of latch executions plus one.
941 : : ??? For UINT_MAX latch executions this number overflows to zero
942 : : for loops like do { n++; } while (n != 0); */
943 : 185737 : if (niter && !chrec_contains_undetermined (niter))
944 : : {
945 : 185737 : niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter),
946 : : unshare_expr (niter),
947 : : build_int_cst (TREE_TYPE (niter), 1));
948 : 185737 : if (TREE_CODE (niter) == INTEGER_CST
949 : 98196 : && TREE_CODE (*number_of_iterationsm1) != INTEGER_CST)
950 : : {
951 : : /* If we manage to fold niter + 1 into INTEGER_CST even when
952 : : niter is some complex expression, ensure back
953 : : *number_of_iterationsm1 is an INTEGER_CST as well. See
954 : : PR113210. */
955 : 4 : *number_of_iterationsm1
956 : 4 : = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), niter,
957 : : build_minus_one_cst (TREE_TYPE (niter)));
958 : : }
959 : : }
960 : 185737 : *number_of_iterations = niter;
961 : : }
962 : :
963 : 218678 : if (dump_enabled_p ())
964 : 13271 : dump_printf_loc (MSG_NOTE, vect_location, "All loop exits successfully analyzed.\n");
965 : :
966 : 218678 : return conds;
967 : 218678 : }
968 : :
969 : : /* Determine the main loop exit for the vectorizer. */
970 : :
971 : : edge
972 : 395716 : vec_init_loop_exit_info (class loop *loop)
973 : : {
974 : : /* Before we begin we must first determine which exit is the main one and
975 : : which are auxilary exits. */
976 : 395716 : auto_vec<edge> exits = get_loop_exit_edges (loop);
977 : 395716 : if (exits.length () == 1)
978 : 247240 : return exits[0];
979 : :
980 : : /* If we have multiple exits we only support counting IV at the moment.
981 : : Analyze all exits and return the last one we can analyze. */
982 : 148476 : class tree_niter_desc niter_desc;
983 : 148476 : edge candidate = NULL;
984 : 1013476 : for (edge exit : exits)
985 : : {
986 : 577546 : if (!get_loop_exit_condition (exit))
987 : 168645 : continue;
988 : :
989 : 408901 : if (number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
990 : 408901 : && !chrec_contains_undetermined (niter_desc.niter))
991 : : {
992 : 112722 : tree may_be_zero = niter_desc.may_be_zero;
993 : 112722 : if ((integer_zerop (may_be_zero)
994 : : /* As we are handling may_be_zero that's not false by
995 : : rewriting niter to may_be_zero ? 0 : niter we require
996 : : an empty latch. */
997 : 589907 : || (single_pred_p (loop->latch)
998 : 11996 : && exit->src == single_pred (loop->latch)
999 : 4456 : && (integer_nonzerop (may_be_zero)
1000 : 4456 : || COMPARISON_CLASS_P (may_be_zero))))
1001 : 117178 : && (!candidate
1002 : 3931 : || dominated_by_p (CDI_DOMINATORS, exit->src,
1003 : 3931 : candidate->src)))
1004 : : candidate = exit;
1005 : : }
1006 : : }
1007 : :
1008 : 148476 : return candidate;
1009 : 148476 : }
1010 : :
1011 : : /* Function bb_in_loop_p
1012 : :
1013 : : Used as predicate for dfs order traversal of the loop bbs. */
1014 : :
1015 : : static bool
1016 : 1099790 : bb_in_loop_p (const_basic_block bb, const void *data)
1017 : : {
1018 : 1099790 : const class loop *const loop = (const class loop *)data;
1019 : 1099790 : if (flow_bb_inside_loop_p (loop, bb))
1020 : : return true;
1021 : : return false;
1022 : : }
1023 : :
1024 : :
1025 : : /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1026 : : stmt_vec_info structs for all the stmts in LOOP_IN. */
1027 : :
1028 : 368786 : _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
1029 : : : vec_info (vec_info::loop, shared),
1030 : 368786 : loop (loop_in),
1031 : 368786 : bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1032 : 368786 : num_itersm1 (NULL_TREE),
1033 : 368786 : num_iters (NULL_TREE),
1034 : 368786 : num_iters_unchanged (NULL_TREE),
1035 : 368786 : num_iters_assumptions (NULL_TREE),
1036 : 368786 : vector_costs (nullptr),
1037 : 368786 : scalar_costs (nullptr),
1038 : 368786 : th (0),
1039 : 368786 : versioning_threshold (0),
1040 : 368786 : vectorization_factor (0),
1041 : 368786 : main_loop_edge (nullptr),
1042 : 368786 : skip_main_loop_edge (nullptr),
1043 : 368786 : skip_this_loop_edge (nullptr),
1044 : 368786 : reusable_accumulators (),
1045 : 368786 : suggested_unroll_factor (1),
1046 : 368786 : max_vectorization_factor (0),
1047 : 368786 : mask_skip_niters (NULL_TREE),
1048 : 368786 : rgroup_compare_type (NULL_TREE),
1049 : 368786 : simd_if_cond (NULL_TREE),
1050 : 368786 : partial_vector_style (vect_partial_vectors_none),
1051 : 368786 : unaligned_dr (NULL),
1052 : 368786 : peeling_for_alignment (0),
1053 : 368786 : ptr_mask (0),
1054 : 368786 : ivexpr_map (NULL),
1055 : 368786 : scan_map (NULL),
1056 : 368786 : slp_unrolling_factor (1),
1057 : 368786 : inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
1058 : 368786 : vectorizable (false),
1059 : 368786 : can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
1060 : 368786 : using_partial_vectors_p (false),
1061 : 368786 : using_decrementing_iv_p (false),
1062 : 368786 : using_select_vl_p (false),
1063 : 368786 : epil_using_partial_vectors_p (false),
1064 : 368786 : partial_load_store_bias (0),
1065 : 368786 : peeling_for_gaps (false),
1066 : 368786 : peeling_for_niter (false),
1067 : 368786 : early_breaks (false),
1068 : 368786 : no_data_dependencies (false),
1069 : 368786 : has_mask_store (false),
1070 : 368786 : scalar_loop_scaling (profile_probability::uninitialized ()),
1071 : 368786 : scalar_loop (NULL),
1072 : 368786 : orig_loop_info (NULL),
1073 : 368786 : vec_loop_iv_exit (NULL),
1074 : 368786 : vec_epilogue_loop_iv_exit (NULL),
1075 : 737572 : scalar_loop_iv_exit (NULL)
1076 : : {
1077 : : /* CHECKME: We want to visit all BBs before their successors (except for
1078 : : latch blocks, for which this assertion wouldn't hold). In the simple
1079 : : case of the loop forms we allow, a dfs order of the BBs would the same
1080 : : as reversed postorder traversal, so we are safe. */
1081 : :
1082 : 368786 : unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1083 : 368786 : bbs, loop->num_nodes, loop);
1084 : 368786 : gcc_assert (nbbs == loop->num_nodes);
1085 : :
1086 : 1297081 : for (unsigned int i = 0; i < nbbs; i++)
1087 : : {
1088 : 928295 : basic_block bb = bbs[i];
1089 : 928295 : gimple_stmt_iterator si;
1090 : :
1091 : 1924325 : for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1092 : : {
1093 : 996030 : gimple *phi = gsi_stmt (si);
1094 : 996030 : gimple_set_uid (phi, 0);
1095 : 996030 : add_stmt (phi);
1096 : : }
1097 : :
1098 : 7566043 : for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1099 : : {
1100 : 5709453 : gimple *stmt = gsi_stmt (si);
1101 : 5709453 : gimple_set_uid (stmt, 0);
1102 : 5709453 : if (is_gimple_debug (stmt))
1103 : 1778172 : continue;
1104 : 3931281 : add_stmt (stmt);
1105 : : /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
1106 : : third argument is the #pragma omp simd if (x) condition, when 0,
1107 : : loop shouldn't be vectorized, when non-zero constant, it should
1108 : : be vectorized normally, otherwise versioned with vectorized loop
1109 : : done if the condition is non-zero at runtime. */
1110 : 3931281 : if (loop_in->simduid
1111 : 45268 : && is_gimple_call (stmt)
1112 : 4471 : && gimple_call_internal_p (stmt)
1113 : 4305 : && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
1114 : 4305 : && gimple_call_num_args (stmt) >= 3
1115 : 104 : && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
1116 : 3931385 : && (loop_in->simduid
1117 : 104 : == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
1118 : : {
1119 : 104 : tree arg = gimple_call_arg (stmt, 2);
1120 : 104 : if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
1121 : 104 : simd_if_cond = arg;
1122 : : else
1123 : 0 : gcc_assert (integer_nonzerop (arg));
1124 : : }
1125 : : }
1126 : : }
1127 : :
1128 : 368786 : epilogue_vinfos.create (6);
1129 : 368786 : }
1130 : :
1131 : : /* Free all levels of rgroup CONTROLS. */
1132 : :
1133 : : void
1134 : 739312 : release_vec_loop_controls (vec<rgroup_controls> *controls)
1135 : : {
1136 : 739312 : rgroup_controls *rgc;
1137 : 739312 : unsigned int i;
1138 : 739328 : FOR_EACH_VEC_ELT (*controls, i, rgc)
1139 : 16 : rgc->controls.release ();
1140 : 739312 : controls->release ();
1141 : 739312 : }
1142 : :
1143 : : /* Free all memory used by the _loop_vec_info, as well as all the
1144 : : stmt_vec_info structs of all the stmts in the loop. */
1145 : :
1146 : 368785 : _loop_vec_info::~_loop_vec_info ()
1147 : : {
1148 : 368785 : free (bbs);
1149 : :
1150 : 368785 : release_vec_loop_controls (&masks.rgc_vec);
1151 : 368785 : release_vec_loop_controls (&lens);
1152 : 372382 : delete ivexpr_map;
1153 : 369141 : delete scan_map;
1154 : 368785 : epilogue_vinfos.release ();
1155 : 368785 : delete scalar_costs;
1156 : 368785 : delete vector_costs;
1157 : :
1158 : : /* When we release an epiloge vinfo that we do not intend to use
1159 : : avoid clearing AUX of the main loop which should continue to
1160 : : point to the main loop vinfo since otherwise we'll leak that. */
1161 : 368785 : if (loop->aux == this)
1162 : 45298 : loop->aux = NULL;
1163 : 737570 : }
1164 : :
1165 : : /* Return an invariant or register for EXPR and emit necessary
1166 : : computations in the LOOP_VINFO loop preheader. */
1167 : :
1168 : : tree
1169 : 20034 : cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1170 : : {
1171 : 20034 : if (is_gimple_reg (expr)
1172 : 20034 : || is_gimple_min_invariant (expr))
1173 : 6604 : return expr;
1174 : :
1175 : 13430 : if (! loop_vinfo->ivexpr_map)
1176 : 3597 : loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1177 : 13430 : tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1178 : 13430 : if (! cached)
1179 : : {
1180 : 8352 : gimple_seq stmts = NULL;
1181 : 8352 : cached = force_gimple_operand (unshare_expr (expr),
1182 : : &stmts, true, NULL_TREE);
1183 : 8352 : if (stmts)
1184 : : {
1185 : 8219 : edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1186 : 8219 : gsi_insert_seq_on_edge_immediate (e, stmts);
1187 : : }
1188 : : }
1189 : 13430 : return cached;
1190 : : }
1191 : :
1192 : : /* Return true if we can use CMP_TYPE as the comparison type to produce
1193 : : all masks required to mask LOOP_VINFO. */
1194 : :
1195 : : static bool
1196 : 50 : can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1197 : : {
1198 : 50 : rgroup_controls *rgm;
1199 : 50 : unsigned int i;
1200 : 53 : FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1201 : 53 : if (rgm->type != NULL_TREE
1202 : 53 : && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1203 : : cmp_type, rgm->type,
1204 : : OPTIMIZE_FOR_SPEED))
1205 : : return false;
1206 : : return true;
1207 : : }
1208 : :
1209 : : /* Calculate the maximum number of scalars per iteration for every
1210 : : rgroup in LOOP_VINFO. */
1211 : :
1212 : : static unsigned int
1213 : 12 : vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1214 : : {
1215 : 12 : unsigned int res = 1;
1216 : 12 : unsigned int i;
1217 : 12 : rgroup_controls *rgm;
1218 : 25 : FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1219 : 13 : res = MAX (res, rgm->max_nscalars_per_iter);
1220 : 12 : return res;
1221 : : }
1222 : :
1223 : : /* Calculate the minimum precision necessary to represent:
1224 : :
1225 : : MAX_NITERS * FACTOR
1226 : :
1227 : : as an unsigned integer, where MAX_NITERS is the maximum number of
1228 : : loop header iterations for the original scalar form of LOOP_VINFO. */
1229 : :
1230 : : static unsigned
1231 : 12 : vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1232 : : {
1233 : 12 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1234 : :
1235 : : /* Get the maximum number of iterations that is representable
1236 : : in the counter type. */
1237 : 12 : tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1238 : 12 : widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1239 : :
1240 : : /* Get a more refined estimate for the number of iterations. */
1241 : 12 : widest_int max_back_edges;
1242 : 12 : if (max_loop_iterations (loop, &max_back_edges))
1243 : 12 : max_ni = wi::smin (max_ni, max_back_edges + 1);
1244 : :
1245 : : /* Work out how many bits we need to represent the limit. */
1246 : 12 : return wi::min_precision (max_ni * factor, UNSIGNED);
1247 : 12 : }
1248 : :
1249 : : /* True if the loop needs peeling or partial vectors when vectorized. */
1250 : :
1251 : : static bool
1252 : 110798 : vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1253 : : {
1254 : 110798 : unsigned HOST_WIDE_INT const_vf;
1255 : 110798 : HOST_WIDE_INT max_niter
1256 : 110798 : = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1257 : :
1258 : 110798 : unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1259 : 110798 : if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1260 : 23032 : th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1261 : : (loop_vinfo));
1262 : :
1263 : 110798 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1264 : 50176 : && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1265 : : {
1266 : : /* Work out the (constant) number of iterations that need to be
1267 : : peeled for reasons other than niters. */
1268 : 50167 : unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1269 : 50167 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1270 : 993 : peel_niter += 1;
1271 : 109629 : if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1272 : 50167 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1273 : : return true;
1274 : : }
1275 : 60631 : else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1276 : : /* ??? When peeling for gaps but not alignment, we could
1277 : : try to check whether the (variable) niters is known to be
1278 : : VF * N + 1. That's something of a niche case though. */
1279 : 60582 : || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1280 : 58876 : || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1281 : 119507 : || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1282 : 117752 : < (unsigned) exact_log2 (const_vf))
1283 : : /* In case of versioning, check if the maximum number of
1284 : : iterations is greater than th. If they are identical,
1285 : : the epilogue is unnecessary. */
1286 : 57729 : && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1287 : 3236 : || ((unsigned HOST_WIDE_INT) max_niter
1288 : : /* We'd like to use LOOP_VINFO_VERSIONING_THRESHOLD
1289 : : but that's only computed later based on our result.
1290 : : The following is the most conservative approximation. */
1291 : 4405 : > (std::max ((unsigned HOST_WIDE_INT) th,
1292 : 3236 : const_vf) / const_vf) * const_vf))))
1293 : 59462 : return true;
1294 : :
1295 : : return false;
1296 : : }
1297 : :
1298 : : /* Each statement in LOOP_VINFO can be masked where necessary. Check
1299 : : whether we can actually generate the masks required. Return true if so,
1300 : : storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
1301 : :
1302 : : static bool
1303 : 12 : vect_verify_full_masking (loop_vec_info loop_vinfo)
1304 : : {
1305 : 12 : unsigned int min_ni_width;
1306 : :
1307 : : /* Use a normal loop if there are no statements that need masking.
1308 : : This only happens in rare degenerate cases: it means that the loop
1309 : : has no loads, no stores, and no live-out values. */
1310 : 12 : if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1311 : : return false;
1312 : :
1313 : : /* Produce the rgroup controls. */
1314 : 36 : for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1315 : : {
1316 : 12 : vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1317 : 12 : tree vectype = mask.first;
1318 : 12 : unsigned nvectors = mask.second;
1319 : :
1320 : 12 : if (masks->rgc_vec.length () < nvectors)
1321 : 12 : masks->rgc_vec.safe_grow_cleared (nvectors, true);
1322 : 12 : rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
1323 : : /* The number of scalars per iteration and the number of vectors are
1324 : : both compile-time constants. */
1325 : 12 : unsigned int nscalars_per_iter
1326 : 12 : = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1327 : 12 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1328 : :
1329 : 12 : if (rgm->max_nscalars_per_iter < nscalars_per_iter)
1330 : : {
1331 : 12 : rgm->max_nscalars_per_iter = nscalars_per_iter;
1332 : 12 : rgm->type = truth_type_for (vectype);
1333 : 12 : rgm->factor = 1;
1334 : : }
1335 : : }
1336 : :
1337 : 12 : unsigned int max_nscalars_per_iter
1338 : 12 : = vect_get_max_nscalars_per_iter (loop_vinfo);
1339 : :
1340 : : /* Work out how many bits we need to represent the limit. */
1341 : 12 : min_ni_width
1342 : 12 : = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1343 : :
1344 : : /* Find a scalar mode for which WHILE_ULT is supported. */
1345 : 12 : opt_scalar_int_mode cmp_mode_iter;
1346 : 12 : tree cmp_type = NULL_TREE;
1347 : 12 : tree iv_type = NULL_TREE;
1348 : 12 : widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1349 : 12 : unsigned int iv_precision = UINT_MAX;
1350 : :
1351 : 12 : if (iv_limit != -1)
1352 : 12 : iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1353 : : UNSIGNED);
1354 : :
1355 : 96 : FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1356 : : {
1357 : 84 : unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1358 : 84 : if (cmp_bits >= min_ni_width
1359 : 84 : && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1360 : : {
1361 : 50 : tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1362 : 50 : if (this_type
1363 : 50 : && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1364 : : {
1365 : : /* Although we could stop as soon as we find a valid mode,
1366 : : there are at least two reasons why that's not always the
1367 : : best choice:
1368 : :
1369 : : - An IV that's Pmode or wider is more likely to be reusable
1370 : : in address calculations than an IV that's narrower than
1371 : : Pmode.
1372 : :
1373 : : - Doing the comparison in IV_PRECISION or wider allows
1374 : : a natural 0-based IV, whereas using a narrower comparison
1375 : : type requires mitigations against wrap-around.
1376 : :
1377 : : Conversely, if the IV limit is variable, doing the comparison
1378 : : in a wider type than the original type can introduce
1379 : : unnecessary extensions, so picking the widest valid mode
1380 : : is not always a good choice either.
1381 : :
1382 : : Here we prefer the first IV type that's Pmode or wider,
1383 : : and the first comparison type that's IV_PRECISION or wider.
1384 : : (The comparison type must be no wider than the IV type,
1385 : : to avoid extensions in the vector loop.)
1386 : :
1387 : : ??? We might want to try continuing beyond Pmode for ILP32
1388 : : targets if CMP_BITS < IV_PRECISION. */
1389 : 0 : iv_type = this_type;
1390 : 0 : if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1391 : : cmp_type = this_type;
1392 : 0 : if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1393 : : break;
1394 : : }
1395 : : }
1396 : : }
1397 : :
1398 : 12 : if (!cmp_type)
1399 : : {
1400 : 12 : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
1401 : 12 : return false;
1402 : : }
1403 : :
1404 : 0 : LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1405 : 0 : LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1406 : 0 : LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
1407 : 0 : return true;
1408 : 12 : }
1409 : :
1410 : : /* Each statement in LOOP_VINFO can be masked where necessary. Check
1411 : : whether we can actually generate AVX512 style masks. Return true if so,
1412 : : storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE. */
1413 : :
1414 : : static bool
1415 : 12 : vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
1416 : : {
1417 : : /* Produce differently organized rgc_vec and differently check
1418 : : we can produce masks. */
1419 : :
1420 : : /* Use a normal loop if there are no statements that need masking.
1421 : : This only happens in rare degenerate cases: it means that the loop
1422 : : has no loads, no stores, and no live-out values. */
1423 : 12 : if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1424 : : return false;
1425 : :
1426 : : /* For the decrementing IV we need to represent all values in
1427 : : [0, niter + niter_skip] where niter_skip is the elements we
1428 : : skip in the first iteration for prologue peeling. */
1429 : 12 : tree iv_type = NULL_TREE;
1430 : 12 : widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1431 : 12 : unsigned int iv_precision = UINT_MAX;
1432 : 12 : if (iv_limit != -1)
1433 : 12 : iv_precision = wi::min_precision (iv_limit, UNSIGNED);
1434 : :
1435 : : /* First compute the type for the IV we use to track the remaining
1436 : : scalar iterations. */
1437 : 12 : opt_scalar_int_mode cmp_mode_iter;
1438 : 22 : FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1439 : : {
1440 : 22 : unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1441 : 22 : if (cmp_bits >= iv_precision
1442 : 22 : && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1443 : : {
1444 : 12 : iv_type = build_nonstandard_integer_type (cmp_bits, true);
1445 : 12 : if (iv_type)
1446 : : break;
1447 : : }
1448 : : }
1449 : 12 : if (!iv_type)
1450 : : return false;
1451 : :
1452 : : /* Produce the rgroup controls. */
1453 : 36 : for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1454 : : {
1455 : 12 : vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1456 : 12 : tree vectype = mask.first;
1457 : 12 : unsigned nvectors = mask.second;
1458 : :
1459 : : /* The number of scalars per iteration and the number of vectors are
1460 : : both compile-time constants. */
1461 : 12 : unsigned int nscalars_per_iter
1462 : 12 : = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1463 : 12 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1464 : :
1465 : : /* We index the rgroup_controls vector with nscalars_per_iter
1466 : : which we keep constant and instead have a varying nvectors,
1467 : : remembering the vector mask with the fewest nV. */
1468 : 12 : if (masks->rgc_vec.length () < nscalars_per_iter)
1469 : 12 : masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true);
1470 : 12 : rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];
1471 : :
1472 : 12 : if (!rgm->type || rgm->factor > nvectors)
1473 : : {
1474 : 12 : rgm->type = truth_type_for (vectype);
1475 : 12 : rgm->compare_type = NULL_TREE;
1476 : 12 : rgm->max_nscalars_per_iter = nscalars_per_iter;
1477 : 12 : rgm->factor = nvectors;
1478 : 12 : rgm->bias_adjusted_ctrl = NULL_TREE;
1479 : : }
1480 : : }
1481 : :
1482 : : /* There is no fixed compare type we are going to use but we have to
1483 : : be able to get at one for each mask group. */
1484 : 12 : unsigned int min_ni_width
1485 : 12 : = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);
1486 : :
1487 : 12 : bool ok = true;
1488 : 50 : for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1489 : : {
1490 : 16 : tree mask_type = rgc.type;
1491 : 16 : if (!mask_type)
1492 : 4 : continue;
1493 : :
1494 : : /* For now vect_get_loop_mask only supports integer mode masks
1495 : : when we need to split it. */
1496 : 12 : if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_INT
1497 : 12 : || TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
1498 : : {
1499 : : ok = false;
1500 : : break;
1501 : : }
1502 : :
1503 : : /* If iv_type is usable as compare type use that - we can elide the
1504 : : saturation in that case. */
1505 : 10 : if (TYPE_PRECISION (iv_type) >= min_ni_width)
1506 : : {
1507 : 10 : tree cmp_vectype
1508 : 10 : = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
1509 : 10 : if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1510 : 1 : rgc.compare_type = cmp_vectype;
1511 : : }
1512 : 10 : if (!rgc.compare_type)
1513 : 28 : FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1514 : : {
1515 : 28 : unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1516 : 28 : if (cmp_bits >= min_ni_width
1517 : 28 : && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1518 : : {
1519 : 28 : tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
1520 : 28 : if (!cmp_type)
1521 : 0 : continue;
1522 : :
1523 : : /* Check whether we can produce the mask with cmp_type. */
1524 : 28 : tree cmp_vectype
1525 : 28 : = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
1526 : 28 : if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1527 : : {
1528 : 9 : rgc.compare_type = cmp_vectype;
1529 : 9 : break;
1530 : : }
1531 : : }
1532 : : }
1533 : 10 : if (!rgc.compare_type)
1534 : : {
1535 : : ok = false;
1536 : : break;
1537 : : }
1538 : : }
1539 : 12 : if (!ok)
1540 : : {
1541 : 2 : release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
1542 : 2 : return false;
1543 : : }
1544 : :
1545 : 10 : LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
1546 : 10 : LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1547 : 10 : LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
1548 : 10 : return true;
1549 : 12 : }
1550 : :
1551 : : /* Check whether we can use vector access with length based on precison
1552 : : comparison. So far, to keep it simple, we only allow the case that the
1553 : : precision of the target supported length is larger than the precision
1554 : : required by loop niters. */
1555 : :
1556 : : static bool
1557 : 0 : vect_verify_loop_lens (loop_vec_info loop_vinfo)
1558 : : {
1559 : 0 : if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1560 : : return false;
1561 : :
1562 : 0 : machine_mode len_load_mode, len_store_mode;
1563 : 0 : if (!get_len_load_store_mode (loop_vinfo->vector_mode, true)
1564 : 0 : .exists (&len_load_mode))
1565 : : return false;
1566 : 0 : if (!get_len_load_store_mode (loop_vinfo->vector_mode, false)
1567 : 0 : .exists (&len_store_mode))
1568 : : return false;
1569 : :
1570 : 0 : signed char partial_load_bias = internal_len_load_store_bias
1571 : 0 : (IFN_LEN_LOAD, len_load_mode);
1572 : :
1573 : 0 : signed char partial_store_bias = internal_len_load_store_bias
1574 : 0 : (IFN_LEN_STORE, len_store_mode);
1575 : :
1576 : 0 : gcc_assert (partial_load_bias == partial_store_bias);
1577 : :
1578 : 0 : if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1579 : : return false;
1580 : :
1581 : : /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1582 : : len_loads with a length of zero. In order to avoid that we prohibit
1583 : : more than one loop length here. */
1584 : 0 : if (partial_load_bias == -1
1585 : 0 : && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1586 : : return false;
1587 : :
1588 : 0 : LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1589 : :
1590 : 0 : unsigned int max_nitems_per_iter = 1;
1591 : 0 : unsigned int i;
1592 : 0 : rgroup_controls *rgl;
1593 : : /* Find the maximum number of items per iteration for every rgroup. */
1594 : 0 : FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1595 : : {
1596 : 0 : unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1597 : 0 : max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1598 : : }
1599 : :
1600 : : /* Work out how many bits we need to represent the length limit. */
1601 : 0 : unsigned int min_ni_prec
1602 : 0 : = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1603 : :
1604 : : /* Now use the maximum of below precisions for one suitable IV type:
1605 : : - the IV's natural precision
1606 : : - the precision needed to hold: the maximum number of scalar
1607 : : iterations multiplied by the scale factor (min_ni_prec above)
1608 : : - the Pmode precision
1609 : :
1610 : : If min_ni_prec is less than the precision of the current niters,
1611 : : we perfer to still use the niters type. Prefer to use Pmode and
1612 : : wider IV to avoid narrow conversions. */
1613 : :
1614 : 0 : unsigned int ni_prec
1615 : 0 : = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1616 : 0 : min_ni_prec = MAX (min_ni_prec, ni_prec);
1617 : 0 : min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1618 : :
1619 : 0 : tree iv_type = NULL_TREE;
1620 : 0 : opt_scalar_int_mode tmode_iter;
1621 : 0 : FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1622 : : {
1623 : 0 : scalar_mode tmode = tmode_iter.require ();
1624 : 0 : unsigned int tbits = GET_MODE_BITSIZE (tmode);
1625 : :
1626 : : /* ??? Do we really want to construct one IV whose precision exceeds
1627 : : BITS_PER_WORD? */
1628 : 0 : if (tbits > BITS_PER_WORD)
1629 : : break;
1630 : :
1631 : : /* Find the first available standard integral type. */
1632 : 0 : if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1633 : : {
1634 : 0 : iv_type = build_nonstandard_integer_type (tbits, true);
1635 : 0 : break;
1636 : : }
1637 : : }
1638 : :
1639 : 0 : if (!iv_type)
1640 : : {
1641 : 0 : if (dump_enabled_p ())
1642 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1643 : : "can't vectorize with length-based partial vectors"
1644 : : " because there is no suitable iv type.\n");
1645 : 0 : return false;
1646 : : }
1647 : :
1648 : 0 : LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1649 : 0 : LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1650 : 0 : LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;
1651 : :
1652 : 0 : return true;
1653 : : }
1654 : :
1655 : : /* Calculate the cost of one scalar iteration of the loop. */
1656 : : static void
1657 : 170651 : vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1658 : : {
1659 : 170651 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1660 : 170651 : basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1661 : 170651 : int nbbs = loop->num_nodes, factor;
1662 : 170651 : int innerloop_iters, i;
1663 : :
1664 : 170651 : DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1665 : :
1666 : : /* Gather costs for statements in the scalar loop. */
1667 : :
1668 : : /* FORNOW. */
1669 : 170651 : innerloop_iters = 1;
1670 : 170651 : if (loop->inner)
1671 : 1153 : innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1672 : :
1673 : 546041 : for (i = 0; i < nbbs; i++)
1674 : : {
1675 : 375390 : gimple_stmt_iterator si;
1676 : 375390 : basic_block bb = bbs[i];
1677 : :
1678 : 375390 : if (bb->loop_father == loop->inner)
1679 : : factor = innerloop_iters;
1680 : : else
1681 : 373084 : factor = 1;
1682 : :
1683 : 2988309 : for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1684 : : {
1685 : 2237529 : gimple *stmt = gsi_stmt (si);
1686 : 2237529 : stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1687 : :
1688 : 2237529 : if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1689 : 678095 : continue;
1690 : :
1691 : : /* Skip stmts that are not vectorized inside the loop. */
1692 : 1559434 : stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1693 : 1559434 : if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1694 : 666468 : && (!STMT_VINFO_LIVE_P (vstmt_info)
1695 : 72 : || !VECTORIZABLE_CYCLE_DEF
1696 : : (STMT_VINFO_DEF_TYPE (vstmt_info))))
1697 : 666468 : continue;
1698 : :
1699 : 892966 : vect_cost_for_stmt kind;
1700 : 892966 : if (STMT_VINFO_DATA_REF (stmt_info))
1701 : : {
1702 : 420667 : if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1703 : : kind = scalar_load;
1704 : : else
1705 : 184006 : kind = scalar_store;
1706 : : }
1707 : 472299 : else if (vect_nop_conversion_p (stmt_info))
1708 : 37822 : continue;
1709 : : else
1710 : : kind = scalar_stmt;
1711 : :
1712 : : /* We are using vect_prologue here to avoid scaling twice
1713 : : by the inner loop factor. */
1714 : 855144 : record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1715 : : factor, kind, stmt_info, 0, vect_prologue);
1716 : : }
1717 : : }
1718 : :
1719 : : /* Now accumulate cost. */
1720 : 170651 : loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1721 : 170651 : add_stmt_costs (loop_vinfo->scalar_costs,
1722 : : &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1723 : 170651 : loop_vinfo->scalar_costs->finish_cost (nullptr);
1724 : 170651 : }
1725 : :
1726 : : /* Function vect_analyze_loop_form.
1727 : :
1728 : : Verify that certain CFG restrictions hold, including:
1729 : : - the loop has a pre-header
1730 : : - the loop has a single entry
1731 : : - nested loops can have only a single exit.
1732 : : - the loop exit condition is simple enough
1733 : : - the number of iterations can be analyzed, i.e, a countable loop. The
1734 : : niter could be analyzed under some assumptions. */
1735 : :
1736 : : opt_result
1737 : 391639 : vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1738 : : {
1739 : 391639 : DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1740 : :
1741 : 391639 : edge exit_e = vec_init_loop_exit_info (loop);
1742 : 391639 : if (!exit_e)
1743 : 47590 : return opt_result::failure_at (vect_location,
1744 : : "not vectorized:"
1745 : : " could not determine main exit from"
1746 : : " loop with multiple exits.\n");
1747 : 344049 : info->loop_exit = exit_e;
1748 : 344049 : if (dump_enabled_p ())
1749 : 14670 : dump_printf_loc (MSG_NOTE, vect_location,
1750 : : "using as main loop exit: %d -> %d [AUX: %p]\n",
1751 : 14670 : exit_e->src->index, exit_e->dest->index, exit_e->aux);
1752 : :
1753 : : /* Check if we have any control flow that doesn't leave the loop. */
1754 : 344049 : class loop *v_loop = loop->inner ? loop->inner : loop;
1755 : 344049 : basic_block *bbs = get_loop_body (v_loop);
1756 : 1100768 : for (unsigned i = 0; i < v_loop->num_nodes; i++)
1757 : 836377 : if (EDGE_COUNT (bbs[i]->succs) != 1
1758 : 836377 : && (EDGE_COUNT (bbs[i]->succs) != 2
1759 : 507320 : || !loop_exits_from_bb_p (bbs[i]->loop_father, bbs[i])))
1760 : : {
1761 : 79658 : free (bbs);
1762 : 79658 : return opt_result::failure_at (vect_location,
1763 : : "not vectorized:"
1764 : : " unsupported control flow in loop.\n");
1765 : : }
1766 : 264391 : free (bbs);
1767 : :
1768 : : /* Different restrictions apply when we are considering an inner-most loop,
1769 : : vs. an outer (nested) loop.
1770 : : (FORNOW. May want to relax some of these restrictions in the future). */
1771 : :
1772 : 264391 : info->inner_loop_cond = NULL;
1773 : 264391 : if (!loop->inner)
1774 : : {
1775 : : /* Inner-most loop. */
1776 : :
1777 : 232132 : if (empty_block_p (loop->header))
1778 : 3 : return opt_result::failure_at (vect_location,
1779 : : "not vectorized: empty loop.\n");
1780 : : }
1781 : : else
1782 : : {
1783 : 32259 : class loop *innerloop = loop->inner;
1784 : 32259 : edge entryedge;
1785 : :
1786 : : /* Nested loop. We currently require that the loop is doubly-nested,
1787 : : contains a single inner loop with a single exit to the block
1788 : : with the single exit condition in the outer loop.
1789 : : Vectorizable outer-loops look like this:
1790 : :
1791 : : (pre-header)
1792 : : |
1793 : : header <---+
1794 : : | |
1795 : : inner-loop |
1796 : : | |
1797 : : tail ------+
1798 : : |
1799 : : (exit-bb)
1800 : :
1801 : : The inner-loop also has the properties expected of inner-most loops
1802 : : as described above. */
1803 : :
1804 : 32259 : if ((loop->inner)->inner || (loop->inner)->next)
1805 : 3067 : return opt_result::failure_at (vect_location,
1806 : : "not vectorized:"
1807 : : " multiple nested loops.\n");
1808 : :
1809 : 29192 : entryedge = loop_preheader_edge (innerloop);
1810 : 29192 : if (entryedge->src != loop->header
1811 : 14308 : || !single_exit (innerloop)
1812 : 38834 : || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1813 : 20497 : return opt_result::failure_at (vect_location,
1814 : : "not vectorized:"
1815 : : " unsupported outerloop form.\n");
1816 : :
1817 : : /* Analyze the inner-loop. */
1818 : 8695 : vect_loop_form_info inner;
1819 : 8695 : opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1820 : 8695 : if (!res)
1821 : : {
1822 : 1137 : if (dump_enabled_p ())
1823 : 5 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1824 : : "not vectorized: Bad inner loop.\n");
1825 : 1137 : return res;
1826 : : }
1827 : :
1828 : : /* Don't support analyzing niter under assumptions for inner
1829 : : loop. */
1830 : 7558 : if (!integer_onep (inner.assumptions))
1831 : 271 : return opt_result::failure_at (vect_location,
1832 : : "not vectorized: Bad inner loop.\n");
1833 : :
1834 : 7287 : if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1835 : 1083 : return opt_result::failure_at (vect_location,
1836 : : "not vectorized: inner-loop count not"
1837 : : " invariant.\n");
1838 : :
1839 : 6204 : if (dump_enabled_p ())
1840 : 932 : dump_printf_loc (MSG_NOTE, vect_location,
1841 : : "Considering outer-loop vectorization.\n");
1842 : 6204 : info->inner_loop_cond = inner.conds[0];
1843 : 8695 : }
1844 : :
1845 : 238333 : if (EDGE_COUNT (loop->header->preds) != 2)
1846 : 0 : return opt_result::failure_at (vect_location,
1847 : : "not vectorized:"
1848 : : " too many incoming edges.\n");
1849 : :
1850 : : /* We assume that the latch is empty. */
1851 : 238333 : if (!empty_block_p (loop->latch)
1852 : 238333 : || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1853 : 19611 : return opt_result::failure_at (vect_location,
1854 : : "not vectorized: latch block not empty.\n");
1855 : :
1856 : : /* Make sure there is no abnormal exit. */
1857 : 218722 : auto_vec<edge> exits = get_loop_exit_edges (loop);
1858 : 960204 : for (edge e : exits)
1859 : : {
1860 : 304082 : if (e->flags & EDGE_ABNORMAL)
1861 : 44 : return opt_result::failure_at (vect_location,
1862 : : "not vectorized:"
1863 : : " abnormal loop exit edge.\n");
1864 : : }
1865 : :
1866 : 218678 : info->conds
1867 : 218678 : = vect_get_loop_niters (loop, exit_e, &info->assumptions,
1868 : : &info->number_of_iterations,
1869 : 218678 : &info->number_of_iterationsm1);
1870 : 218678 : if (info->conds.is_empty ())
1871 : 34 : return opt_result::failure_at
1872 : 34 : (vect_location,
1873 : : "not vectorized: complicated exit condition.\n");
1874 : :
1875 : : /* Determine what the primary and alternate exit conds are. */
1876 : 513748 : for (unsigned i = 0; i < info->conds.length (); i++)
1877 : : {
1878 : 295104 : gcond *cond = info->conds[i];
1879 : 295104 : if (exit_e->src == gimple_bb (cond))
1880 : 218644 : std::swap (info->conds[0], info->conds[i]);
1881 : : }
1882 : :
1883 : 218644 : if (integer_zerop (info->assumptions)
1884 : 218644 : || !info->number_of_iterations
1885 : 437288 : || chrec_contains_undetermined (info->number_of_iterations))
1886 : 32907 : return opt_result::failure_at
1887 : 32907 : (info->conds[0],
1888 : : "not vectorized: number of iterations cannot be computed.\n");
1889 : :
1890 : 185737 : if (integer_zerop (info->number_of_iterations))
1891 : 16 : return opt_result::failure_at
1892 : 16 : (info->conds[0],
1893 : : "not vectorized: number of iterations = 0.\n");
1894 : :
1895 : 185721 : if (!(tree_fits_shwi_p (info->number_of_iterations)
1896 : 98173 : && tree_to_shwi (info->number_of_iterations) > 0))
1897 : : {
1898 : 87548 : if (dump_enabled_p ())
1899 : : {
1900 : 2151 : dump_printf_loc (MSG_NOTE, vect_location,
1901 : : "Symbolic number of iterations is ");
1902 : 2151 : dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1903 : 2151 : dump_printf (MSG_NOTE, "\n");
1904 : : }
1905 : : }
1906 : :
1907 : 185721 : return opt_result::success ();
1908 : 218722 : }
1909 : :
1910 : : /* Create a loop_vec_info for LOOP with SHARED and the
1911 : : vect_analyze_loop_form result. */
1912 : :
1913 : : loop_vec_info
1914 : 368786 : vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1915 : : const vect_loop_form_info *info,
1916 : : loop_vec_info main_loop_info)
1917 : : {
1918 : 368786 : loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1919 : 368786 : LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1920 : 368786 : LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1921 : 368786 : LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1922 : 368786 : LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1923 : : /* Also record the assumptions for versioning. */
1924 : 368786 : if (!integer_onep (info->assumptions) && !main_loop_info)
1925 : 15670 : LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1926 : :
1927 : 1637628 : for (gcond *cond : info->conds)
1928 : : {
1929 : 531270 : stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (cond);
1930 : 531270 : STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1931 : : /* Mark the statement as a condition. */
1932 : 531270 : STMT_VINFO_DEF_TYPE (loop_cond_info) = vect_condition_def;
1933 : : }
1934 : :
1935 : 1062540 : for (unsigned i = 1; i < info->conds.length (); i ++)
1936 : 162484 : LOOP_VINFO_LOOP_CONDS (loop_vinfo).safe_push (info->conds[i]);
1937 : 368786 : LOOP_VINFO_LOOP_IV_COND (loop_vinfo) = info->conds[0];
1938 : :
1939 : 368786 : LOOP_VINFO_IV_EXIT (loop_vinfo) = info->loop_exit;
1940 : :
1941 : : /* Check to see if we're vectorizing multiple exits. */
1942 : 368786 : LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
1943 : 368786 : = !LOOP_VINFO_LOOP_CONDS (loop_vinfo).is_empty ();
1944 : :
1945 : 368786 : if (info->inner_loop_cond)
1946 : : {
1947 : 6404 : stmt_vec_info inner_loop_cond_info
1948 : 6404 : = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1949 : 6404 : STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1950 : : /* If we have an estimate on the number of iterations of the inner
1951 : : loop use that to limit the scale for costing, otherwise use
1952 : : --param vect-inner-loop-cost-factor literally. */
1953 : 6404 : widest_int nit;
1954 : 6404 : if (estimated_stmt_executions (loop->inner, &nit))
1955 : 5395 : LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1956 : 5395 : = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1957 : 6404 : }
1958 : :
1959 : 368786 : return loop_vinfo;
1960 : : }
1961 : :
1962 : :
1963 : :
1964 : : /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1965 : : statements update the vectorization factor. */
1966 : :
1967 : : static void
1968 : 9216 : vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1969 : : {
1970 : 9216 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1971 : 9216 : basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1972 : 9216 : int nbbs = loop->num_nodes;
1973 : 9216 : poly_uint64 vectorization_factor;
1974 : 9216 : int i;
1975 : :
1976 : 9216 : DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1977 : :
1978 : 9216 : vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1979 : 9216 : gcc_assert (known_ne (vectorization_factor, 0U));
1980 : :
1981 : : /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1982 : : vectorization factor of the loop is the unrolling factor required by
1983 : : the SLP instances. If that unrolling factor is 1, we say, that we
1984 : : perform pure SLP on loop - cross iteration parallelism is not
1985 : : exploited. */
1986 : : bool only_slp_in_loop = true;
1987 : 27813 : for (i = 0; i < nbbs; i++)
1988 : : {
1989 : 18597 : basic_block bb = bbs[i];
1990 : 47180 : for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1991 : 28583 : gsi_next (&si))
1992 : : {
1993 : 28583 : stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1994 : 28583 : if (!stmt_info)
1995 : 0 : continue;
1996 : 28583 : if ((STMT_VINFO_RELEVANT_P (stmt_info)
1997 : 25225 : || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1998 : 3361 : && !PURE_SLP_STMT (stmt_info))
1999 : : /* STMT needs both SLP and loop-based vectorization. */
2000 : 28583 : only_slp_in_loop = false;
2001 : : }
2002 : 269102 : for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2003 : 231908 : gsi_next (&si))
2004 : : {
2005 : 231908 : if (is_gimple_debug (gsi_stmt (si)))
2006 : 34728 : continue;
2007 : 197180 : stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2008 : 197180 : stmt_info = vect_stmt_to_vectorize (stmt_info);
2009 : 197180 : if ((STMT_VINFO_RELEVANT_P (stmt_info)
2010 : 90356 : || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2011 : 106827 : && !PURE_SLP_STMT (stmt_info))
2012 : : /* STMT needs both SLP and loop-based vectorization. */
2013 : 231908 : only_slp_in_loop = false;
2014 : : }
2015 : : }
2016 : :
2017 : 9216 : if (only_slp_in_loop)
2018 : : {
2019 : 7906 : if (dump_enabled_p ())
2020 : 1669 : dump_printf_loc (MSG_NOTE, vect_location,
2021 : : "Loop contains only SLP stmts\n");
2022 : 7906 : vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
2023 : : }
2024 : : else
2025 : : {
2026 : 1310 : if (dump_enabled_p ())
2027 : 393 : dump_printf_loc (MSG_NOTE, vect_location,
2028 : : "Loop contains SLP and non-SLP stmts\n");
2029 : : /* Both the vectorization factor and unroll factor have the form
2030 : : GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
2031 : : so they must have a common multiple. */
2032 : 1310 : vectorization_factor
2033 : 1310 : = force_common_multiple (vectorization_factor,
2034 : 1310 : LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
2035 : : }
2036 : :
2037 : 9216 : LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
2038 : 9216 : if (dump_enabled_p ())
2039 : : {
2040 : 2062 : dump_printf_loc (MSG_NOTE, vect_location,
2041 : : "Updating vectorization factor to ");
2042 : 2062 : dump_dec (MSG_NOTE, vectorization_factor);
2043 : 2062 : dump_printf (MSG_NOTE, ".\n");
2044 : : }
2045 : 9216 : }
2046 : :
2047 : : /* Return true if STMT_INFO describes a double reduction phi and if
2048 : : the other phi in the reduction is also relevant for vectorization.
2049 : : This rejects cases such as:
2050 : :
2051 : : outer1:
2052 : : x_1 = PHI <x_3(outer2), ...>;
2053 : : ...
2054 : :
2055 : : inner:
2056 : : x_2 = ...;
2057 : : ...
2058 : :
2059 : : outer2:
2060 : : x_3 = PHI <x_2(inner)>;
2061 : :
2062 : : if nothing in x_2 or elsewhere makes x_1 relevant. */
2063 : :
2064 : : static bool
2065 : 121 : vect_active_double_reduction_p (stmt_vec_info stmt_info)
2066 : : {
2067 : 121 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2068 : : return false;
2069 : :
2070 : 0 : return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
2071 : : }
2072 : :
2073 : : /* Function vect_analyze_loop_operations.
2074 : :
2075 : : Scan the loop stmts and make sure they are all vectorizable. */
2076 : :
2077 : : static opt_result
2078 : 160640 : vect_analyze_loop_operations (loop_vec_info loop_vinfo)
2079 : : {
2080 : 160640 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2081 : 160640 : basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2082 : 160640 : int nbbs = loop->num_nodes;
2083 : 160640 : int i;
2084 : 160640 : stmt_vec_info stmt_info;
2085 : 160640 : bool need_to_vectorize = false;
2086 : 160640 : bool ok;
2087 : :
2088 : 160640 : DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
2089 : :
2090 : 160640 : auto_vec<stmt_info_for_cost> cost_vec;
2091 : :
2092 : 374154 : for (i = 0; i < nbbs; i++)
2093 : : {
2094 : 269586 : basic_block bb = bbs[i];
2095 : :
2096 : 683118 : for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
2097 : 413532 : gsi_next (&si))
2098 : : {
2099 : 426448 : gphi *phi = si.phi ();
2100 : 426448 : ok = true;
2101 : :
2102 : 426448 : stmt_info = loop_vinfo->lookup_stmt (phi);
2103 : 426448 : if (dump_enabled_p ())
2104 : 47732 : dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
2105 : : (gimple *) phi);
2106 : 852896 : if (virtual_operand_p (gimple_phi_result (phi)))
2107 : 99095 : continue;
2108 : :
2109 : : /* Inner-loop loop-closed exit phi in outer-loop vectorization
2110 : : (i.e., a phi in the tail of the outer-loop). */
2111 : 327353 : if (! is_loop_header_bb_p (bb))
2112 : : {
2113 : : /* FORNOW: we currently don't support the case that these phis
2114 : : are not used in the outerloop (unless it is double reduction,
2115 : : i.e., this phi is vect_reduction_def), cause this case
2116 : : requires to actually do something here. */
2117 : 592 : if (STMT_VINFO_LIVE_P (stmt_info)
2118 : 686 : && !vect_active_double_reduction_p (stmt_info))
2119 : 27 : return opt_result::failure_at (phi,
2120 : : "Unsupported loop-closed phi"
2121 : : " in outer-loop.\n");
2122 : :
2123 : : /* If PHI is used in the outer loop, we check that its operand
2124 : : is defined in the inner loop. */
2125 : 565 : if (STMT_VINFO_RELEVANT_P (stmt_info))
2126 : : {
2127 : 560 : tree phi_op;
2128 : :
2129 : 560 : if (gimple_phi_num_args (phi) != 1)
2130 : 0 : return opt_result::failure_at (phi, "unsupported phi");
2131 : :
2132 : 560 : phi_op = PHI_ARG_DEF (phi, 0);
2133 : 560 : stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
2134 : 560 : if (!op_def_info)
2135 : 0 : return opt_result::failure_at (phi, "unsupported phi\n");
2136 : :
2137 : 560 : if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
2138 : 560 : && (STMT_VINFO_RELEVANT (op_def_info)
2139 : : != vect_used_in_outer_by_reduction))
2140 : 106 : return opt_result::failure_at (phi, "unsupported phi\n");
2141 : :
2142 : 454 : if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2143 : 93 : || (STMT_VINFO_DEF_TYPE (stmt_info)
2144 : : == vect_double_reduction_def))
2145 : 547 : && !vectorizable_lc_phi (loop_vinfo,
2146 : : stmt_info, NULL, NULL))
2147 : 0 : return opt_result::failure_at (phi, "unsupported phi\n");
2148 : : }
2149 : :
2150 : 459 : continue;
2151 : 459 : }
2152 : :
2153 : 326761 : gcc_assert (stmt_info);
2154 : :
2155 : 326761 : if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
2156 : 282496 : || STMT_VINFO_LIVE_P (stmt_info))
2157 : 77076 : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
2158 : 486 : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
2159 : : /* A scalar-dependence cycle that we don't support. */
2160 : 334 : return opt_result::failure_at (phi,
2161 : : "not vectorized:"
2162 : : " scalar dependence cycle.\n");
2163 : :
2164 : 326427 : if (STMT_VINFO_RELEVANT_P (stmt_info))
2165 : : {
2166 : 140751 : need_to_vectorize = true;
2167 : 140751 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
2168 : 84164 : && ! PURE_SLP_STMT (stmt_info))
2169 : 83691 : ok = vectorizable_induction (loop_vinfo,
2170 : : stmt_info, NULL, NULL,
2171 : : &cost_vec);
2172 : 57060 : else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2173 : : || (STMT_VINFO_DEF_TYPE (stmt_info)
2174 : : == vect_double_reduction_def)
2175 : 57060 : || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
2176 : 56401 : && ! PURE_SLP_STMT (stmt_info))
2177 : 54287 : ok = vectorizable_reduction (loop_vinfo,
2178 : : stmt_info, NULL, NULL, &cost_vec);
2179 : 2773 : else if ((STMT_VINFO_DEF_TYPE (stmt_info)
2180 : : == vect_first_order_recurrence)
2181 : 186 : && ! PURE_SLP_STMT (stmt_info))
2182 : 162 : ok = vectorizable_recurr (loop_vinfo, stmt_info, NULL, NULL,
2183 : : &cost_vec);
2184 : : }
2185 : :
2186 : : /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
2187 : 138140 : if (ok
2188 : 313978 : && STMT_VINFO_LIVE_P (stmt_info)
2189 : 41614 : && !PURE_SLP_STMT (stmt_info))
2190 : 41611 : ok = vectorizable_live_operation (loop_vinfo, stmt_info, NULL, NULL,
2191 : : -1, false, &cost_vec);
2192 : :
2193 : 313978 : if (!ok)
2194 : 12449 : return opt_result::failure_at (phi,
2195 : : "not vectorized: relevant phi not "
2196 : : "supported: %G",
2197 : : static_cast <gimple *> (phi));
2198 : : }
2199 : :
2200 : 2065238 : for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2201 : 1551898 : gsi_next (&si))
2202 : : {
2203 : 1595054 : gimple *stmt = gsi_stmt (si);
2204 : 1595054 : if (!gimple_clobber_p (stmt)
2205 : 1595054 : && !is_gimple_debug (stmt))
2206 : : {
2207 : 1279369 : opt_result res
2208 : 1279369 : = vect_analyze_stmt (loop_vinfo,
2209 : : loop_vinfo->lookup_stmt (stmt),
2210 : : &need_to_vectorize,
2211 : : NULL, NULL, &cost_vec);
2212 : 1279369 : if (!res)
2213 : 43156 : return res;
2214 : : }
2215 : : }
2216 : : } /* bbs */
2217 : :
2218 : 104568 : add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
2219 : :
2220 : : /* All operations in the loop are either irrelevant (deal with loop
2221 : : control, or dead), or only used outside the loop and can be moved
2222 : : out of the loop (e.g. invariants, inductions). The loop can be
2223 : : optimized away by scalar optimizations. We're better off not
2224 : : touching this loop. */
2225 : 104568 : if (!need_to_vectorize)
2226 : : {
2227 : 49 : if (dump_enabled_p ())
2228 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
2229 : : "All the computation can be taken out of the loop.\n");
2230 : 49 : return opt_result::failure_at
2231 : 49 : (vect_location,
2232 : : "not vectorized: redundant loop. no profit to vectorize.\n");
2233 : : }
2234 : :
2235 : 104519 : return opt_result::success ();
2236 : 160640 : }
2237 : :
2238 : : /* Return true if we know that the iteration count is smaller than the
2239 : : vectorization factor. Return false if it isn't, or if we can't be sure
2240 : : either way. */
2241 : :
2242 : : static bool
2243 : 93392 : vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
2244 : : {
2245 : 93392 : unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2246 : :
2247 : 93392 : HOST_WIDE_INT max_niter;
2248 : 93392 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2249 : 46453 : max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
2250 : : else
2251 : 46939 : max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2252 : :
2253 : 93392 : if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
2254 : 7483 : return true;
2255 : :
2256 : : return false;
2257 : : }
2258 : :
2259 : : /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
2260 : : is worthwhile to vectorize. Return 1 if definitely yes, 0 if
2261 : : definitely no, or -1 if it's worth retrying. */
2262 : :
2263 : : static int
2264 : 93396 : vect_analyze_loop_costing (loop_vec_info loop_vinfo,
2265 : : unsigned *suggested_unroll_factor)
2266 : : {
2267 : 93396 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2268 : 93396 : unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2269 : :
2270 : : /* Only loops that can handle partially-populated vectors can have iteration
2271 : : counts less than the vectorization factor. */
2272 : 93396 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2273 : 93396 : && vect_known_niters_smaller_than_vf (loop_vinfo))
2274 : : {
2275 : 7478 : if (dump_enabled_p ())
2276 : 187 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2277 : : "not vectorized: iteration count smaller than "
2278 : : "vectorization factor.\n");
2279 : 7478 : return 0;
2280 : : }
2281 : :
2282 : : /* If we know the number of iterations we can do better, for the
2283 : : epilogue we can also decide whether the main loop leaves us
2284 : : with enough iterations, prefering a smaller vector epilog then
2285 : : also possibly used for the case we skip the vector loop. */
2286 : 85918 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2287 : : {
2288 : 39262 : widest_int scalar_niters
2289 : 39262 : = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
2290 : 39262 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2291 : : {
2292 : 2240 : loop_vec_info orig_loop_vinfo
2293 : : = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2294 : 2240 : unsigned lowest_vf
2295 : 2240 : = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
2296 : 2240 : int prolog_peeling = 0;
2297 : 2240 : if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2298 : 2240 : prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo);
2299 : 2240 : if (prolog_peeling >= 0
2300 : 2240 : && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
2301 : : lowest_vf))
2302 : : {
2303 : 4470 : unsigned gap
2304 : 2235 : = LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo) ? 1 : 0;
2305 : 4470 : scalar_niters = ((scalar_niters - gap - prolog_peeling)
2306 : 4470 : % lowest_vf + gap);
2307 : : }
2308 : : }
2309 : : /* Reject vectorizing for a single scalar iteration, even if
2310 : : we could in principle implement that using partial vectors. */
2311 : 39262 : unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2312 : 39262 : if (scalar_niters <= peeling_gap + 1)
2313 : : {
2314 : 721 : if (dump_enabled_p ())
2315 : 155 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2316 : : "not vectorized: loop only has a single "
2317 : : "scalar iteration.\n");
2318 : 721 : return 0;
2319 : : }
2320 : :
2321 : 38541 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2322 : : {
2323 : : /* Check that the loop processes at least one full vector. */
2324 : 38535 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2325 : 38535 : if (known_lt (scalar_niters, vf))
2326 : : {
2327 : 372 : if (dump_enabled_p ())
2328 : 313 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2329 : : "loop does not have enough iterations "
2330 : : "to support vectorization.\n");
2331 : 445 : return 0;
2332 : : }
2333 : :
2334 : : /* If we need to peel an extra epilogue iteration to handle data
2335 : : accesses with gaps, check that there are enough scalar iterations
2336 : : available.
2337 : :
2338 : : The check above is redundant with this one when peeling for gaps,
2339 : : but the distinction is useful for diagnostics. */
2340 : 38163 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2341 : 38807 : && known_le (scalar_niters, vf))
2342 : : {
2343 : 73 : if (dump_enabled_p ())
2344 : 17 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2345 : : "loop does not have enough iterations "
2346 : : "to support peeling for gaps.\n");
2347 : 73 : return 0;
2348 : : }
2349 : : }
2350 : 39262 : }
2351 : :
2352 : : /* If using the "very cheap" model. reject cases in which we'd keep
2353 : : a copy of the scalar code (even if we might be able to vectorize it). */
2354 : 84752 : if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2355 : 84752 : && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2356 : 46200 : || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2357 : 44799 : || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
2358 : : {
2359 : 31531 : if (dump_enabled_p ())
2360 : 256 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2361 : : "some scalar iterations would need to be peeled\n");
2362 : 31531 : return 0;
2363 : : }
2364 : :
2365 : 53221 : int min_profitable_iters, min_profitable_estimate;
2366 : 53221 : vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2367 : : &min_profitable_estimate,
2368 : : suggested_unroll_factor);
2369 : :
2370 : 53221 : if (min_profitable_iters < 0)
2371 : : {
2372 : 5168 : if (dump_enabled_p ())
2373 : 7 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2374 : : "not vectorized: vectorization not profitable.\n");
2375 : 5168 : if (dump_enabled_p ())
2376 : 7 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2377 : : "not vectorized: vector version will never be "
2378 : : "profitable.\n");
2379 : 5168 : return -1;
2380 : : }
2381 : :
2382 : 48053 : int min_scalar_loop_bound = (param_min_vect_loop_bound
2383 : 48053 : * assumed_vf);
2384 : :
2385 : : /* Use the cost model only if it is more conservative than user specified
2386 : : threshold. */
2387 : 48053 : unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2388 : : min_profitable_iters);
2389 : :
2390 : 48053 : LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2391 : :
2392 : 29676 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2393 : 77729 : && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2394 : : {
2395 : 88 : if (dump_enabled_p ())
2396 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2397 : : "not vectorized: vectorization not profitable.\n");
2398 : 88 : if (dump_enabled_p ())
2399 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
2400 : : "not vectorized: iteration count smaller than user "
2401 : : "specified loop bound parameter or minimum profitable "
2402 : : "iterations (whichever is more conservative).\n");
2403 : 88 : return 0;
2404 : : }
2405 : :
2406 : : /* The static profitablity threshold min_profitable_estimate includes
2407 : : the cost of having to check at runtime whether the scalar loop
2408 : : should be used instead. If it turns out that we don't need or want
2409 : : such a check, the threshold we should use for the static estimate
2410 : : is simply the point at which the vector loop becomes more profitable
2411 : : than the scalar loop. */
2412 : 47965 : if (min_profitable_estimate > min_profitable_iters
2413 : 5961 : && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
2414 : 5877 : && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2415 : 290 : && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2416 : 48255 : && !vect_apply_runtime_profitability_check_p (loop_vinfo))
2417 : : {
2418 : 1 : if (dump_enabled_p ())
2419 : 0 : dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
2420 : : " choice between the scalar and vector loops\n");
2421 : 1 : min_profitable_estimate = min_profitable_iters;
2422 : : }
2423 : :
2424 : : /* If the vector loop needs multiple iterations to be beneficial then
2425 : : things are probably too close to call, and the conservative thing
2426 : : would be to stick with the scalar code. */
2427 : 47965 : if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2428 : 47965 : && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
2429 : : {
2430 : 777 : if (dump_enabled_p ())
2431 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2432 : : "one iteration of the vector loop would be"
2433 : : " more expensive than the equivalent number of"
2434 : : " iterations of the scalar loop\n");
2435 : 777 : return 0;
2436 : : }
2437 : :
2438 : 47188 : HOST_WIDE_INT estimated_niter;
2439 : :
2440 : : /* If we are vectorizing an epilogue then we know the maximum number of
2441 : : scalar iterations it will cover is at least one lower than the
2442 : : vectorization factor of the main loop. */
2443 : 47188 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2444 : 7897 : estimated_niter
2445 : 7897 : = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
2446 : : else
2447 : : {
2448 : 39291 : estimated_niter = estimated_stmt_executions_int (loop);
2449 : 39291 : if (estimated_niter == -1)
2450 : 11786 : estimated_niter = likely_max_stmt_executions_int (loop);
2451 : : }
2452 : 19683 : if (estimated_niter != -1
2453 : 46078 : && ((unsigned HOST_WIDE_INT) estimated_niter
2454 : 46078 : < MAX (th, (unsigned) min_profitable_estimate)))
2455 : : {
2456 : 1829 : if (dump_enabled_p ())
2457 : 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2458 : : "not vectorized: estimated iteration count too "
2459 : : "small.\n");
2460 : 1829 : if (dump_enabled_p ())
2461 : 8 : dump_printf_loc (MSG_NOTE, vect_location,
2462 : : "not vectorized: estimated iteration count smaller "
2463 : : "than specified loop bound parameter or minimum "
2464 : : "profitable iterations (whichever is more "
2465 : : "conservative).\n");
2466 : 1829 : return -1;
2467 : : }
2468 : :
2469 : : return 1;
2470 : : }
2471 : :
2472 : : static opt_result
2473 : 185138 : vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
2474 : : vec<data_reference_p> *datarefs,
2475 : : unsigned int *n_stmts)
2476 : : {
2477 : 185138 : *n_stmts = 0;
2478 : 550191 : for (unsigned i = 0; i < loop->num_nodes; i++)
2479 : 807336 : for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2480 : 2870616 : !gsi_end_p (gsi); gsi_next (&gsi))
2481 : : {
2482 : 2505563 : gimple *stmt = gsi_stmt (gsi);
2483 : 2505563 : if (is_gimple_debug (stmt))
2484 : 792093 : continue;
2485 : 1713630 : ++(*n_stmts);
2486 : 1713630 : opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
2487 : : NULL, 0);
2488 : 1713630 : if (!res)
2489 : : {
2490 : 38775 : if (is_gimple_call (stmt) && loop->safelen)
2491 : : {
2492 : 402 : tree fndecl = gimple_call_fndecl (stmt), op;
2493 : 402 : if (fndecl == NULL_TREE
2494 : 402 : && gimple_call_internal_p (stmt, IFN_MASK_CALL))
2495 : : {
2496 : 0 : fndecl = gimple_call_arg (stmt, 0);
2497 : 0 : gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
2498 : 0 : fndecl = TREE_OPERAND (fndecl, 0);
2499 : 0 : gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
2500 : : }
2501 : 402 : if (fndecl != NULL_TREE)
2502 : : {
2503 : 368 : cgraph_node *node = cgraph_node::get (fndecl);
2504 : 368 : if (node != NULL && node->simd_clones != NULL)
2505 : : {
2506 : 161 : unsigned int j, n = gimple_call_num_args (stmt);
2507 : 655 : for (j = 0; j < n; j++)
2508 : : {
2509 : 334 : op = gimple_call_arg (stmt, j);
2510 : 334 : if (DECL_P (op)
2511 : 334 : || (REFERENCE_CLASS_P (op)
2512 : 0 : && get_base_address (op)))
2513 : : break;
2514 : : }
2515 : 161 : op = gimple_call_lhs (stmt);
2516 : : /* Ignore #pragma omp declare simd functions
2517 : : if they don't have data references in the
2518 : : call stmt itself. */
2519 : 321 : if (j == n
2520 : 161 : && !(op
2521 : 150 : && (DECL_P (op)
2522 : 150 : || (REFERENCE_CLASS_P (op)
2523 : 0 : && get_base_address (op)))))
2524 : 160 : continue;
2525 : : }
2526 : : }
2527 : : }
2528 : 38615 : return res;
2529 : : }
2530 : : /* If dependence analysis will give up due to the limit on the
2531 : : number of datarefs stop here and fail fatally. */
2532 : 1674855 : if (datarefs->length ()
2533 : 1674855 : > (unsigned)param_loop_max_datarefs_for_datadeps)
2534 : 0 : return opt_result::failure_at (stmt, "exceeded param "
2535 : : "loop-max-datarefs-for-datadeps\n");
2536 : : }
2537 : 146523 : return opt_result::success ();
2538 : : }
2539 : :
2540 : : /* Look for SLP-only access groups and turn each individual access into its own
2541 : : group. */
2542 : : static void
2543 : 160640 : vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2544 : : {
2545 : 160640 : unsigned int i;
2546 : 160640 : struct data_reference *dr;
2547 : :
2548 : 160640 : DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2549 : :
2550 : 160640 : vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2551 : 703828 : FOR_EACH_VEC_ELT (datarefs, i, dr)
2552 : : {
2553 : 390136 : gcc_assert (DR_REF (dr));
2554 : 390136 : stmt_vec_info stmt_info
2555 : 390136 : = vect_stmt_to_vectorize (loop_vinfo->lookup_stmt (DR_STMT (dr)));
2556 : :
2557 : : /* Check if the load is a part of an interleaving chain. */
2558 : 390136 : if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2559 : : {
2560 : 120811 : stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2561 : 120811 : dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2562 : 120811 : unsigned int group_size = DR_GROUP_SIZE (first_element);
2563 : :
2564 : : /* Check if SLP-only groups. */
2565 : 120811 : if (!STMT_SLP_TYPE (stmt_info)
2566 : 60093 : && STMT_VINFO_SLP_VECT_ONLY (first_element))
2567 : : {
2568 : : /* Dissolve the group. */
2569 : 29 : STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2570 : :
2571 : 29 : stmt_vec_info vinfo = first_element;
2572 : 105 : while (vinfo)
2573 : : {
2574 : 76 : stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2575 : 76 : DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2576 : 76 : DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2577 : 76 : DR_GROUP_SIZE (vinfo) = 1;
2578 : 76 : if (STMT_VINFO_STRIDED_P (first_element)
2579 : : /* We cannot handle stores with gaps. */
2580 : 67 : || DR_IS_WRITE (dr_info->dr))
2581 : : {
2582 : 12 : STMT_VINFO_STRIDED_P (vinfo) = true;
2583 : 12 : DR_GROUP_GAP (vinfo) = 0;
2584 : : }
2585 : : else
2586 : 64 : DR_GROUP_GAP (vinfo) = group_size - 1;
2587 : : /* Duplicate and adjust alignment info, it needs to
2588 : : be present on each group leader, see dr_misalignment. */
2589 : 76 : if (vinfo != first_element)
2590 : : {
2591 : 47 : dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2592 : 47 : dr_info2->target_alignment = dr_info->target_alignment;
2593 : 47 : int misalignment = dr_info->misalignment;
2594 : 47 : if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2595 : : {
2596 : 12 : HOST_WIDE_INT diff
2597 : 12 : = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2598 : 12 : - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2599 : 12 : unsigned HOST_WIDE_INT align_c
2600 : 12 : = dr_info->target_alignment.to_constant ();
2601 : 12 : misalignment = (misalignment + diff) % align_c;
2602 : : }
2603 : 47 : dr_info2->misalignment = misalignment;
2604 : : }
2605 : : vinfo = next;
2606 : : }
2607 : : }
2608 : : }
2609 : : }
2610 : 160640 : }
2611 : :
2612 : : /* Determine if operating on full vectors for LOOP_VINFO might leave
2613 : : some scalar iterations still to do. If so, decide how we should
2614 : : handle those scalar iterations. The possibilities are:
2615 : :
2616 : : (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2617 : : In this case:
2618 : :
2619 : : LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2620 : : LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2621 : : LOOP_VINFO_PEELING_FOR_NITER == false
2622 : :
2623 : : (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2624 : : to handle the remaining scalar iterations. In this case:
2625 : :
2626 : : LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2627 : : LOOP_VINFO_PEELING_FOR_NITER == true
2628 : :
2629 : : There are two choices:
2630 : :
2631 : : (2a) Consider vectorizing the epilogue loop at the same VF as the
2632 : : main loop, but using partial vectors instead of full vectors.
2633 : : In this case:
2634 : :
2635 : : LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2636 : :
2637 : : (2b) Consider vectorizing the epilogue loop at lower VFs only.
2638 : : In this case:
2639 : :
2640 : : LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2641 : : */
2642 : :
2643 : : opt_result
2644 : 110798 : vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo)
2645 : : {
2646 : : /* Determine whether there would be any scalar iterations left over. */
2647 : 110798 : bool need_peeling_or_partial_vectors_p
2648 : 110798 : = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2649 : :
2650 : : /* Decide whether to vectorize the loop with partial vectors. */
2651 : 110798 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2652 : 110798 : LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2653 : 110798 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2654 : 10 : && need_peeling_or_partial_vectors_p)
2655 : : {
2656 : : /* For partial-vector-usage=1, try to push the handling of partial
2657 : : vectors to the epilogue, with the main loop continuing to operate
2658 : : on full vectors.
2659 : :
2660 : : If we are unrolling we also do not want to use partial vectors. This
2661 : : is to avoid the overhead of generating multiple masks and also to
2662 : : avoid having to execute entire iterations of FALSE masked instructions
2663 : : when dealing with one or less full iterations.
2664 : :
2665 : : ??? We could then end up failing to use partial vectors if we
2666 : : decide to peel iterations into a prologue, and if the main loop
2667 : : then ends up processing fewer than VF iterations. */
2668 : 9 : if ((param_vect_partial_vector_usage == 1
2669 : 4 : || loop_vinfo->suggested_unroll_factor > 1)
2670 : 5 : && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2671 : 14 : && !vect_known_niters_smaller_than_vf (loop_vinfo))
2672 : 0 : LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2673 : : else
2674 : 9 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2675 : : }
2676 : :
2677 : 110798 : if (dump_enabled_p ())
2678 : 13905 : dump_printf_loc (MSG_NOTE, vect_location,
2679 : : "operating on %s vectors%s.\n",
2680 : 13905 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2681 : : ? "partial" : "full",
2682 : 13905 : LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2683 : : ? " for epilogue loop" : "");
2684 : :
2685 : 110798 : LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2686 : 221596 : = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2687 : 110798 : && need_peeling_or_partial_vectors_p);
2688 : :
2689 : : /* We set LOOP_VINFO_USING_SELECT_VL_P as true before loop vectorization
2690 : : analysis that we don't know whether the loop is vectorized by partial
2691 : : vectors (More details see tree-vect-loop-manip.cc).
2692 : :
2693 : : However, SELECT_VL vectorizaton style should only applied on partial
2694 : : vectorization since SELECT_VL is the GIMPLE IR that calculates the
2695 : : number of elements to be process for each iteration.
2696 : :
2697 : : After loop vectorization analysis, Clear LOOP_VINFO_USING_SELECT_VL_P
2698 : : if it is not partial vectorized loop. */
2699 : 110798 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2700 : 110789 : LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
2701 : :
2702 : 110798 : return opt_result::success ();
2703 : : }
2704 : :
2705 : : /* Function vect_analyze_loop_2.
2706 : :
2707 : : Apply a set of analyses on LOOP specified by LOOP_VINFO, the different
2708 : : analyses will record information in some members of LOOP_VINFO. FATAL
2709 : : indicates if some analysis meets fatal error. If one non-NULL pointer
2710 : : SUGGESTED_UNROLL_FACTOR is provided, it's intent to be filled with one
2711 : : worked out suggested unroll factor, while one NULL pointer shows it's
2712 : : going to apply the suggested unroll factor. SLP_DONE_FOR_SUGGESTED_UF
2713 : : is to hold the slp decision when the suggested unroll factor is worked
2714 : : out. */
2715 : : static opt_result
2716 : 368018 : vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2717 : : unsigned *suggested_unroll_factor,
2718 : : bool& slp_done_for_suggested_uf)
2719 : : {
2720 : 368018 : opt_result ok = opt_result::success ();
2721 : 368018 : int res;
2722 : 368018 : unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2723 : 368018 : poly_uint64 min_vf = 2;
2724 : 368018 : loop_vec_info orig_loop_vinfo = NULL;
2725 : :
2726 : : /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2727 : : loop_vec_info of the first vectorized loop. */
2728 : 368018 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2729 : 27042 : orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2730 : : else
2731 : : orig_loop_vinfo = loop_vinfo;
2732 : 27042 : gcc_assert (orig_loop_vinfo);
2733 : :
2734 : : /* The first group of checks is independent of the vector size. */
2735 : 368018 : fatal = true;
2736 : :
2737 : 368018 : if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2738 : 368018 : && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2739 : 5 : return opt_result::failure_at (vect_location,
2740 : : "not vectorized: simd if(0)\n");
2741 : :
2742 : : /* Find all data references in the loop (which correspond to vdefs/vuses)
2743 : : and analyze their evolution in the loop. */
2744 : :
2745 : 368013 : loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2746 : :
2747 : : /* Gather the data references and count stmts in the loop. */
2748 : 368013 : if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2749 : : {
2750 : 185138 : opt_result res
2751 : 185138 : = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2752 : : &LOOP_VINFO_DATAREFS (loop_vinfo),
2753 : : &LOOP_VINFO_N_STMTS (loop_vinfo));
2754 : 185138 : if (!res)
2755 : : {
2756 : 38615 : if (dump_enabled_p ())
2757 : 1454 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2758 : : "not vectorized: loop contains function "
2759 : : "calls or data references that cannot "
2760 : : "be analyzed\n");
2761 : 38615 : return res;
2762 : : }
2763 : 146523 : loop_vinfo->shared->save_datarefs ();
2764 : : }
2765 : : else
2766 : 182875 : loop_vinfo->shared->check_datarefs ();
2767 : :
2768 : : /* Analyze the data references and also adjust the minimal
2769 : : vectorization factor according to the loads and stores. */
2770 : :
2771 : 329398 : ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2772 : 329398 : if (!ok)
2773 : : {
2774 : 44751 : if (dump_enabled_p ())
2775 : 1006 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2776 : : "bad data references.\n");
2777 : 44751 : return ok;
2778 : : }
2779 : :
2780 : : /* Check if we are applying unroll factor now. */
2781 : 284647 : bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2782 : 284647 : gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2783 : :
2784 : : /* If the slp decision is false when suggested unroll factor is worked
2785 : : out, and we are applying suggested unroll factor, we can simply skip
2786 : : all slp related analyses this time. */
2787 : 284647 : bool slp = !applying_suggested_uf || slp_done_for_suggested_uf;
2788 : :
2789 : : /* Classify all cross-iteration scalar data-flow cycles.
2790 : : Cross-iteration cycles caused by virtual phis are analyzed separately. */
2791 : 284647 : vect_analyze_scalar_cycles (loop_vinfo, slp);
2792 : :
2793 : 284647 : vect_pattern_recog (loop_vinfo);
2794 : :
2795 : 284647 : vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2796 : :
2797 : : /* Analyze the access patterns of the data-refs in the loop (consecutive,
2798 : : complex, etc.). FORNOW: Only handle consecutive access pattern. */
2799 : :
2800 : 284647 : ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2801 : 284647 : if (!ok)
2802 : : {
2803 : 4148 : if (dump_enabled_p ())
2804 : 278 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2805 : : "bad data access.\n");
2806 : 4148 : return ok;
2807 : : }
2808 : :
2809 : : /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2810 : :
2811 : 280499 : ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2812 : 280499 : if (!ok)
2813 : : {
2814 : 10476 : if (dump_enabled_p ())
2815 : 327 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2816 : : "unexpected pattern.\n");
2817 : 10476 : return ok;
2818 : : }
2819 : :
2820 : : /* While the rest of the analysis below depends on it in some way. */
2821 : 270023 : fatal = false;
2822 : :
2823 : : /* Analyze data dependences between the data-refs in the loop
2824 : : and adjust the maximum vectorization factor according to
2825 : : the dependences.
2826 : : FORNOW: fail at the first data dependence that we encounter. */
2827 : :
2828 : 270023 : ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2829 : 270023 : if (!ok)
2830 : : {
2831 : 65513 : if (dump_enabled_p ())
2832 : 572 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2833 : : "bad data dependence.\n");
2834 : 65513 : return ok;
2835 : : }
2836 : 204510 : if (max_vf != MAX_VECTORIZATION_FACTOR
2837 : 204510 : && maybe_lt (max_vf, min_vf))
2838 : 48 : return opt_result::failure_at (vect_location, "bad data dependence.\n");
2839 : 204462 : LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2840 : :
2841 : 204462 : ok = vect_determine_vectorization_factor (loop_vinfo);
2842 : 204462 : if (!ok)
2843 : : {
2844 : 33811 : if (dump_enabled_p ())
2845 : 711 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2846 : : "can't determine vectorization factor.\n");
2847 : 33811 : return ok;
2848 : : }
2849 : :
2850 : : /* Compute the scalar iteration cost. */
2851 : 170651 : vect_compute_single_scalar_iteration_cost (loop_vinfo);
2852 : :
2853 : 170651 : poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2854 : :
2855 : 170651 : if (slp)
2856 : : {
2857 : : /* Check the SLP opportunities in the loop, analyze and build
2858 : : SLP trees. */
2859 : 170651 : ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2860 : 170651 : if (!ok)
2861 : 0 : return ok;
2862 : :
2863 : : /* If there are any SLP instances mark them as pure_slp. */
2864 : 170651 : slp = vect_make_slp_decision (loop_vinfo);
2865 : 170651 : if (slp)
2866 : : {
2867 : : /* Find stmts that need to be both vectorized and SLPed. */
2868 : 9216 : vect_detect_hybrid_slp (loop_vinfo);
2869 : :
2870 : : /* Update the vectorization factor based on the SLP decision. */
2871 : 9216 : vect_update_vf_for_slp (loop_vinfo);
2872 : :
2873 : : /* Optimize the SLP graph with the vectorization factor fixed. */
2874 : 9216 : vect_optimize_slp (loop_vinfo);
2875 : :
2876 : : /* Gather the loads reachable from the SLP graph entries. */
2877 : 9216 : vect_gather_slp_loads (loop_vinfo);
2878 : : }
2879 : : }
2880 : :
2881 : 170651 : bool saved_can_use_partial_vectors_p
2882 : : = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2883 : :
2884 : : /* We don't expect to have to roll back to anything other than an empty
2885 : : set of rgroups. */
2886 : 170651 : gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2887 : :
2888 : : /* This is the point where we can re-start analysis with SLP forced off. */
2889 : 170651 : start_over:
2890 : :
2891 : : /* Apply the suggested unrolling factor, this was determined by the backend
2892 : : during finish_cost the first time we ran the analyzis for this
2893 : : vector mode. */
2894 : 171521 : if (applying_suggested_uf)
2895 : 0 : LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2896 : :
2897 : : /* Now the vectorization factor is final. */
2898 : 171521 : poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2899 : 171521 : gcc_assert (known_ne (vectorization_factor, 0U));
2900 : :
2901 : 171521 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2902 : : {
2903 : 12148 : dump_printf_loc (MSG_NOTE, vect_location,
2904 : : "vectorization_factor = ");
2905 : 12148 : dump_dec (MSG_NOTE, vectorization_factor);
2906 : 12148 : dump_printf (MSG_NOTE, ", niters = %wd\n",
2907 : 12148 : LOOP_VINFO_INT_NITERS (loop_vinfo));
2908 : : }
2909 : :
2910 : 171521 : if (max_vf != MAX_VECTORIZATION_FACTOR
2911 : 171521 : && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2912 : 1 : return opt_result::failure_at (vect_location, "bad data dependence.\n");
2913 : :
2914 : 171520 : loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2915 : :
2916 : : /* Analyze the alignment of the data-refs in the loop.
2917 : : Fail if a data reference is found that cannot be vectorized. */
2918 : :
2919 : 171520 : ok = vect_analyze_data_refs_alignment (loop_vinfo);
2920 : 171520 : if (!ok)
2921 : : {
2922 : 0 : if (dump_enabled_p ())
2923 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2924 : : "bad data alignment.\n");
2925 : 0 : return ok;
2926 : : }
2927 : :
2928 : : /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2929 : : It is important to call pruning after vect_analyze_data_ref_accesses,
2930 : : since we use grouping information gathered by interleaving analysis. */
2931 : 171520 : ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2932 : 171520 : if (!ok)
2933 : 10144 : return ok;
2934 : :
2935 : : /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2936 : : vectorization, since we do not want to add extra peeling or
2937 : : add versioning for alignment. */
2938 : 161376 : if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2939 : : /* This pass will decide on using loop versioning and/or loop peeling in
2940 : : order to enhance the alignment of data references in the loop. */
2941 : 139202 : ok = vect_enhance_data_refs_alignment (loop_vinfo);
2942 : 161376 : if (!ok)
2943 : 0 : return ok;
2944 : :
2945 : 161376 : if (slp)
2946 : : {
2947 : : /* Analyze operations in the SLP instances. Note this may
2948 : : remove unsupported SLP instances which makes the above
2949 : : SLP kind detection invalid. */
2950 : 8663 : unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2951 : 8663 : vect_slp_analyze_operations (loop_vinfo);
2952 : 17326 : if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2953 : : {
2954 : 736 : ok = opt_result::failure_at (vect_location,
2955 : : "unsupported SLP instances\n");
2956 : 736 : goto again;
2957 : : }
2958 : :
2959 : : /* Check whether any load in ALL SLP instances is possibly permuted. */
2960 : : slp_tree load_node, slp_root;
2961 : : unsigned i, x;
2962 : : slp_instance instance;
2963 : : bool can_use_lanes = true;
2964 : 7927 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2965 : : {
2966 : 7927 : slp_root = SLP_INSTANCE_TREE (instance);
2967 : 7927 : int group_size = SLP_TREE_LANES (slp_root);
2968 : 7927 : tree vectype = SLP_TREE_VECTYPE (slp_root);
2969 : 7927 : bool loads_permuted = false;
2970 : 17004 : FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2971 : : {
2972 : 9077 : if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2973 : 5485 : continue;
2974 : : unsigned j;
2975 : : stmt_vec_info load_info;
2976 : 14883 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2977 : 5625 : if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2978 : : {
2979 : : loads_permuted = true;
2980 : : break;
2981 : : }
2982 : : }
2983 : :
2984 : : /* If the loads and stores can be handled with load/store-lane
2985 : : instructions record it and move on to the next instance. */
2986 : 7927 : if (loads_permuted
2987 : 1373 : && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2988 : 8656 : && vect_store_lanes_supported (vectype, group_size, false)
2989 : : != IFN_LAST)
2990 : : {
2991 : 0 : FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2992 : 0 : if (STMT_VINFO_GROUPED_ACCESS
2993 : : (SLP_TREE_REPRESENTATIVE (load_node)))
2994 : : {
2995 : 0 : stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2996 : : (SLP_TREE_REPRESENTATIVE (load_node));
2997 : : /* Use SLP for strided accesses (or if we can't
2998 : : load-lanes). */
2999 : 0 : if (STMT_VINFO_STRIDED_P (stmt_vinfo)
3000 : 0 : || vect_load_lanes_supported
3001 : 0 : (STMT_VINFO_VECTYPE (stmt_vinfo),
3002 : 0 : DR_GROUP_SIZE (stmt_vinfo), false) == IFN_LAST)
3003 : : break;
3004 : : }
3005 : :
3006 : 0 : can_use_lanes
3007 : 0 : = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
3008 : :
3009 : 0 : if (can_use_lanes && dump_enabled_p ())
3010 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
3011 : : "SLP instance %p can use load/store-lanes\n",
3012 : : (void *) instance);
3013 : : }
3014 : : else
3015 : : {
3016 : : can_use_lanes = false;
3017 : : break;
3018 : : }
3019 : : }
3020 : :
3021 : : /* If all SLP instances can use load/store-lanes abort SLP and try again
3022 : : with SLP disabled. */
3023 : 7927 : if (can_use_lanes)
3024 : : {
3025 : 0 : ok = opt_result::failure_at (vect_location,
3026 : : "Built SLP cancelled: can use "
3027 : : "load/store-lanes\n");
3028 : 0 : if (dump_enabled_p ())
3029 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3030 : : "Built SLP cancelled: all SLP instances support "
3031 : : "load/store-lanes\n");
3032 : 0 : goto again;
3033 : : }
3034 : : }
3035 : :
3036 : : /* Dissolve SLP-only groups. */
3037 : 160640 : vect_dissolve_slp_only_groups (loop_vinfo);
3038 : :
3039 : : /* Scan all the remaining operations in the loop that are not subject
3040 : : to SLP and make sure they are vectorizable. */
3041 : 160640 : ok = vect_analyze_loop_operations (loop_vinfo);
3042 : 160640 : if (!ok)
3043 : : {
3044 : 56121 : if (dump_enabled_p ())
3045 : 3576 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3046 : : "bad operation or unsupported loop bound.\n");
3047 : 56121 : return ok;
3048 : : }
3049 : :
3050 : : /* For now, we don't expect to mix both masking and length approaches for one
3051 : : loop, disable it if both are recorded. */
3052 : 104519 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3053 : 12 : && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
3054 : 104531 : && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
3055 : : {
3056 : 0 : if (dump_enabled_p ())
3057 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3058 : : "can't vectorize a loop with partial vectors"
3059 : : " because we don't expect to mix different"
3060 : : " approaches with partial vectors for the"
3061 : : " same loop.\n");
3062 : 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3063 : : }
3064 : :
3065 : : /* If we still have the option of using partial vectors,
3066 : : check whether we can generate the necessary loop controls. */
3067 : 104519 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
3068 : : {
3069 : 12 : if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
3070 : : {
3071 : 12 : if (!vect_verify_full_masking (loop_vinfo)
3072 : 12 : && !vect_verify_full_masking_avx512 (loop_vinfo))
3073 : 2 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3074 : : }
3075 : : else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
3076 : 0 : if (!vect_verify_loop_lens (loop_vinfo))
3077 : 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3078 : : }
3079 : :
3080 : : /* If we're vectorizing a loop that uses length "controls" and
3081 : : can iterate more than once, we apply decrementing IV approach
3082 : : in loop control. */
3083 : 104519 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3084 : 10 : && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
3085 : 0 : && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
3086 : 104519 : && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3087 : 0 : && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
3088 : : LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
3089 : 0 : LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
3090 : :
3091 : : /* If a loop uses length controls and has a decrementing loop control IV,
3092 : : we will normally pass that IV through a MIN_EXPR to calcaluate the
3093 : : basis for the length controls. E.g. in a loop that processes one
3094 : : element per scalar iteration, the number of elements would be
3095 : : MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
3096 : :
3097 : : This MIN_EXPR approach allows us to use pointer IVs with an invariant
3098 : : step, since only the final iteration of the vector loop can have
3099 : : inactive lanes.
3100 : :
3101 : : However, some targets have a dedicated instruction for calculating the
3102 : : preferred length, given the total number of elements that still need to
3103 : : be processed. This is encapsulated in the SELECT_VL internal function.
3104 : :
3105 : : If the target supports SELECT_VL, we can use it instead of MIN_EXPR
3106 : : to determine the basis for the length controls. However, unlike the
3107 : : MIN_EXPR calculation, the SELECT_VL calculation can decide to make
3108 : : lanes inactive in any iteration of the vector loop, not just the last
3109 : : iteration. This SELECT_VL approach therefore requires us to use pointer
3110 : : IVs with variable steps.
3111 : :
3112 : : Once we've decided how many elements should be processed by one
3113 : : iteration of the vector loop, we need to populate the rgroup controls.
3114 : : If a loop has multiple rgroups, we need to make sure that those rgroups
3115 : : "line up" (that is, they must be consistent about which elements are
3116 : : active and which aren't). This is done by vect_adjust_loop_lens_control.
3117 : :
3118 : : In principle, it would be possible to use vect_adjust_loop_lens_control
3119 : : on either the result of a MIN_EXPR or the result of a SELECT_VL.
3120 : : However:
3121 : :
3122 : : (1) In practice, it only makes sense to use SELECT_VL when a vector
3123 : : operation will be controlled directly by the result. It is not
3124 : : worth using SELECT_VL if it would only be the input to other
3125 : : calculations.
3126 : :
3127 : : (2) If we use SELECT_VL for an rgroup that has N controls, each associated
3128 : : pointer IV will need N updates by a variable amount (N-1 updates
3129 : : within the iteration and 1 update to move to the next iteration).
3130 : :
3131 : : Because of this, we prefer to use the MIN_EXPR approach whenever there
3132 : : is more than one length control.
3133 : :
3134 : : In addition, SELECT_VL always operates to a granularity of 1 unit.
3135 : : If we wanted to use it to control an SLP operation on N consecutive
3136 : : elements, we would need to make the SELECT_VL inputs measure scalar
3137 : : iterations (rather than elements) and then multiply the SELECT_VL
3138 : : result by N. But using SELECT_VL this way is inefficient because
3139 : : of (1) above.
3140 : :
3141 : : 2. We don't apply SELECT_VL on single-rgroup when both (1) and (2) are
3142 : : satisfied:
3143 : :
3144 : : (1). LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
3145 : : (2). LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
3146 : :
3147 : : Since SELECT_VL (variable step) will make SCEV analysis failed and then
3148 : : we will fail to gain benefits of following unroll optimizations. We prefer
3149 : : using the MIN_EXPR approach in this situation. */
3150 : 104519 : if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
3151 : : {
3152 : 0 : tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
3153 : 0 : if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
3154 : : OPTIMIZE_FOR_SPEED)
3155 : 0 : && LOOP_VINFO_LENS (loop_vinfo).length () == 1
3156 : 0 : && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1 && !slp
3157 : 0 : && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3158 : : || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
3159 : 0 : LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
3160 : : }
3161 : :
3162 : : /* Decide whether this loop_vinfo should use partial vectors or peeling,
3163 : : assuming that the loop will be used as a main loop. We will redo
3164 : : this analysis later if we instead decide to use the loop as an
3165 : : epilogue loop. */
3166 : 104519 : ok = vect_determine_partial_vectors_and_peeling (loop_vinfo);
3167 : 104519 : if (!ok)
3168 : 0 : return ok;
3169 : :
3170 : : /* If we're vectorizing an epilogue loop, the vectorized loop either needs
3171 : : to be able to handle fewer than VF scalars, or needs to have a lower VF
3172 : : than the main loop. */
3173 : 104519 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3174 : 20265 : && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3175 : : {
3176 : 20265 : poly_uint64 unscaled_vf
3177 : 20265 : = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
3178 : : orig_loop_vinfo->suggested_unroll_factor);
3179 : 20265 : if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
3180 : 11123 : return opt_result::failure_at (vect_location,
3181 : : "Vectorization factor too high for"
3182 : : " epilogue loop.\n");
3183 : : }
3184 : :
3185 : : /* Check the costings of the loop make vectorizing worthwhile. */
3186 : 93396 : res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
3187 : 93396 : if (res < 0)
3188 : : {
3189 : 6997 : ok = opt_result::failure_at (vect_location,
3190 : : "Loop costings may not be worthwhile.\n");
3191 : 6997 : goto again;
3192 : : }
3193 : 86399 : if (!res)
3194 : 41040 : return opt_result::failure_at (vect_location,
3195 : : "Loop costings not worthwhile.\n");
3196 : :
3197 : : /* If an epilogue loop is required make sure we can create one. */
3198 : 45359 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3199 : 44860 : || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
3200 : 26257 : || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3201 : : {
3202 : 19974 : if (dump_enabled_p ())
3203 : 3914 : dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
3204 : 19974 : if (!vect_can_advance_ivs_p (loop_vinfo)
3205 : 39897 : || !slpeel_can_duplicate_loop_p (loop,
3206 : : LOOP_VINFO_IV_EXIT (loop_vinfo),
3207 : 19923 : LOOP_VINFO_IV_EXIT (loop_vinfo)))
3208 : : {
3209 : 52 : ok = opt_result::failure_at (vect_location,
3210 : : "not vectorized: can't create required "
3211 : : "epilog loop\n");
3212 : 52 : goto again;
3213 : : }
3214 : : }
3215 : :
3216 : : /* During peeling, we need to check if number of loop iterations is
3217 : : enough for both peeled prolog loop and vector loop. This check
3218 : : can be merged along with threshold check of loop versioning, so
3219 : : increase threshold for this case if necessary.
3220 : :
3221 : : If we are analyzing an epilogue we still want to check what its
3222 : : versioning threshold would be. If we decide to vectorize the epilogues we
3223 : : will want to use the lowest versioning threshold of all epilogues and main
3224 : : loop. This will enable us to enter a vectorized epilogue even when
3225 : : versioning the loop. We can't simply check whether the epilogue requires
3226 : : versioning though since we may have skipped some versioning checks when
3227 : : analyzing the epilogue. For instance, checks for alias versioning will be
3228 : : skipped when dealing with epilogues as we assume we already checked them
3229 : : for the main loop. So instead we always check the 'orig_loop_vinfo'. */
3230 : 45307 : if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
3231 : : {
3232 : 4899 : poly_uint64 niters_th = 0;
3233 : 4899 : unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3234 : :
3235 : 4899 : if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
3236 : : {
3237 : : /* Niters for peeled prolog loop. */
3238 : 4899 : if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3239 : : {
3240 : 5 : dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
3241 : 5 : tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
3242 : 5 : niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
3243 : : }
3244 : : else
3245 : 4894 : niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3246 : : }
3247 : :
3248 : : /* Niters for at least one iteration of vectorized loop. */
3249 : 4899 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3250 : 4899 : niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3251 : : /* One additional iteration because of peeling for gap. */
3252 : 4899 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3253 : 58 : niters_th += 1;
3254 : :
3255 : : /* Use the same condition as vect_transform_loop to decide when to use
3256 : : the cost to determine a versioning threshold. */
3257 : 4899 : if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3258 : 4899 : && ordered_p (th, niters_th))
3259 : 3365 : niters_th = ordered_max (poly_uint64 (th), niters_th);
3260 : :
3261 : 4899 : LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
3262 : : }
3263 : :
3264 : 45307 : gcc_assert (known_eq (vectorization_factor,
3265 : : LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
3266 : :
3267 : 45307 : slp_done_for_suggested_uf = slp;
3268 : :
3269 : : /* Ok to vectorize! */
3270 : 45307 : LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
3271 : 45307 : return opt_result::success ();
3272 : :
3273 : 7785 : again:
3274 : : /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
3275 : 7785 : gcc_assert (!ok);
3276 : :
3277 : : /* Try again with SLP forced off but if we didn't do any SLP there is
3278 : : no point in re-trying. */
3279 : 7785 : if (!slp)
3280 : 6615 : return ok;
3281 : :
3282 : : /* If the slp decision is true when suggested unroll factor is worked
3283 : : out, and we are applying suggested unroll factor, we don't need to
3284 : : re-try any more. */
3285 : 1170 : if (applying_suggested_uf && slp_done_for_suggested_uf)
3286 : 0 : return ok;
3287 : :
3288 : : /* If there are reduction chains re-trying will fail anyway. */
3289 : 1170 : if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
3290 : 91 : return ok;
3291 : :
3292 : : /* Likewise if the grouped loads or stores in the SLP cannot be handled
3293 : : via interleaving or lane instructions. */
3294 : : slp_instance instance;
3295 : : slp_tree node;
3296 : : unsigned i, j;
3297 : 1402 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
3298 : : {
3299 : 532 : stmt_vec_info vinfo;
3300 : 532 : vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
3301 : 532 : if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
3302 : 48 : continue;
3303 : 484 : vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3304 : 484 : unsigned int size = DR_GROUP_SIZE (vinfo);
3305 : 484 : tree vectype = STMT_VINFO_VECTYPE (vinfo);
3306 : 484 : if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST
3307 : 822 : && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
3308 : 951 : && ! vect_grouped_store_supported (vectype, size))
3309 : 129 : return opt_result::failure_at (vinfo->stmt,
3310 : : "unsupported grouped store\n");
3311 : 702 : FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
3312 : : {
3313 : 280 : vinfo = SLP_TREE_REPRESENTATIVE (node);
3314 : 280 : if (STMT_VINFO_GROUPED_ACCESS (vinfo))
3315 : : {
3316 : 270 : vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3317 : 270 : bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
3318 : 270 : size = DR_GROUP_SIZE (vinfo);
3319 : 270 : vectype = STMT_VINFO_VECTYPE (vinfo);
3320 : 270 : if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
3321 : 270 : && ! vect_grouped_load_supported (vectype, single_element_p,
3322 : : size))
3323 : 80 : return opt_result::failure_at (vinfo->stmt,
3324 : : "unsupported grouped load\n");
3325 : : }
3326 : : }
3327 : : }
3328 : :
3329 : 870 : if (dump_enabled_p ())
3330 : 334 : dump_printf_loc (MSG_NOTE, vect_location,
3331 : : "re-trying with SLP disabled\n");
3332 : :
3333 : : /* Roll back state appropriately. No SLP this time. */
3334 : 870 : slp = false;
3335 : : /* Restore vectorization factor as it were without SLP. */
3336 : 870 : LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
3337 : : /* Free the SLP instances. */
3338 : 1187 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
3339 : 317 : vect_free_slp_instance (instance);
3340 : 870 : LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
3341 : : /* Reset SLP type to loop_vect on all stmts. */
3342 : 2628 : for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
3343 : : {
3344 : 1758 : basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3345 : 1758 : for (gimple_stmt_iterator si = gsi_start_phis (bb);
3346 : 4648 : !gsi_end_p (si); gsi_next (&si))
3347 : : {
3348 : 2890 : stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3349 : 2890 : STMT_SLP_TYPE (stmt_info) = loop_vect;
3350 : 2890 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
3351 : 2890 : || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3352 : : {
3353 : : /* vectorizable_reduction adjusts reduction stmt def-types,
3354 : : restore them to that of the PHI. */
3355 : 486 : STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
3356 : 486 : = STMT_VINFO_DEF_TYPE (stmt_info);
3357 : 486 : STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
3358 : : (STMT_VINFO_REDUC_DEF (stmt_info)))
3359 : 486 : = STMT_VINFO_DEF_TYPE (stmt_info);
3360 : : }
3361 : : }
3362 : 3516 : for (gimple_stmt_iterator si = gsi_start_bb (bb);
3363 : 22314 : !gsi_end_p (si); gsi_next (&si))
3364 : : {
3365 : 20556 : if (is_gimple_debug (gsi_stmt (si)))
3366 : 1653 : continue;
3367 : 18903 : stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3368 : 18903 : STMT_SLP_TYPE (stmt_info) = loop_vect;
3369 : 18903 : if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3370 : : {
3371 : 454 : stmt_vec_info pattern_stmt_info
3372 : : = STMT_VINFO_RELATED_STMT (stmt_info);
3373 : 454 : if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
3374 : 0 : STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
3375 : :
3376 : 454 : gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
3377 : 454 : STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
3378 : 454 : for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
3379 : 1026 : !gsi_end_p (pi); gsi_next (&pi))
3380 : 572 : STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
3381 : 572 : = loop_vect;
3382 : : }
3383 : : }
3384 : : }
3385 : : /* Free optimized alias test DDRS. */
3386 : 870 : LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
3387 : 870 : LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
3388 : 870 : LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
3389 : : /* Reset target cost data. */
3390 : 870 : delete loop_vinfo->vector_costs;
3391 : 870 : loop_vinfo->vector_costs = nullptr;
3392 : : /* Reset accumulated rgroup information. */
3393 : 870 : LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
3394 : 870 : release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
3395 : 870 : release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
3396 : : /* Reset assorted flags. */
3397 : 870 : LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
3398 : 870 : LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
3399 : 870 : LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
3400 : 870 : LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
3401 : 870 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3402 : 870 : = saved_can_use_partial_vectors_p;
3403 : 870 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
3404 : :
3405 : 870 : goto start_over;
3406 : : }
3407 : :
3408 : : /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
3409 : : to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
3410 : : OLD_LOOP_VINFO is better unless something specifically indicates
3411 : : otherwise.
3412 : :
3413 : : Note that this deliberately isn't a partial order. */
3414 : :
3415 : : static bool
3416 : 0 : vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
3417 : : loop_vec_info old_loop_vinfo)
3418 : : {
3419 : 0 : struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
3420 : 0 : gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
3421 : :
3422 : 0 : poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
3423 : 0 : poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
3424 : :
3425 : : /* Always prefer a VF of loop->simdlen over any other VF. */
3426 : 0 : if (loop->simdlen)
3427 : : {
3428 : 0 : bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
3429 : 0 : bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
3430 : 0 : if (new_simdlen_p != old_simdlen_p)
3431 : : return new_simdlen_p;
3432 : : }
3433 : :
3434 : 0 : const auto *old_costs = old_loop_vinfo->vector_costs;
3435 : 0 : const auto *new_costs = new_loop_vinfo->vector_costs;
3436 : 0 : if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
3437 : 0 : return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
3438 : :
3439 : 0 : return new_costs->better_main_loop_than_p (old_costs);
3440 : : }
3441 : :
3442 : : /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
3443 : : true if we should. */
3444 : :
3445 : : static bool
3446 : 0 : vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
3447 : : loop_vec_info old_loop_vinfo)
3448 : : {
3449 : 0 : if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
3450 : : return false;
3451 : :
3452 : 0 : if (dump_enabled_p ())
3453 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
3454 : : "***** Preferring vector mode %s to vector mode %s\n",
3455 : 0 : GET_MODE_NAME (new_loop_vinfo->vector_mode),
3456 : 0 : GET_MODE_NAME (old_loop_vinfo->vector_mode));
3457 : : return true;
3458 : : }
3459 : :
3460 : : /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is
3461 : : not NULL. Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance
3462 : : MODE_I to the next mode useful to analyze.
3463 : : Return the loop_vinfo on success and wrapped null on failure. */
3464 : :
3465 : : static opt_loop_vec_info
3466 : 368018 : vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
3467 : : const vect_loop_form_info *loop_form_info,
3468 : : loop_vec_info main_loop_vinfo,
3469 : : const vector_modes &vector_modes, unsigned &mode_i,
3470 : : machine_mode &autodetected_vector_mode,
3471 : : bool &fatal)
3472 : : {
3473 : 368018 : loop_vec_info loop_vinfo
3474 : 368018 : = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3475 : :
3476 : 368018 : machine_mode vector_mode = vector_modes[mode_i];
3477 : 368018 : loop_vinfo->vector_mode = vector_mode;
3478 : 368018 : unsigned int suggested_unroll_factor = 1;
3479 : 368018 : bool slp_done_for_suggested_uf = false;
3480 : :
3481 : : /* Run the main analysis. */
3482 : 368018 : opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
3483 : : &suggested_unroll_factor,
3484 : : slp_done_for_suggested_uf);
3485 : 368018 : if (dump_enabled_p ())
3486 : 20678 : dump_printf_loc (MSG_NOTE, vect_location,
3487 : : "***** Analysis %s with vector mode %s\n",
3488 : 20678 : res ? "succeeded" : " failed",
3489 : 20678 : GET_MODE_NAME (loop_vinfo->vector_mode));
3490 : :
3491 : 368018 : if (res && !main_loop_vinfo && suggested_unroll_factor > 1)
3492 : : {
3493 : 0 : if (dump_enabled_p ())
3494 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
3495 : : "***** Re-trying analysis for unrolling"
3496 : : " with unroll factor %d and slp %s.\n",
3497 : : suggested_unroll_factor,
3498 : : slp_done_for_suggested_uf ? "on" : "off");
3499 : 0 : loop_vec_info unroll_vinfo
3500 : 0 : = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3501 : 0 : unroll_vinfo->vector_mode = vector_mode;
3502 : 0 : unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
3503 : 0 : opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
3504 : : slp_done_for_suggested_uf);
3505 : 0 : if (new_res)
3506 : : {
3507 : 0 : delete loop_vinfo;
3508 : 0 : loop_vinfo = unroll_vinfo;
3509 : : }
3510 : : else
3511 : 0 : delete unroll_vinfo;
3512 : : }
3513 : :
3514 : : /* Remember the autodetected vector mode. */
3515 : 368018 : if (vector_mode == VOIDmode)
3516 : 177395 : autodetected_vector_mode = loop_vinfo->vector_mode;
3517 : :
3518 : : /* Advance mode_i, first skipping modes that would result in the
3519 : : same analysis result. */
3520 : 1637184 : while (mode_i + 1 < vector_modes.length ()
3521 : 1137983 : && vect_chooses_same_modes_p (loop_vinfo,
3522 : 503400 : vector_modes[mode_i + 1]))
3523 : : {
3524 : 266565 : if (dump_enabled_p ())
3525 : 15090 : dump_printf_loc (MSG_NOTE, vect_location,
3526 : : "***** The result for vector mode %s would"
3527 : : " be the same\n",
3528 : 15090 : GET_MODE_NAME (vector_modes[mode_i + 1]));
3529 : 266565 : mode_i += 1;
3530 : : }
3531 : 368018 : if (mode_i + 1 < vector_modes.length ()
3532 : 236835 : && VECTOR_MODE_P (autodetected_vector_mode)
3533 : 473670 : && (related_vector_mode (vector_modes[mode_i + 1],
3534 : : GET_MODE_INNER (autodetected_vector_mode))
3535 : 236835 : == autodetected_vector_mode)
3536 : 604853 : && (related_vector_mode (autodetected_vector_mode,
3537 : 281 : GET_MODE_INNER (vector_modes[mode_i + 1]))
3538 : 562 : == vector_modes[mode_i + 1]))
3539 : : {
3540 : 281 : if (dump_enabled_p ())
3541 : 2 : dump_printf_loc (MSG_NOTE, vect_location,
3542 : : "***** Skipping vector mode %s, which would"
3543 : : " repeat the analysis for %s\n",
3544 : 2 : GET_MODE_NAME (vector_modes[mode_i + 1]),
3545 : 2 : GET_MODE_NAME (autodetected_vector_mode));
3546 : 281 : mode_i += 1;
3547 : : }
3548 : 368018 : mode_i++;
3549 : :
3550 : 368018 : if (!res)
3551 : : {
3552 : 322711 : delete loop_vinfo;
3553 : 322711 : if (fatal)
3554 : 53011 : gcc_checking_assert (main_loop_vinfo == NULL);
3555 : 322711 : return opt_loop_vec_info::propagate_failure (res);
3556 : : }
3557 : :
3558 : 45307 : return opt_loop_vec_info::success (loop_vinfo);
3559 : : }
3560 : :
3561 : : /* Function vect_analyze_loop.
3562 : :
3563 : : Apply a set of analyses on LOOP, and create a loop_vec_info struct
3564 : : for it. The different analyses will record information in the
3565 : : loop_vec_info struct. */
3566 : : opt_loop_vec_info
3567 : 399020 : vect_analyze_loop (class loop *loop, vec_info_shared *shared)
3568 : : {
3569 : 399020 : DUMP_VECT_SCOPE ("analyze_loop_nest");
3570 : :
3571 : 399020 : if (loop_outer (loop)
3572 : 399020 : && loop_vec_info_for_loop (loop_outer (loop))
3573 : 399422 : && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
3574 : 402 : return opt_loop_vec_info::failure_at (vect_location,
3575 : : "outer-loop already vectorized.\n");
3576 : :
3577 : 398618 : if (!find_loop_nest (loop, &shared->loop_nest))
3578 : 16963 : return opt_loop_vec_info::failure_at
3579 : 16963 : (vect_location,
3580 : : "not vectorized: loop nest containing two or more consecutive inner"
3581 : : " loops cannot be vectorized\n");
3582 : :
3583 : : /* Analyze the loop form. */
3584 : 381655 : vect_loop_form_info loop_form_info;
3585 : 381655 : opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
3586 : 381655 : if (!res)
3587 : : {
3588 : 204260 : if (dump_enabled_p ())
3589 : 1697 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3590 : : "bad loop form.\n");
3591 : 204260 : return opt_loop_vec_info::propagate_failure (res);
3592 : : }
3593 : 177395 : if (!integer_onep (loop_form_info.assumptions))
3594 : : {
3595 : : /* We consider to vectorize this loop by versioning it under
3596 : : some assumptions. In order to do this, we need to clear
3597 : : existing information computed by scev and niter analyzer. */
3598 : 7282 : scev_reset_htab ();
3599 : 7282 : free_numbers_of_iterations_estimates (loop);
3600 : : /* Also set flag for this loop so that following scev and niter
3601 : : analysis are done under the assumptions. */
3602 : 7282 : loop_constraint_set (loop, LOOP_C_FINITE);
3603 : : }
3604 : : else
3605 : : /* Clear the existing niter information to make sure the nonwrapping flag
3606 : : will be calculated and set propriately. */
3607 : 170113 : free_numbers_of_iterations_estimates (loop);
3608 : :
3609 : 177395 : auto_vector_modes vector_modes;
3610 : : /* Autodetect first vector size we try. */
3611 : 177395 : vector_modes.safe_push (VOIDmode);
3612 : 177395 : unsigned int autovec_flags
3613 : 354790 : = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3614 : 177395 : loop->simdlen != 0);
3615 : 177395 : bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3616 : 177395 : && !unlimited_cost_model (loop));
3617 : 177395 : machine_mode autodetected_vector_mode = VOIDmode;
3618 : 177395 : opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3619 : 177395 : unsigned int mode_i = 0;
3620 : 177395 : unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3621 : :
3622 : : /* Keep track of the VF for each mode. Initialize all to 0 which indicates
3623 : : a mode has not been analyzed. */
3624 : 177395 : auto_vec<poly_uint64, 8> cached_vf_per_mode;
3625 : 1789886 : for (unsigned i = 0; i < vector_modes.length (); ++i)
3626 : 717548 : cached_vf_per_mode.safe_push (0);
3627 : :
3628 : : /* First determine the main loop vectorization mode, either the first
3629 : : one that works, starting with auto-detecting the vector mode and then
3630 : : following the targets order of preference, or the one with the
3631 : : lowest cost if pick_lowest_cost_p. */
3632 : 504557 : while (1)
3633 : : {
3634 : 340976 : bool fatal;
3635 : 340976 : unsigned int last_mode_i = mode_i;
3636 : : /* Set cached VF to -1 prior to analysis, which indicates a mode has
3637 : : failed. */
3638 : 340976 : cached_vf_per_mode[last_mode_i] = -1;
3639 : 340976 : opt_loop_vec_info loop_vinfo
3640 : 340976 : = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3641 : : NULL, vector_modes, mode_i,
3642 : : autodetected_vector_mode, fatal);
3643 : 340976 : if (fatal)
3644 : : break;
3645 : :
3646 : 287965 : if (loop_vinfo)
3647 : : {
3648 : : /* Analyzis has been successful so update the VF value. The
3649 : : VF should always be a multiple of unroll_factor and we want to
3650 : : capture the original VF here. */
3651 : 39027 : cached_vf_per_mode[last_mode_i]
3652 : 39027 : = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3653 : 39027 : loop_vinfo->suggested_unroll_factor);
3654 : : /* Once we hit the desired simdlen for the first time,
3655 : : discard any previous attempts. */
3656 : 39027 : if (simdlen
3657 : 39027 : && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3658 : : {
3659 : 47 : delete first_loop_vinfo;
3660 : : first_loop_vinfo = opt_loop_vec_info::success (NULL);
3661 : : simdlen = 0;
3662 : : }
3663 : 38980 : else if (pick_lowest_cost_p
3664 : 0 : && first_loop_vinfo
3665 : 38980 : && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3666 : : {
3667 : : /* Pick loop_vinfo over first_loop_vinfo. */
3668 : 0 : delete first_loop_vinfo;
3669 : 0 : first_loop_vinfo = opt_loop_vec_info::success (NULL);
3670 : : }
3671 : 39027 : if (first_loop_vinfo == NULL)
3672 : : first_loop_vinfo = loop_vinfo;
3673 : : else
3674 : : {
3675 : 3 : delete loop_vinfo;
3676 : 3 : loop_vinfo = opt_loop_vec_info::success (NULL);
3677 : : }
3678 : :
3679 : : /* Commit to first_loop_vinfo if we have no reason to try
3680 : : alternatives. */
3681 : 39027 : if (!simdlen && !pick_lowest_cost_p)
3682 : : break;
3683 : : }
3684 : 248948 : if (mode_i == vector_modes.length ()
3685 : 248948 : || autodetected_vector_mode == VOIDmode)
3686 : : break;
3687 : :
3688 : : /* Try the next biggest vector size. */
3689 : 163581 : if (dump_enabled_p ())
3690 : 3888 : dump_printf_loc (MSG_NOTE, vect_location,
3691 : : "***** Re-trying analysis with vector mode %s\n",
3692 : 3888 : GET_MODE_NAME (vector_modes[mode_i]));
3693 : 163581 : }
3694 : 177395 : if (!first_loop_vinfo)
3695 : 138376 : return opt_loop_vec_info::propagate_failure (res);
3696 : :
3697 : 39019 : if (dump_enabled_p ())
3698 : 8384 : dump_printf_loc (MSG_NOTE, vect_location,
3699 : : "***** Choosing vector mode %s\n",
3700 : 8384 : GET_MODE_NAME (first_loop_vinfo->vector_mode));
3701 : :
3702 : : /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3703 : : enabled, SIMDUID is not set, it is the innermost loop and we have
3704 : : either already found the loop's SIMDLEN or there was no SIMDLEN to
3705 : : begin with.
3706 : : TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3707 : 39019 : bool vect_epilogues = (!simdlen
3708 : 39017 : && loop->inner == NULL
3709 : 38599 : && param_vect_epilogues_nomask
3710 : 37593 : && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3711 : : /* No code motion support for multiple epilogues so for now
3712 : : not supported when multiple exits. */
3713 : 13091 : && !LOOP_VINFO_EARLY_BREAKS (first_loop_vinfo)
3714 : 51876 : && !loop->simduid);
3715 : 39019 : if (!vect_epilogues)
3716 : 27553 : return first_loop_vinfo;
3717 : :
3718 : : /* Now analyze first_loop_vinfo for epilogue vectorization. */
3719 : 11466 : poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3720 : :
3721 : : /* For epilogues start the analysis from the first mode. The motivation
3722 : : behind starting from the beginning comes from cases where the VECTOR_MODES
3723 : : array may contain length-agnostic and length-specific modes. Their
3724 : : ordering is not guaranteed, so we could end up picking a mode for the main
3725 : : loop that is after the epilogue's optimal mode. */
3726 : 11466 : vector_modes[0] = autodetected_vector_mode;
3727 : 11466 : mode_i = 0;
3728 : :
3729 : 11466 : bool supports_partial_vectors =
3730 : 11466 : partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3731 : 11466 : poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3732 : :
3733 : 39411 : while (1)
3734 : : {
3735 : : /* If the target does not support partial vectors we can shorten the
3736 : : number of modes to analyze for the epilogue as we know we can't pick a
3737 : : mode that would lead to a VF at least as big as the
3738 : : FIRST_VINFO_VF. */
3739 : 51753 : if (!supports_partial_vectors
3740 : 39411 : && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3741 : : {
3742 : 12369 : mode_i++;
3743 : 24738 : if (mode_i == vector_modes.length ())
3744 : : break;
3745 : 12342 : continue;
3746 : : }
3747 : :
3748 : 27042 : if (dump_enabled_p ())
3749 : 5058 : dump_printf_loc (MSG_NOTE, vect_location,
3750 : : "***** Re-trying epilogue analysis with vector "
3751 : 5058 : "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3752 : :
3753 : 27042 : bool fatal;
3754 : 27042 : opt_loop_vec_info loop_vinfo
3755 : 27042 : = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3756 : : first_loop_vinfo,
3757 : : vector_modes, mode_i,
3758 : : autodetected_vector_mode, fatal);
3759 : 27042 : if (fatal)
3760 : : break;
3761 : :
3762 : 27042 : if (loop_vinfo)
3763 : : {
3764 : 6280 : if (pick_lowest_cost_p)
3765 : : {
3766 : : /* Keep trying to roll back vectorization attempts while the
3767 : : loop_vec_infos they produced were worse than this one. */
3768 : : vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3769 : 0 : while (!vinfos.is_empty ()
3770 : 0 : && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3771 : : {
3772 : 0 : gcc_assert (vect_epilogues);
3773 : 0 : delete vinfos.pop ();
3774 : : }
3775 : : }
3776 : : /* For now only allow one epilogue loop. */
3777 : 6280 : if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3778 : : {
3779 : 6280 : first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3780 : 6280 : poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3781 : 6280 : gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3782 : : || maybe_ne (lowest_th, 0U));
3783 : : /* Keep track of the known smallest versioning
3784 : : threshold. */
3785 : 6280 : if (ordered_p (lowest_th, th))
3786 : 6280 : lowest_th = ordered_min (lowest_th, th);
3787 : : }
3788 : : else
3789 : : {
3790 : 0 : delete loop_vinfo;
3791 : 0 : loop_vinfo = opt_loop_vec_info::success (NULL);
3792 : : }
3793 : :
3794 : : /* For now only allow one epilogue loop, but allow
3795 : : pick_lowest_cost_p to replace it, so commit to the
3796 : : first epilogue if we have no reason to try alternatives. */
3797 : 6280 : if (!pick_lowest_cost_p)
3798 : : break;
3799 : : }
3800 : :
3801 : 41524 : if (mode_i == vector_modes.length ())
3802 : : break;
3803 : :
3804 : : }
3805 : :
3806 : 11466 : if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3807 : : {
3808 : 6280 : LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3809 : 6280 : if (dump_enabled_p ())
3810 : 1261 : dump_printf_loc (MSG_NOTE, vect_location,
3811 : : "***** Choosing epilogue vector mode %s\n",
3812 : 1261 : GET_MODE_NAME
3813 : : (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3814 : : }
3815 : :
3816 : 11466 : return first_loop_vinfo;
3817 : 559050 : }
3818 : :
3819 : : /* Return true if there is an in-order reduction function for CODE, storing
3820 : : it in *REDUC_FN if so. */
3821 : :
3822 : : static bool
3823 : 2790 : fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3824 : : {
3825 : : /* We support MINUS_EXPR by negating the operand. This also preserves an
3826 : : initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
3827 : : (-0.0) = -0.0. */
3828 : 2790 : if (code == PLUS_EXPR || code == MINUS_EXPR)
3829 : : {
3830 : 2468 : *reduc_fn = IFN_FOLD_LEFT_PLUS;
3831 : 2468 : return true;
3832 : : }
3833 : : return false;
3834 : : }
3835 : :
3836 : : /* Function reduction_fn_for_scalar_code
3837 : :
3838 : : Input:
3839 : : CODE - tree_code of a reduction operations.
3840 : :
3841 : : Output:
3842 : : REDUC_FN - the corresponding internal function to be used to reduce the
3843 : : vector of partial results into a single scalar result, or IFN_LAST
3844 : : if the operation is a supported reduction operation, but does not have
3845 : : such an internal function.
3846 : :
3847 : : Return FALSE if CODE currently cannot be vectorized as reduction. */
3848 : :
3849 : : bool
3850 : 1727808 : reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3851 : : {
3852 : 1727808 : if (code.is_tree_code ())
3853 : 1727769 : switch (tree_code (code))
3854 : : {
3855 : 7864 : case MAX_EXPR:
3856 : 7864 : *reduc_fn = IFN_REDUC_MAX;
3857 : 7864 : return true;
3858 : :
3859 : 43002 : case MIN_EXPR:
3860 : 43002 : *reduc_fn = IFN_REDUC_MIN;
3861 : 43002 : return true;
3862 : :
3863 : 911001 : case PLUS_EXPR:
3864 : 911001 : *reduc_fn = IFN_REDUC_PLUS;
3865 : 911001 : return true;
3866 : :
3867 : 221494 : case BIT_AND_EXPR:
3868 : 221494 : *reduc_fn = IFN_REDUC_AND;
3869 : 221494 : return true;
3870 : :
3871 : 262683 : case BIT_IOR_EXPR:
3872 : 262683 : *reduc_fn = IFN_REDUC_IOR;
3873 : 262683 : return true;
3874 : :
3875 : 32750 : case BIT_XOR_EXPR:
3876 : 32750 : *reduc_fn = IFN_REDUC_XOR;
3877 : 32750 : return true;
3878 : :
3879 : 248975 : case MULT_EXPR:
3880 : 248975 : case MINUS_EXPR:
3881 : 248975 : *reduc_fn = IFN_LAST;
3882 : 248975 : return true;
3883 : :
3884 : : default:
3885 : : return false;
3886 : : }
3887 : : else
3888 : 39 : switch (combined_fn (code))
3889 : : {
3890 : 21 : CASE_CFN_FMAX:
3891 : 21 : *reduc_fn = IFN_REDUC_FMAX;
3892 : 21 : return true;
3893 : :
3894 : 18 : CASE_CFN_FMIN:
3895 : 18 : *reduc_fn = IFN_REDUC_FMIN;
3896 : 18 : return true;
3897 : :
3898 : : default:
3899 : : return false;
3900 : : }
3901 : : }
3902 : :
3903 : : /* If there is a neutral value X such that a reduction would not be affected
3904 : : by the introduction of additional X elements, return that X, otherwise
3905 : : return null. CODE is the code of the reduction and SCALAR_TYPE is type
3906 : : of the scalar elements. If the reduction has just a single initial value
3907 : : then INITIAL_VALUE is that value, otherwise it is null.
3908 : : If AS_INITIAL is TRUE the value is supposed to be used as initial value.
3909 : : In that case no signed zero is returned. */
3910 : :
3911 : : tree
3912 : 25383 : neutral_op_for_reduction (tree scalar_type, code_helper code,
3913 : : tree initial_value, bool as_initial)
3914 : : {
3915 : 25383 : if (code.is_tree_code ())
3916 : 25377 : switch (tree_code (code))
3917 : : {
3918 : 807 : case DOT_PROD_EXPR:
3919 : 807 : case SAD_EXPR:
3920 : 807 : case MINUS_EXPR:
3921 : 807 : case BIT_IOR_EXPR:
3922 : 807 : case BIT_XOR_EXPR:
3923 : 807 : return build_zero_cst (scalar_type);
3924 : 23394 : case WIDEN_SUM_EXPR:
3925 : 23394 : case PLUS_EXPR:
3926 : 23394 : if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
3927 : 7 : return build_real (scalar_type, dconstm0);
3928 : : else
3929 : 23387 : return build_zero_cst (scalar_type);
3930 : :
3931 : 183 : case MULT_EXPR:
3932 : 183 : return build_one_cst (scalar_type);
3933 : :
3934 : 549 : case BIT_AND_EXPR:
3935 : 549 : return build_all_ones_cst (scalar_type);
3936 : :
3937 : : case MAX_EXPR:
3938 : : case MIN_EXPR:
3939 : : return initial_value;
3940 : :
3941 : 0 : default:
3942 : 0 : return NULL_TREE;
3943 : : }
3944 : : else
3945 : 6 : switch (combined_fn (code))
3946 : : {
3947 : : CASE_CFN_FMIN:
3948 : : CASE_CFN_FMAX:
3949 : : return initial_value;
3950 : :
3951 : 0 : default:
3952 : 0 : return NULL_TREE;
3953 : : }
3954 : : }
3955 : :
3956 : : /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3957 : : STMT is printed with a message MSG. */
3958 : :
3959 : : static void
3960 : 474 : report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3961 : : {
3962 : 474 : dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3963 : 474 : }
3964 : :
3965 : : /* Return true if we need an in-order reduction for operation CODE
3966 : : on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
3967 : : overflow must wrap. */
3968 : :
3969 : : bool
3970 : 5300151 : needs_fold_left_reduction_p (tree type, code_helper code)
3971 : : {
3972 : : /* CHECKME: check for !flag_finite_math_only too? */
3973 : 5300151 : if (SCALAR_FLOAT_TYPE_P (type))
3974 : : {
3975 : 521395 : if (code.is_tree_code ())
3976 : 521361 : switch (tree_code (code))
3977 : : {
3978 : : case MIN_EXPR:
3979 : : case MAX_EXPR:
3980 : : return false;
3981 : :
3982 : 520173 : default:
3983 : 520173 : return !flag_associative_math;
3984 : : }
3985 : : else
3986 : 34 : switch (combined_fn (code))
3987 : : {
3988 : : CASE_CFN_FMIN:
3989 : : CASE_CFN_FMAX:
3990 : : return false;
3991 : :
3992 : 1 : default:
3993 : 1 : return !flag_associative_math;
3994 : : }
3995 : : }
3996 : :
3997 : 4778756 : if (INTEGRAL_TYPE_P (type))
3998 : 4778050 : return (!code.is_tree_code ()
3999 : 4778050 : || !operation_no_trapping_overflow (type, tree_code (code)));
4000 : :
4001 : 706 : if (SAT_FIXED_POINT_TYPE_P (type))
4002 : : return true;
4003 : :
4004 : : return false;
4005 : : }
4006 : :
4007 : : /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
4008 : : has a handled computation expression. Store the main reduction
4009 : : operation in *CODE. */
4010 : :
4011 : : static bool
4012 : 70882 : check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
4013 : : tree loop_arg, code_helper *code,
4014 : : vec<std::pair<ssa_op_iter, use_operand_p> > &path)
4015 : : {
4016 : 70882 : auto_bitmap visited;
4017 : 70882 : tree lookfor = PHI_RESULT (phi);
4018 : 70882 : ssa_op_iter curri;
4019 : 70882 : use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
4020 : 147934 : while (USE_FROM_PTR (curr) != loop_arg)
4021 : 6170 : curr = op_iter_next_use (&curri);
4022 : 70882 : curri.i = curri.numops;
4023 : 654706 : do
4024 : : {
4025 : 654706 : path.safe_push (std::make_pair (curri, curr));
4026 : 654706 : tree use = USE_FROM_PTR (curr);
4027 : 654706 : if (use == lookfor)
4028 : : break;
4029 : 584012 : gimple *def = SSA_NAME_DEF_STMT (use);
4030 : 584012 : if (gimple_nop_p (def)
4031 : 584012 : || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
4032 : : {
4033 : 492789 : pop:
4034 : 492789 : do
4035 : : {
4036 : 492789 : std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
4037 : 492789 : curri = x.first;
4038 : 492789 : curr = x.second;
4039 : 537350 : do
4040 : 537350 : curr = op_iter_next_use (&curri);
4041 : : /* Skip already visited or non-SSA operands (from iterating
4042 : : over PHI args). */
4043 : : while (curr != NULL_USE_OPERAND_P
4044 : 1074700 : && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
4045 : 175972 : || ! bitmap_set_bit (visited,
4046 : 175972 : SSA_NAME_VERSION
4047 : : (USE_FROM_PTR (curr)))));
4048 : : }
4049 : 985578 : while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
4050 : 157495 : if (curr == NULL_USE_OPERAND_P)
4051 : : break;
4052 : : }
4053 : : else
4054 : : {
4055 : 491299 : if (gimple_code (def) == GIMPLE_PHI)
4056 : 46530 : curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
4057 : : else
4058 : 444769 : curr = op_iter_init_use (&curri, def, SSA_OP_USE);
4059 : : while (curr != NULL_USE_OPERAND_P
4060 : 573851 : && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
4061 : 506439 : || ! bitmap_set_bit (visited,
4062 : 506439 : SSA_NAME_VERSION
4063 : : (USE_FROM_PTR (curr)))))
4064 : 82552 : curr = op_iter_next_use (&curri);
4065 : 491299 : if (curr == NULL_USE_OPERAND_P)
4066 : 64782 : goto pop;
4067 : : }
4068 : : }
4069 : : while (1);
4070 : 70882 : if (dump_file && (dump_flags & TDF_DETAILS))
4071 : : {
4072 : 3885 : dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
4073 : 3885 : unsigned i;
4074 : 3885 : std::pair<ssa_op_iter, use_operand_p> *x;
4075 : 13142 : FOR_EACH_VEC_ELT (path, i, x)
4076 : 9257 : dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
4077 : 3885 : dump_printf (MSG_NOTE, "\n");
4078 : : }
4079 : :
4080 : : /* Check whether the reduction path detected is valid. */
4081 : 70882 : bool fail = path.length () == 0;
4082 : 70882 : bool neg = false;
4083 : 70882 : int sign = -1;
4084 : 70882 : *code = ERROR_MARK;
4085 : 307542 : for (unsigned i = 1; i < path.length (); ++i)
4086 : : {
4087 : 85644 : gimple *use_stmt = USE_STMT (path[i].second);
4088 : 85644 : gimple_match_op op;
4089 : 85644 : if (!gimple_extract_op (use_stmt, &op))
4090 : : {
4091 : : fail = true;
4092 : 2755 : break;
4093 : : }
4094 : 85207 : unsigned int opi = op.num_ops;
4095 : 85207 : if (gassign *assign = dyn_cast<gassign *> (use_stmt))
4096 : : {
4097 : : /* The following make sure we can compute the operand index
4098 : : easily plus it mostly disallows chaining via COND_EXPR condition
4099 : : operands. */
4100 : 124437 : for (opi = 0; opi < op.num_ops; ++opi)
4101 : 123504 : if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
4102 : : break;
4103 : : }
4104 : 3045 : else if (gcall *call = dyn_cast<gcall *> (use_stmt))
4105 : : {
4106 : 6090 : for (opi = 0; opi < op.num_ops; ++opi)
4107 : 6090 : if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
4108 : : break;
4109 : : }
4110 : 85207 : if (opi == op.num_ops)
4111 : : {
4112 : : fail = true;
4113 : : break;
4114 : : }
4115 : 84274 : op.code = canonicalize_code (op.code, op.type);
4116 : 84274 : if (op.code == MINUS_EXPR)
4117 : : {
4118 : 1879 : op.code = PLUS_EXPR;
4119 : : /* Track whether we negate the reduction value each iteration. */
4120 : 1879 : if (op.ops[1] == op.ops[opi])
4121 : 36 : neg = ! neg;
4122 : : }
4123 : 82395 : else if (op.code == IFN_COND_SUB)
4124 : : {
4125 : 3 : op.code = IFN_COND_ADD;
4126 : : /* Track whether we negate the reduction value each iteration. */
4127 : 3 : if (op.ops[2] == op.ops[opi])
4128 : 0 : neg = ! neg;
4129 : : }
4130 : 84274 : if (CONVERT_EXPR_CODE_P (op.code)
4131 : 84274 : && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
4132 : : ;
4133 : 73932 : else if (*code == ERROR_MARK)
4134 : : {
4135 : 69435 : *code = op.code;
4136 : 69435 : sign = TYPE_SIGN (op.type);
4137 : : }
4138 : 4497 : else if (op.code != *code)
4139 : : {
4140 : : fail = true;
4141 : : break;
4142 : : }
4143 : 3284 : else if ((op.code == MIN_EXPR
4144 : 3140 : || op.code == MAX_EXPR)
4145 : 3291 : && sign != TYPE_SIGN (op.type))
4146 : : {
4147 : : fail = true;
4148 : : break;
4149 : : }
4150 : : /* Check there's only a single stmt the op is used on. For the
4151 : : not value-changing tail and the last stmt allow out-of-loop uses.
4152 : : ??? We could relax this and handle arbitrary live stmts by
4153 : : forcing a scalar epilogue for example. */
4154 : 83058 : imm_use_iterator imm_iter;
4155 : 83058 : use_operand_p use_p;
4156 : 83058 : gimple *op_use_stmt;
4157 : 83058 : unsigned cnt = 0;
4158 : 86101 : bool cond_fn_p = op.code.is_internal_fn ()
4159 : 3043 : && (conditional_internal_fn_code (internal_fn (op.code))
4160 : 83058 : != ERROR_MARK);
4161 : :
4162 : 196849 : FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
4163 : : {
4164 : : /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4165 : : op1 twice (once as definition, once as else) in the same operation.
4166 : : Allow this. */
4167 : 113791 : if (cond_fn_p && op_use_stmt == use_stmt)
4168 : : {
4169 : 3001 : gcall *call = as_a<gcall *> (use_stmt);
4170 : 3001 : unsigned else_pos
4171 : 3001 : = internal_fn_else_index (internal_fn (op.code));
4172 : :
4173 : 15005 : for (unsigned int j = 0; j < gimple_call_num_args (call); ++j)
4174 : : {
4175 : 12004 : if (j == else_pos)
4176 : 3001 : continue;
4177 : 9003 : if (gimple_call_arg (call, j) == op.ops[opi])
4178 : 3001 : cnt++;
4179 : : }
4180 : : }
4181 : 110790 : else if (!is_gimple_debug (op_use_stmt)
4182 : 110790 : && (*code != ERROR_MARK
4183 : 5285 : || flow_bb_inside_loop_p (loop,
4184 : 5285 : gimple_bb (op_use_stmt))))
4185 : 160588 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4186 : 80297 : cnt++;
4187 : 83058 : }
4188 : :
4189 : 83058 : if (cnt != 1)
4190 : : {
4191 : : fail = true;
4192 : : break;
4193 : : }
4194 : : }
4195 : 73834 : return ! fail && ! neg && *code != ERROR_MARK;
4196 : 70882 : }
4197 : :
4198 : : bool
4199 : 24 : check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
4200 : : tree loop_arg, enum tree_code code)
4201 : : {
4202 : 24 : auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4203 : 24 : code_helper code_;
4204 : 24 : return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
4205 : 24 : && code_ == code);
4206 : 24 : }
4207 : :
4208 : :
4209 : :
4210 : : /* Function vect_is_simple_reduction
4211 : :
4212 : : (1) Detect a cross-iteration def-use cycle that represents a simple
4213 : : reduction computation. We look for the following pattern:
4214 : :
4215 : : loop_header:
4216 : : a1 = phi < a0, a2 >
4217 : : a3 = ...
4218 : : a2 = operation (a3, a1)
4219 : :
4220 : : or
4221 : :
4222 : : a3 = ...
4223 : : loop_header:
4224 : : a1 = phi < a0, a2 >
4225 : : a2 = operation (a3, a1)
4226 : :
4227 : : such that:
4228 : : 1. operation is commutative and associative and it is safe to
4229 : : change the order of the computation
4230 : : 2. no uses for a2 in the loop (a2 is used out of the loop)
4231 : : 3. no uses of a1 in the loop besides the reduction operation
4232 : : 4. no uses of a1 outside the loop.
4233 : :
4234 : : Conditions 1,4 are tested here.
4235 : : Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
4236 : :
4237 : : (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
4238 : : nested cycles.
4239 : :
4240 : : (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
4241 : : reductions:
4242 : :
4243 : : a1 = phi < a0, a2 >
4244 : : inner loop (def of a3)
4245 : : a2 = phi < a3 >
4246 : :
4247 : : (4) Detect condition expressions, ie:
4248 : : for (int i = 0; i < N; i++)
4249 : : if (a[i] < val)
4250 : : ret_val = a[i];
4251 : :
4252 : : */
4253 : :
4254 : : static stmt_vec_info
4255 : 90356 : vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
4256 : : bool *double_reduc, bool *reduc_chain_p, bool slp)
4257 : : {
4258 : 90356 : gphi *phi = as_a <gphi *> (phi_info->stmt);
4259 : 90356 : gimple *phi_use_stmt = NULL;
4260 : 90356 : imm_use_iterator imm_iter;
4261 : 90356 : use_operand_p use_p;
4262 : :
4263 : 90356 : *double_reduc = false;
4264 : 90356 : *reduc_chain_p = false;
4265 : 90356 : STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
4266 : :
4267 : 90356 : tree phi_name = PHI_RESULT (phi);
4268 : : /* ??? If there are no uses of the PHI result the inner loop reduction
4269 : : won't be detected as possibly double-reduction by vectorizable_reduction
4270 : : because that tries to walk the PHI arg from the preheader edge which
4271 : : can be constant. See PR60382. */
4272 : 90356 : if (has_zero_uses (phi_name))
4273 : : return NULL;
4274 : 89873 : class loop *loop = (gimple_bb (phi))->loop_father;
4275 : 89873 : unsigned nphi_def_loop_uses = 0;
4276 : 223857 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
4277 : : {
4278 : 138706 : gimple *use_stmt = USE_STMT (use_p);
4279 : 138706 : if (is_gimple_debug (use_stmt))
4280 : 35852 : continue;
4281 : :
4282 : 102854 : if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4283 : : {
4284 : 4722 : if (dump_enabled_p ())
4285 : 53 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4286 : : "intermediate value used outside loop.\n");
4287 : :
4288 : 4722 : return NULL;
4289 : : }
4290 : :
4291 : : /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4292 : : op1 twice (once as definition, once as else) in the same operation.
4293 : : Only count it as one. */
4294 : 98132 : if (use_stmt != phi_use_stmt)
4295 : : {
4296 : 94675 : nphi_def_loop_uses++;
4297 : 94675 : phi_use_stmt = use_stmt;
4298 : : }
4299 : : }
4300 : :
4301 : 85151 : tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
4302 : 85151 : if (TREE_CODE (latch_def) != SSA_NAME)
4303 : : {
4304 : 673 : if (dump_enabled_p ())
4305 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4306 : : "reduction: not ssa_name: %T\n", latch_def);
4307 : 673 : return NULL;
4308 : : }
4309 : :
4310 : 84478 : stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
4311 : 84478 : if (!def_stmt_info
4312 : 84478 : || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
4313 : 137 : return NULL;
4314 : :
4315 : 84341 : bool nested_in_vect_loop
4316 : 84341 : = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
4317 : 84341 : unsigned nlatch_def_loop_uses = 0;
4318 : 84341 : auto_vec<gphi *, 3> lcphis;
4319 : 84341 : bool inner_loop_of_double_reduc = false;
4320 : 329867 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
4321 : : {
4322 : 245526 : gimple *use_stmt = USE_STMT (use_p);
4323 : 245526 : if (is_gimple_debug (use_stmt))
4324 : 76852 : continue;
4325 : 168674 : if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4326 : 91704 : nlatch_def_loop_uses++;
4327 : : else
4328 : : {
4329 : : /* We can have more than one loop-closed PHI. */
4330 : 76970 : lcphis.safe_push (as_a <gphi *> (use_stmt));
4331 : 76970 : if (nested_in_vect_loop
4332 : 76970 : && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
4333 : : == vect_double_reduction_def))
4334 : : inner_loop_of_double_reduc = true;
4335 : : }
4336 : : }
4337 : :
4338 : : /* If we are vectorizing an inner reduction we are executing that
4339 : : in the original order only in case we are not dealing with a
4340 : : double reduction. */
4341 : 84341 : if (nested_in_vect_loop && !inner_loop_of_double_reduc)
4342 : : {
4343 : 1804 : if (dump_enabled_p ())
4344 : 348 : report_vect_op (MSG_NOTE, def_stmt_info->stmt,
4345 : : "detected nested cycle: ");
4346 : 1804 : return def_stmt_info;
4347 : : }
4348 : :
4349 : : /* When the inner loop of a double reduction ends up with more than
4350 : : one loop-closed PHI we have failed to classify alternate such
4351 : : PHIs as double reduction, leading to wrong code. See PR103237. */
4352 : 83063 : if (inner_loop_of_double_reduc && lcphis.length () != 1)
4353 : : {
4354 : 1 : if (dump_enabled_p ())
4355 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4356 : : "unhandle double reduction\n");
4357 : 1 : return NULL;
4358 : : }
4359 : :
4360 : : /* If this isn't a nested cycle or if the nested cycle reduction value
4361 : : is used ouside of the inner loop we cannot handle uses of the reduction
4362 : : value. */
4363 : 82536 : if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
4364 : : {
4365 : 11047 : if (dump_enabled_p ())
4366 : 326 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4367 : : "reduction used in loop.\n");
4368 : 11047 : return NULL;
4369 : : }
4370 : :
4371 : : /* If DEF_STMT is a phi node itself, we expect it to have a single argument
4372 : : defined in the inner loop. */
4373 : 71489 : if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
4374 : : {
4375 : 631 : tree op1 = PHI_ARG_DEF (def_stmt, 0);
4376 : 631 : if (gimple_phi_num_args (def_stmt) != 1
4377 : 631 : || TREE_CODE (op1) != SSA_NAME)
4378 : : {
4379 : 52 : if (dump_enabled_p ())
4380 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4381 : : "unsupported phi node definition.\n");
4382 : :
4383 : 52 : return NULL;
4384 : : }
4385 : :
4386 : : /* Verify there is an inner cycle composed of the PHI phi_use_stmt
4387 : : and the latch definition op1. */
4388 : 579 : gimple *def1 = SSA_NAME_DEF_STMT (op1);
4389 : 579 : if (gimple_bb (def1)
4390 : 579 : && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4391 : 579 : && loop->inner
4392 : 555 : && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
4393 : 555 : && (is_gimple_assign (def1) || is_gimple_call (def1))
4394 : 546 : && is_a <gphi *> (phi_use_stmt)
4395 : 543 : && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
4396 : 1122 : && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
4397 : : loop_latch_edge (loop->inner))))
4398 : : {
4399 : 541 : if (dump_enabled_p ())
4400 : 126 : report_vect_op (MSG_NOTE, def_stmt,
4401 : : "detected double reduction: ");
4402 : :
4403 : 541 : *double_reduc = true;
4404 : 541 : return def_stmt_info;
4405 : : }
4406 : :
4407 : 38 : return NULL;
4408 : : }
4409 : :
4410 : : /* Look for the expression computing latch_def from then loop PHI result. */
4411 : 70858 : auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4412 : 70858 : code_helper code;
4413 : 70858 : if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
4414 : : path))
4415 : : {
4416 : 67906 : STMT_VINFO_REDUC_CODE (phi_info) = code;
4417 : 67906 : if (code == COND_EXPR && !nested_in_vect_loop)
4418 : 1771 : STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
4419 : :
4420 : : /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
4421 : : reduction chain for which the additional restriction is that
4422 : : all operations in the chain are the same. */
4423 : 67906 : auto_vec<stmt_vec_info, 8> reduc_chain;
4424 : 67906 : unsigned i;
4425 : 67906 : bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
4426 : 217145 : for (i = path.length () - 1; i >= 1; --i)
4427 : : {
4428 : 81333 : gimple *stmt = USE_STMT (path[i].second);
4429 : 81333 : stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
4430 : 81333 : gimple_match_op op;
4431 : 81333 : if (!gimple_extract_op (stmt, &op))
4432 : 0 : gcc_unreachable ();
4433 : 81333 : if (gassign *assign = dyn_cast<gassign *> (stmt))
4434 : 78294 : STMT_VINFO_REDUC_IDX (stmt_info)
4435 : 78294 : = path[i].second->use - gimple_assign_rhs1_ptr (assign);
4436 : : else
4437 : : {
4438 : 3039 : gcall *call = as_a<gcall *> (stmt);
4439 : 3039 : STMT_VINFO_REDUC_IDX (stmt_info)
4440 : 3039 : = path[i].second->use - gimple_call_arg_ptr (call, 0);
4441 : : }
4442 : 81333 : bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
4443 : 81333 : && (i == 1 || i == path.length () - 1));
4444 : 12045 : if ((op.code != code && !leading_conversion)
4445 : : /* We can only handle the final value in epilogue
4446 : : generation for reduction chains. */
4447 : 91584 : || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
4448 : : is_slp_reduc = false;
4449 : : /* For reduction chains we support a trailing/leading
4450 : : conversions. We do not store those in the actual chain. */
4451 : 81333 : if (leading_conversion)
4452 : 10251 : continue;
4453 : 71082 : reduc_chain.safe_push (stmt_info);
4454 : : }
4455 : 131720 : if (slp && is_slp_reduc && reduc_chain.length () > 1)
4456 : : {
4457 : 3476 : for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
4458 : : {
4459 : 2682 : REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
4460 : 2682 : REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
4461 : : }
4462 : 794 : REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
4463 : 794 : REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
4464 : :
4465 : : /* Save the chain for further analysis in SLP detection. */
4466 : 794 : LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
4467 : 1588 : REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
4468 : :
4469 : 794 : *reduc_chain_p = true;
4470 : 794 : if (dump_enabled_p ())
4471 : 269 : dump_printf_loc (MSG_NOTE, vect_location,
4472 : : "reduction: detected reduction chain\n");
4473 : : }
4474 : 67112 : else if (dump_enabled_p ())
4475 : 3512 : dump_printf_loc (MSG_NOTE, vect_location,
4476 : : "reduction: detected reduction\n");
4477 : :
4478 : 67906 : return def_stmt_info;
4479 : 67906 : }
4480 : :
4481 : 2952 : if (dump_enabled_p ())
4482 : 103 : dump_printf_loc (MSG_NOTE, vect_location,
4483 : : "reduction: unknown pattern\n");
4484 : :
4485 : : return NULL;
4486 : 155199 : }
4487 : :
4488 : : /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
4489 : : PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
4490 : : or -1 if not known. */
4491 : :
4492 : : static int
4493 : 178437 : vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
4494 : : {
4495 : 178437 : int assumed_vf = vect_vf_for_cost (loop_vinfo);
4496 : 178437 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
4497 : : {
4498 : 75842 : if (dump_enabled_p ())
4499 : 2110 : dump_printf_loc (MSG_NOTE, vect_location,
4500 : : "cost model: epilogue peel iters set to vf/2 "
4501 : : "because loop iterations are unknown .\n");
4502 : 75842 : return assumed_vf / 2;
4503 : : }
4504 : : else
4505 : : {
4506 : 102595 : int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
4507 : 102595 : peel_iters_prologue = MIN (niters, peel_iters_prologue);
4508 : 102595 : int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
4509 : : /* If we need to peel for gaps, but no peeling is required, we have to
4510 : : peel VF iterations. */
4511 : 102595 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
4512 : 102595 : peel_iters_epilogue = assumed_vf;
4513 : 102595 : return peel_iters_epilogue;
4514 : : }
4515 : : }
4516 : :
4517 : : /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
4518 : : int
4519 : 140743 : vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
4520 : : int *peel_iters_epilogue,
4521 : : stmt_vector_for_cost *scalar_cost_vec,
4522 : : stmt_vector_for_cost *prologue_cost_vec,
4523 : : stmt_vector_for_cost *epilogue_cost_vec)
4524 : : {
4525 : 140743 : int retval = 0;
4526 : :
4527 : 140743 : *peel_iters_epilogue
4528 : 140743 : = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
4529 : :
4530 : 140743 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
4531 : : {
4532 : : /* If peeled iterations are known but number of scalar loop
4533 : : iterations are unknown, count a taken branch per peeled loop. */
4534 : 60522 : if (peel_iters_prologue > 0)
4535 : 28391 : retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
4536 : : vect_prologue);
4537 : 60522 : if (*peel_iters_epilogue > 0)
4538 : 60434 : retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
4539 : : vect_epilogue);
4540 : : }
4541 : :
4542 : 140743 : stmt_info_for_cost *si;
4543 : 140743 : int j;
4544 : 140743 : if (peel_iters_prologue)
4545 : 245838 : FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4546 : 197670 : retval += record_stmt_cost (prologue_cost_vec,
4547 : 197670 : si->count * peel_iters_prologue,
4548 : : si->kind, si->stmt_info, si->misalign,
4549 : : vect_prologue);
4550 : 140743 : if (*peel_iters_epilogue)
4551 : 478252 : FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4552 : 380910 : retval += record_stmt_cost (epilogue_cost_vec,
4553 : 380910 : si->count * *peel_iters_epilogue,
4554 : : si->kind, si->stmt_info, si->misalign,
4555 : : vect_epilogue);
4556 : :
4557 : 140743 : return retval;
4558 : : }
4559 : :
4560 : : /* Function vect_estimate_min_profitable_iters
4561 : :
4562 : : Return the number of iterations required for the vector version of the
4563 : : loop to be profitable relative to the cost of the scalar version of the
4564 : : loop.
4565 : :
4566 : : *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4567 : : of iterations for vectorization. -1 value means loop vectorization
4568 : : is not profitable. This returned value may be used for dynamic
4569 : : profitability check.
4570 : :
4571 : : *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4572 : : for static check against estimated number of iterations. */
4573 : :
4574 : : static void
4575 : 53221 : vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
4576 : : int *ret_min_profitable_niters,
4577 : : int *ret_min_profitable_estimate,
4578 : : unsigned *suggested_unroll_factor)
4579 : : {
4580 : 53221 : int min_profitable_iters;
4581 : 53221 : int min_profitable_estimate;
4582 : 53221 : int peel_iters_prologue;
4583 : 53221 : int peel_iters_epilogue;
4584 : 53221 : unsigned vec_inside_cost = 0;
4585 : 53221 : int vec_outside_cost = 0;
4586 : 53221 : unsigned vec_prologue_cost = 0;
4587 : 53221 : unsigned vec_epilogue_cost = 0;
4588 : 53221 : int scalar_single_iter_cost = 0;
4589 : 53221 : int scalar_outside_cost = 0;
4590 : 53221 : int assumed_vf = vect_vf_for_cost (loop_vinfo);
4591 : 53221 : int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
4592 : 53221 : vector_costs *target_cost_data = loop_vinfo->vector_costs;
4593 : :
4594 : : /* Cost model disabled. */
4595 : 53221 : if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
4596 : : {
4597 : 15510 : if (dump_enabled_p ())
4598 : 9351 : dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4599 : 15510 : *ret_min_profitable_niters = 0;
4600 : 15510 : *ret_min_profitable_estimate = 0;
4601 : 20678 : return;
4602 : : }
4603 : :
4604 : : /* Requires loop versioning tests to handle misalignment. */
4605 : 37711 : if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4606 : : {
4607 : : /* FIXME: Make cost depend on complexity of individual check. */
4608 : 0 : unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4609 : 0 : (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4610 : 0 : if (dump_enabled_p ())
4611 : 0 : dump_printf (MSG_NOTE,
4612 : : "cost model: Adding cost of checks for loop "
4613 : : "versioning to treat misalignment.\n");
4614 : : }
4615 : :
4616 : : /* Requires loop versioning with alias checks. */
4617 : 37711 : if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4618 : : {
4619 : : /* FIXME: Make cost depend on complexity of individual check. */
4620 : 2757 : unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4621 : 2757 : (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4622 : 2757 : len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4623 : 0 : if (len)
4624 : : /* Count LEN - 1 ANDs and LEN comparisons. */
4625 : 0 : (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4626 : : scalar_stmt, vect_prologue);
4627 : 2757 : len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4628 : 563 : if (len)
4629 : : {
4630 : : /* Count LEN - 1 ANDs and LEN comparisons. */
4631 : 563 : unsigned int nstmts = len * 2 - 1;
4632 : : /* +1 for each bias that needs adding. */
4633 : 1126 : for (unsigned int i = 0; i < len; ++i)
4634 : 563 : if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4635 : 73 : nstmts += 1;
4636 : 563 : (void) add_stmt_cost (target_cost_data, nstmts,
4637 : : scalar_stmt, vect_prologue);
4638 : : }
4639 : 2757 : if (dump_enabled_p ())
4640 : 11 : dump_printf (MSG_NOTE,
4641 : : "cost model: Adding cost of checks for loop "
4642 : : "versioning aliasing.\n");
4643 : : }
4644 : :
4645 : : /* Requires loop versioning with niter checks. */
4646 : 37711 : if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4647 : : {
4648 : : /* FIXME: Make cost depend on complexity of individual check. */
4649 : 90 : (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4650 : : NULL, NULL, NULL_TREE, 0, vect_prologue);
4651 : 90 : if (dump_enabled_p ())
4652 : 1 : dump_printf (MSG_NOTE,
4653 : : "cost model: Adding cost of checks for loop "
4654 : : "versioning niters.\n");
4655 : : }
4656 : :
4657 : 37711 : if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4658 : 2847 : (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4659 : : vect_prologue);
4660 : :
4661 : : /* Count statements in scalar loop. Using this as scalar cost for a single
4662 : : iteration for now.
4663 : :
4664 : : TODO: Add outer loop support.
4665 : :
4666 : : TODO: Consider assigning different costs to different scalar
4667 : : statements. */
4668 : :
4669 : 37711 : scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4670 : :
4671 : : /* Add additional cost for the peeled instructions in prologue and epilogue
4672 : : loop. (For fully-masked loops there will be no peeling.)
4673 : :
4674 : : FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4675 : : at compile-time - we assume it's vf/2 (the worst would be vf-1).
4676 : :
4677 : : TODO: Build an expression that represents peel_iters for prologue and
4678 : : epilogue to be used in a run-time test. */
4679 : :
4680 : 37711 : bool prologue_need_br_taken_cost = false;
4681 : 37711 : bool prologue_need_br_not_taken_cost = false;
4682 : :
4683 : : /* Calculate peel_iters_prologue. */
4684 : 37711 : if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4685 : : peel_iters_prologue = 0;
4686 : 37711 : else if (npeel < 0)
4687 : : {
4688 : 9 : peel_iters_prologue = assumed_vf / 2;
4689 : 9 : if (dump_enabled_p ())
4690 : 3 : dump_printf (MSG_NOTE, "cost model: "
4691 : : "prologue peel iters set to vf/2.\n");
4692 : :
4693 : : /* If peeled iterations are unknown, count a taken branch and a not taken
4694 : : branch per peeled loop. Even if scalar loop iterations are known,
4695 : : vector iterations are not known since peeled prologue iterations are
4696 : : not known. Hence guards remain the same. */
4697 : : prologue_need_br_taken_cost = true;
4698 : : prologue_need_br_not_taken_cost = true;
4699 : : }
4700 : : else
4701 : : {
4702 : 37702 : peel_iters_prologue = npeel;
4703 : 37702 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4704 : : /* If peeled iterations are known but number of scalar loop
4705 : : iterations are unknown, count a taken branch per peeled loop. */
4706 : 37711 : prologue_need_br_taken_cost = true;
4707 : : }
4708 : :
4709 : 37711 : bool epilogue_need_br_taken_cost = false;
4710 : 37711 : bool epilogue_need_br_not_taken_cost = false;
4711 : :
4712 : : /* Calculate peel_iters_epilogue. */
4713 : 37711 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4714 : : /* We need to peel exactly one iteration for gaps. */
4715 : 8 : peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4716 : 37703 : else if (npeel < 0)
4717 : : {
4718 : : /* If peeling for alignment is unknown, loop bound of main loop
4719 : : becomes unknown. */
4720 : 9 : peel_iters_epilogue = assumed_vf / 2;
4721 : 9 : if (dump_enabled_p ())
4722 : 3 : dump_printf (MSG_NOTE, "cost model: "
4723 : : "epilogue peel iters set to vf/2 because "
4724 : : "peeling for alignment is unknown.\n");
4725 : :
4726 : : /* See the same reason above in peel_iters_prologue calculation. */
4727 : : epilogue_need_br_taken_cost = true;
4728 : : epilogue_need_br_not_taken_cost = true;
4729 : : }
4730 : : else
4731 : : {
4732 : 37694 : peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4733 : 37694 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4734 : : /* If peeled iterations are known but number of scalar loop
4735 : : iterations are unknown, count a taken branch per peeled loop. */
4736 : 37711 : epilogue_need_br_taken_cost = true;
4737 : : }
4738 : :
4739 : 37711 : stmt_info_for_cost *si;
4740 : 37711 : int j;
4741 : : /* Add costs associated with peel_iters_prologue. */
4742 : 37711 : if (peel_iters_prologue)
4743 : 69 : FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4744 : : {
4745 : 54 : (void) add_stmt_cost (target_cost_data,
4746 : 54 : si->count * peel_iters_prologue, si->kind,
4747 : : si->stmt_info, si->node, si->vectype,
4748 : : si->misalign, vect_prologue);
4749 : : }
4750 : :
4751 : : /* Add costs associated with peel_iters_epilogue. */
4752 : 37711 : if (peel_iters_epilogue)
4753 : 118666 : FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4754 : : {
4755 : 101900 : (void) add_stmt_cost (target_cost_data,
4756 : 101900 : si->count * peel_iters_epilogue, si->kind,
4757 : : si->stmt_info, si->node, si->vectype,
4758 : : si->misalign, vect_epilogue);
4759 : : }
4760 : :
4761 : : /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4762 : :
4763 : 37711 : if (prologue_need_br_taken_cost)
4764 : 9 : (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4765 : : vect_prologue);
4766 : :
4767 : 37711 : if (prologue_need_br_not_taken_cost)
4768 : 9 : (void) add_stmt_cost (target_cost_data, 1,
4769 : : cond_branch_not_taken, vect_prologue);
4770 : :
4771 : 37711 : if (epilogue_need_br_taken_cost)
4772 : 14656 : (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4773 : : vect_epilogue);
4774 : :
4775 : 37711 : if (epilogue_need_br_not_taken_cost)
4776 : 9 : (void) add_stmt_cost (target_cost_data, 1,
4777 : : cond_branch_not_taken, vect_epilogue);
4778 : :
4779 : : /* Take care of special costs for rgroup controls of partial vectors. */
4780 : 8 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4781 : 37719 : && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4782 : : == vect_partial_vectors_avx512))
4783 : : {
4784 : : /* Calculate how many masks we need to generate. */
4785 : 8 : unsigned int num_masks = 0;
4786 : 8 : bool need_saturation = false;
4787 : 34 : for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
4788 : 10 : if (rgm.type)
4789 : : {
4790 : 8 : unsigned nvectors = rgm.factor;
4791 : 8 : num_masks += nvectors;
4792 : 8 : if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
4793 : 8 : < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
4794 : 0 : need_saturation = true;
4795 : : }
4796 : :
4797 : : /* ??? The target isn't able to identify the costs below as
4798 : : producing masks so it cannot penaltize cases where we'd run
4799 : : out of mask registers for example. */
4800 : :
4801 : : /* ??? We are also failing to account for smaller vector masks
4802 : : we generate by splitting larger masks in vect_get_loop_mask. */
4803 : :
4804 : : /* In the worst case, we need to generate each mask in the prologue
4805 : : and in the loop body. We need one splat per group and one
4806 : : compare per mask.
4807 : :
4808 : : Sometimes the prologue mask will fold to a constant,
4809 : : so the actual prologue cost might be smaller. However, it's
4810 : : simpler and safer to use the worst-case cost; if this ends up
4811 : : being the tie-breaker between vectorizing or not, then it's
4812 : : probably better not to vectorize. */
4813 : 16 : (void) add_stmt_cost (target_cost_data,
4814 : : num_masks
4815 : 16 : + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4816 : : vector_stmt, NULL, NULL, NULL_TREE, 0,
4817 : : vect_prologue);
4818 : 16 : (void) add_stmt_cost (target_cost_data,
4819 : : num_masks
4820 : 16 : + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4821 : : vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);
4822 : :
4823 : : /* When we need saturation we need it both in the prologue and
4824 : : the epilogue. */
4825 : 8 : if (need_saturation)
4826 : : {
4827 : 0 : (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4828 : : NULL, NULL, NULL_TREE, 0, vect_prologue);
4829 : 0 : (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4830 : : NULL, NULL, NULL_TREE, 0, vect_body);
4831 : : }
4832 : : }
4833 : 0 : else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4834 : 37703 : && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4835 : : == vect_partial_vectors_while_ult))
4836 : : {
4837 : : /* Calculate how many masks we need to generate. */
4838 : : unsigned int num_masks = 0;
4839 : : rgroup_controls *rgm;
4840 : : unsigned int num_vectors_m1;
4841 : 0 : FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
4842 : : num_vectors_m1, rgm)
4843 : 0 : if (rgm->type)
4844 : 0 : num_masks += num_vectors_m1 + 1;
4845 : 0 : gcc_assert (num_masks > 0);
4846 : :
4847 : : /* In the worst case, we need to generate each mask in the prologue
4848 : : and in the loop body. One of the loop body mask instructions
4849 : : replaces the comparison in the scalar loop, and since we don't
4850 : : count the scalar comparison against the scalar body, we shouldn't
4851 : : count that vector instruction against the vector body either.
4852 : :
4853 : : Sometimes we can use unpacks instead of generating prologue
4854 : : masks and sometimes the prologue mask will fold to a constant,
4855 : : so the actual prologue cost might be smaller. However, it's
4856 : : simpler and safer to use the worst-case cost; if this ends up
4857 : : being the tie-breaker between vectorizing or not, then it's
4858 : : probably better not to vectorize. */
4859 : 0 : (void) add_stmt_cost (target_cost_data, num_masks,
4860 : : vector_stmt, NULL, NULL, NULL_TREE, 0,
4861 : : vect_prologue);
4862 : 0 : (void) add_stmt_cost (target_cost_data, num_masks - 1,
4863 : : vector_stmt, NULL, NULL, NULL_TREE, 0,
4864 : : vect_body);
4865 : : }
4866 : 37703 : else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4867 : : {
4868 : : /* Referring to the functions vect_set_loop_condition_partial_vectors
4869 : : and vect_set_loop_controls_directly, we need to generate each
4870 : : length in the prologue and in the loop body if required. Although
4871 : : there are some possible optimizations, we consider the worst case
4872 : : here. */
4873 : :
4874 : 0 : bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4875 : 0 : signed char partial_load_store_bias
4876 : : = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4877 : 0 : bool need_iterate_p
4878 : 0 : = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4879 : 0 : && !vect_known_niters_smaller_than_vf (loop_vinfo));
4880 : :
4881 : : /* Calculate how many statements to be added. */
4882 : 0 : unsigned int prologue_stmts = 0;
4883 : 0 : unsigned int body_stmts = 0;
4884 : :
4885 : 0 : rgroup_controls *rgc;
4886 : 0 : unsigned int num_vectors_m1;
4887 : 0 : FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4888 : 0 : if (rgc->type)
4889 : : {
4890 : : /* May need one SHIFT for nitems_total computation. */
4891 : 0 : unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4892 : 0 : if (nitems != 1 && !niters_known_p)
4893 : 0 : prologue_stmts += 1;
4894 : :
4895 : : /* May need one MAX and one MINUS for wrap around. */
4896 : 0 : if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4897 : 0 : prologue_stmts += 2;
4898 : :
4899 : : /* Need one MAX and one MINUS for each batch limit excepting for
4900 : : the 1st one. */
4901 : 0 : prologue_stmts += num_vectors_m1 * 2;
4902 : :
4903 : 0 : unsigned int num_vectors = num_vectors_m1 + 1;
4904 : :
4905 : : /* Need to set up lengths in prologue, only one MIN required
4906 : : for each since start index is zero. */
4907 : 0 : prologue_stmts += num_vectors;
4908 : :
4909 : : /* If we have a non-zero partial load bias, we need one PLUS
4910 : : to adjust the load length. */
4911 : 0 : if (partial_load_store_bias != 0)
4912 : 0 : body_stmts += 1;
4913 : :
4914 : 0 : unsigned int length_update_cost = 0;
4915 : 0 : if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
4916 : : /* For decrement IV style, Each only need a single SELECT_VL
4917 : : or MIN since beginning to calculate the number of elements
4918 : : need to be processed in current iteration. */
4919 : : length_update_cost = 1;
4920 : : else
4921 : : /* For increment IV stype, Each may need two MINs and one MINUS to
4922 : : update lengths in body for next iteration. */
4923 : 0 : length_update_cost = 3;
4924 : :
4925 : 0 : if (need_iterate_p)
4926 : 0 : body_stmts += length_update_cost * num_vectors;
4927 : : }
4928 : :
4929 : 0 : (void) add_stmt_cost (target_cost_data, prologue_stmts,
4930 : : scalar_stmt, vect_prologue);
4931 : 0 : (void) add_stmt_cost (target_cost_data, body_stmts,
4932 : : scalar_stmt, vect_body);
4933 : : }
4934 : :
4935 : : /* FORNOW: The scalar outside cost is incremented in one of the
4936 : : following ways:
4937 : :
4938 : : 1. The vectorizer checks for alignment and aliasing and generates
4939 : : a condition that allows dynamic vectorization. A cost model
4940 : : check is ANDED with the versioning condition. Hence scalar code
4941 : : path now has the added cost of the versioning check.
4942 : :
4943 : : if (cost > th & versioning_check)
4944 : : jmp to vector code
4945 : :
4946 : : Hence run-time scalar is incremented by not-taken branch cost.
4947 : :
4948 : : 2. The vectorizer then checks if a prologue is required. If the
4949 : : cost model check was not done before during versioning, it has to
4950 : : be done before the prologue check.
4951 : :
4952 : : if (cost <= th)
4953 : : prologue = scalar_iters
4954 : : if (prologue == 0)
4955 : : jmp to vector code
4956 : : else
4957 : : execute prologue
4958 : : if (prologue == num_iters)
4959 : : go to exit
4960 : :
4961 : : Hence the run-time scalar cost is incremented by a taken branch,
4962 : : plus a not-taken branch, plus a taken branch cost.
4963 : :
4964 : : 3. The vectorizer then checks if an epilogue is required. If the
4965 : : cost model check was not done before during prologue check, it
4966 : : has to be done with the epilogue check.
4967 : :
4968 : : if (prologue == 0)
4969 : : jmp to vector code
4970 : : else
4971 : : execute prologue
4972 : : if (prologue == num_iters)
4973 : : go to exit
4974 : : vector code:
4975 : : if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4976 : : jmp to epilogue
4977 : :
4978 : : Hence the run-time scalar cost should be incremented by 2 taken
4979 : : branches.
4980 : :
4981 : : TODO: The back end may reorder the BBS's differently and reverse
4982 : : conditions/branch directions. Change the estimates below to
4983 : : something more reasonable. */
4984 : :
4985 : : /* If the number of iterations is known and we do not do versioning, we can
4986 : : decide whether to vectorize at compile time. Hence the scalar version
4987 : : do not carry cost model guard costs. */
4988 : 22386 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4989 : 60097 : || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4990 : : {
4991 : : /* Cost model check occurs at versioning. */
4992 : 15664 : if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4993 : 2847 : scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4994 : : else
4995 : : {
4996 : : /* Cost model check occurs at prologue generation. */
4997 : 12817 : if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4998 : 2 : scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4999 : 2 : + vect_get_stmt_cost (cond_branch_not_taken);
5000 : : /* Cost model check occurs at epilogue generation. */
5001 : : else
5002 : 12815 : scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
5003 : : }
5004 : : }
5005 : :
5006 : : /* Complete the target-specific cost calculations. */
5007 : 37711 : finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
5008 : : &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
5009 : : suggested_unroll_factor);
5010 : :
5011 : 37711 : if (suggested_unroll_factor && *suggested_unroll_factor > 1
5012 : 0 : && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
5013 : 37711 : && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
5014 : : *suggested_unroll_factor,
5015 : : LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
5016 : : {
5017 : 0 : if (dump_enabled_p ())
5018 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5019 : : "can't unroll as unrolled vectorization factor larger"
5020 : : " than maximum vectorization factor: "
5021 : : HOST_WIDE_INT_PRINT_UNSIGNED "\n",
5022 : : LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
5023 : 0 : *suggested_unroll_factor = 1;
5024 : : }
5025 : :
5026 : 37711 : vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
5027 : :
5028 : 37711 : if (dump_enabled_p ())
5029 : : {
5030 : 330 : dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
5031 : 330 : dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
5032 : : vec_inside_cost);
5033 : 330 : dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
5034 : : vec_prologue_cost);
5035 : 330 : dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
5036 : : vec_epilogue_cost);
5037 : 330 : dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
5038 : : scalar_single_iter_cost);
5039 : 330 : dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
5040 : : scalar_outside_cost);
5041 : 330 : dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
5042 : : vec_outside_cost);
5043 : 330 : dump_printf (MSG_NOTE, " prologue iterations: %d\n",
5044 : : peel_iters_prologue);
5045 : 330 : dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
5046 : : peel_iters_epilogue);
5047 : : }
5048 : :
5049 : : /* Calculate number of iterations required to make the vector version
5050 : : profitable, relative to the loop bodies only. The following condition
5051 : : must hold true:
5052 : : SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
5053 : : where
5054 : : SIC = scalar iteration cost, VIC = vector iteration cost,
5055 : : VOC = vector outside cost, VF = vectorization factor,
5056 : : NPEEL = prologue iterations + epilogue iterations,
5057 : : SOC = scalar outside cost for run time cost model check. */
5058 : :
5059 : 37711 : int saving_per_viter = (scalar_single_iter_cost * assumed_vf
5060 : 37711 : - vec_inside_cost);
5061 : 37711 : if (saving_per_viter <= 0)
5062 : : {
5063 : 5168 : if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
5064 : 0 : warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
5065 : : "vectorization did not happen for a simd loop");
5066 : :
5067 : 5168 : if (dump_enabled_p ())
5068 : 7 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5069 : : "cost model: the vector iteration cost = %d "
5070 : : "divided by the scalar iteration cost = %d "
5071 : : "is greater or equal to the vectorization factor = %d"
5072 : : ".\n",
5073 : : vec_inside_cost, scalar_single_iter_cost, assumed_vf);
5074 : 5168 : *ret_min_profitable_niters = -1;
5075 : 5168 : *ret_min_profitable_estimate = -1;
5076 : 5168 : return;
5077 : : }
5078 : :
5079 : : /* ??? The "if" arm is written to handle all cases; see below for what
5080 : : we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
5081 : 32543 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5082 : : {
5083 : : /* Rewriting the condition above in terms of the number of
5084 : : vector iterations (vniters) rather than the number of
5085 : : scalar iterations (niters) gives:
5086 : :
5087 : : SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
5088 : :
5089 : : <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
5090 : :
5091 : : For integer N, X and Y when X > 0:
5092 : :
5093 : : N * X > Y <==> N >= (Y /[floor] X) + 1. */
5094 : 5 : int outside_overhead = (vec_outside_cost
5095 : 5 : - scalar_single_iter_cost * peel_iters_prologue
5096 : 5 : - scalar_single_iter_cost * peel_iters_epilogue
5097 : : - scalar_outside_cost);
5098 : : /* We're only interested in cases that require at least one
5099 : : vector iteration. */
5100 : 5 : int min_vec_niters = 1;
5101 : 5 : if (outside_overhead > 0)
5102 : 4 : min_vec_niters = outside_overhead / saving_per_viter + 1;
5103 : :
5104 : 5 : if (dump_enabled_p ())
5105 : 0 : dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
5106 : : min_vec_niters);
5107 : :
5108 : 5 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5109 : : {
5110 : : /* Now that we know the minimum number of vector iterations,
5111 : : find the minimum niters for which the scalar cost is larger:
5112 : :
5113 : : SIC * niters > VIC * vniters + VOC - SOC
5114 : :
5115 : : We know that the minimum niters is no more than
5116 : : vniters * VF + NPEEL, but it might be (and often is) less
5117 : : than that if a partial vector iteration is cheaper than the
5118 : : equivalent scalar code. */
5119 : 5 : int threshold = (vec_inside_cost * min_vec_niters
5120 : 5 : + vec_outside_cost
5121 : 5 : - scalar_outside_cost);
5122 : 5 : if (threshold <= 0)
5123 : : min_profitable_iters = 1;
5124 : : else
5125 : 5 : min_profitable_iters = threshold / scalar_single_iter_cost + 1;
5126 : : }
5127 : : else
5128 : : /* Convert the number of vector iterations into a number of
5129 : : scalar iterations. */
5130 : 0 : min_profitable_iters = (min_vec_niters * assumed_vf
5131 : 0 : + peel_iters_prologue
5132 : : + peel_iters_epilogue);
5133 : : }
5134 : : else
5135 : : {
5136 : 32538 : min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
5137 : 32538 : * assumed_vf
5138 : 32538 : - vec_inside_cost * peel_iters_prologue
5139 : 32538 : - vec_inside_cost * peel_iters_epilogue);
5140 : 32538 : if (min_profitable_iters <= 0)
5141 : : min_profitable_iters = 0;
5142 : : else
5143 : : {
5144 : 25463 : min_profitable_iters /= saving_per_viter;
5145 : :
5146 : 25463 : if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
5147 : 25463 : <= (((int) vec_inside_cost * min_profitable_iters)
5148 : 25463 : + (((int) vec_outside_cost - scalar_outside_cost)
5149 : : * assumed_vf)))
5150 : 25463 : min_profitable_iters++;
5151 : : }
5152 : : }
5153 : :
5154 : 32543 : if (dump_enabled_p ())
5155 : 323 : dump_printf (MSG_NOTE,
5156 : : " Calculated minimum iters for profitability: %d\n",
5157 : : min_profitable_iters);
5158 : :
5159 : 32543 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
5160 : 32538 : && min_profitable_iters < (assumed_vf + peel_iters_prologue))
5161 : : /* We want the vectorized loop to execute at least once. */
5162 : : min_profitable_iters = assumed_vf + peel_iters_prologue;
5163 : 3804 : else if (min_profitable_iters < peel_iters_prologue)
5164 : : /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
5165 : : vectorized loop executes at least once. */
5166 : : min_profitable_iters = peel_iters_prologue;
5167 : :
5168 : 32543 : if (dump_enabled_p ())
5169 : 323 : dump_printf_loc (MSG_NOTE, vect_location,
5170 : : " Runtime profitability threshold = %d\n",
5171 : : min_profitable_iters);
5172 : :
5173 : 32543 : *ret_min_profitable_niters = min_profitable_iters;
5174 : :
5175 : : /* Calculate number of iterations required to make the vector version
5176 : : profitable, relative to the loop bodies only.
5177 : :
5178 : : Non-vectorized variant is SIC * niters and it must win over vector
5179 : : variant on the expected loop trip count. The following condition must hold true:
5180 : : SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
5181 : :
5182 : 32543 : if (vec_outside_cost <= 0)
5183 : : min_profitable_estimate = 0;
5184 : : /* ??? This "else if" arm is written to handle all cases; see below for
5185 : : what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
5186 : 27784 : else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5187 : : {
5188 : : /* This is a repeat of the code above, but with + SOC rather
5189 : : than - SOC. */
5190 : 5 : int outside_overhead = (vec_outside_cost
5191 : 5 : - scalar_single_iter_cost * peel_iters_prologue
5192 : 5 : - scalar_single_iter_cost * peel_iters_epilogue
5193 : : + scalar_outside_cost);
5194 : 5 : int min_vec_niters = 1;
5195 : 5 : if (outside_overhead > 0)
5196 : 5 : min_vec_niters = outside_overhead / saving_per_viter + 1;
5197 : :
5198 : 5 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5199 : : {
5200 : 5 : int threshold = (vec_inside_cost * min_vec_niters
5201 : 5 : + vec_outside_cost
5202 : 5 : + scalar_outside_cost);
5203 : 5 : min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
5204 : : }
5205 : : else
5206 : : min_profitable_estimate = (min_vec_niters * assumed_vf
5207 : : + peel_iters_prologue
5208 : : + peel_iters_epilogue);
5209 : : }
5210 : : else
5211 : : {
5212 : 27779 : min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
5213 : 27779 : * assumed_vf
5214 : 27779 : - vec_inside_cost * peel_iters_prologue
5215 : 27779 : - vec_inside_cost * peel_iters_epilogue)
5216 : 27779 : / ((scalar_single_iter_cost * assumed_vf)
5217 : : - vec_inside_cost);
5218 : : }
5219 : 32543 : min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
5220 : 32543 : if (dump_enabled_p ())
5221 : 323 : dump_printf_loc (MSG_NOTE, vect_location,
5222 : : " Static estimate profitability threshold = %d\n",
5223 : : min_profitable_estimate);
5224 : :
5225 : 32543 : *ret_min_profitable_estimate = min_profitable_estimate;
5226 : : }
5227 : :
5228 : : /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
5229 : : vector elements (not bits) for a vector with NELT elements. */
5230 : : static void
5231 : 1878 : calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
5232 : : vec_perm_builder *sel)
5233 : : {
5234 : : /* The encoding is a single stepped pattern. Any wrap-around is handled
5235 : : by vec_perm_indices. */
5236 : 1878 : sel->new_vector (nelt, 1, 3);
5237 : 7512 : for (unsigned int i = 0; i < 3; i++)
5238 : 5634 : sel->quick_push (i + offset);
5239 : 1878 : }
5240 : :
5241 : : /* Checks whether the target supports whole-vector shifts for vectors of mode
5242 : : MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
5243 : : it supports vec_perm_const with masks for all necessary shift amounts. */
5244 : : static bool
5245 : 7087 : have_whole_vector_shift (machine_mode mode)
5246 : : {
5247 : 7087 : if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
5248 : : return true;
5249 : :
5250 : : /* Variable-length vectors should be handled via the optab. */
5251 : 62 : unsigned int nelt;
5252 : 124 : if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5253 : : return false;
5254 : :
5255 : 62 : vec_perm_builder sel;
5256 : 62 : vec_perm_indices indices;
5257 : 313 : for (unsigned int i = nelt / 2; i >= 1; i /= 2)
5258 : : {
5259 : 251 : calc_vec_perm_mask_for_shift (i, nelt, &sel);
5260 : 251 : indices.new_vector (sel, 2, nelt);
5261 : 251 : if (!can_vec_perm_const_p (mode, mode, indices, false))
5262 : : return false;
5263 : : }
5264 : : return true;
5265 : 62 : }
5266 : :
5267 : : /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
5268 : : multiplication operands have differing signs and (b) we intend
5269 : : to emulate the operation using a series of signed DOT_PROD_EXPRs.
5270 : : See vect_emulate_mixed_dot_prod for the actual sequence used. */
5271 : :
5272 : : static bool
5273 : 57593 : vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo,
5274 : : stmt_vec_info stmt_info)
5275 : : {
5276 : 57593 : gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
5277 : 53652 : if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
5278 : : return false;
5279 : :
5280 : 427 : tree rhs1 = gimple_assign_rhs1 (assign);
5281 : 427 : tree rhs2 = gimple_assign_rhs2 (assign);
5282 : 427 : if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
5283 : : return false;
5284 : :
5285 : 122 : stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5286 : 122 : gcc_assert (reduc_info->is_reduc_info);
5287 : 122 : return !directly_supported_p (DOT_PROD_EXPR,
5288 : : STMT_VINFO_REDUC_VECTYPE_IN (reduc_info),
5289 : 122 : optab_vector_mixed_sign);
5290 : : }
5291 : :
5292 : : /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
5293 : : functions. Design better to avoid maintenance issues. */
5294 : :
5295 : : /* Function vect_model_reduction_cost.
5296 : :
5297 : : Models cost for a reduction operation, including the vector ops
5298 : : generated within the strip-mine loop in some cases, the initial
5299 : : definition before the loop, and the epilogue code that must be generated. */
5300 : :
5301 : : static void
5302 : 52251 : vect_model_reduction_cost (loop_vec_info loop_vinfo,
5303 : : stmt_vec_info stmt_info, internal_fn reduc_fn,
5304 : : vect_reduction_type reduction_type,
5305 : : int ncopies, stmt_vector_for_cost *cost_vec)
5306 : : {
5307 : 52251 : int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
5308 : 52251 : tree vectype;
5309 : 52251 : machine_mode mode;
5310 : 52251 : class loop *loop = NULL;
5311 : :
5312 : 52251 : if (loop_vinfo)
5313 : 52251 : loop = LOOP_VINFO_LOOP (loop_vinfo);
5314 : :
5315 : : /* Condition reductions generate two reductions in the loop. */
5316 : 52251 : if (reduction_type == COND_REDUCTION)
5317 : 222 : ncopies *= 2;
5318 : :
5319 : 52251 : vectype = STMT_VINFO_VECTYPE (stmt_info);
5320 : 52251 : mode = TYPE_MODE (vectype);
5321 : 52251 : stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5322 : :
5323 : 52251 : gimple_match_op op;
5324 : 52251 : if (!gimple_extract_op (orig_stmt_info->stmt, &op))
5325 : 0 : gcc_unreachable ();
5326 : :
5327 : 52251 : bool emulated_mixed_dot_prod
5328 : 52251 : = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
5329 : 52251 : if (reduction_type == EXTRACT_LAST_REDUCTION)
5330 : : /* No extra instructions are needed in the prologue. The loop body
5331 : : operations are costed in vectorizable_condition. */
5332 : : inside_cost = 0;
5333 : 52251 : else if (reduction_type == FOLD_LEFT_REDUCTION)
5334 : : {
5335 : : /* No extra instructions needed in the prologue. */
5336 : 2379 : prologue_cost = 0;
5337 : :
5338 : 2379 : if (reduc_fn != IFN_LAST)
5339 : : /* Count one reduction-like operation per vector. */
5340 : 0 : inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
5341 : : stmt_info, 0, vect_body);
5342 : : else
5343 : : {
5344 : : /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
5345 : 2379 : unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
5346 : 2379 : inside_cost = record_stmt_cost (cost_vec, nelements,
5347 : : vec_to_scalar, stmt_info, 0,
5348 : : vect_body);
5349 : 2379 : inside_cost += record_stmt_cost (cost_vec, nelements,
5350 : : scalar_stmt, stmt_info, 0,
5351 : : vect_body);
5352 : : }
5353 : : }
5354 : : else
5355 : : {
5356 : : /* Add in the cost of the initial definitions. */
5357 : 49872 : int prologue_stmts;
5358 : 49872 : if (reduction_type == COND_REDUCTION)
5359 : : /* For cond reductions we have four vectors: initial index, step,
5360 : : initial result of the data reduction, initial value of the index
5361 : : reduction. */
5362 : : prologue_stmts = 4;
5363 : 49650 : else if (emulated_mixed_dot_prod)
5364 : : /* We need the initial reduction value and two invariants:
5365 : : one that contains the minimum signed value and one that
5366 : : contains half of its negative. */
5367 : : prologue_stmts = 3;
5368 : : else
5369 : 49614 : prologue_stmts = 1;
5370 : 49872 : prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
5371 : : scalar_to_vec, stmt_info, 0,
5372 : : vect_prologue);
5373 : : }
5374 : :
5375 : : /* Determine cost of epilogue code.
5376 : :
5377 : : We have a reduction operator that will reduce the vector in one statement.
5378 : : Also requires scalar extract. */
5379 : :
5380 : 52251 : if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
5381 : : {
5382 : 52157 : if (reduc_fn != IFN_LAST)
5383 : : {
5384 : 41916 : if (reduction_type == COND_REDUCTION)
5385 : : {
5386 : : /* An EQ stmt and an COND_EXPR stmt. */
5387 : 9 : epilogue_cost += record_stmt_cost (cost_vec, 2,
5388 : : vector_stmt, stmt_info, 0,
5389 : : vect_epilogue);
5390 : : /* Reduction of the max index and a reduction of the found
5391 : : values. */
5392 : 9 : epilogue_cost += record_stmt_cost (cost_vec, 2,
5393 : : vec_to_scalar, stmt_info, 0,
5394 : : vect_epilogue);
5395 : : /* A broadcast of the max value. */
5396 : 9 : epilogue_cost += record_stmt_cost (cost_vec, 1,
5397 : : scalar_to_vec, stmt_info, 0,
5398 : : vect_epilogue);
5399 : : }
5400 : : else
5401 : : {
5402 : 41907 : epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
5403 : : stmt_info, 0, vect_epilogue);
5404 : 41907 : epilogue_cost += record_stmt_cost (cost_vec, 1,
5405 : : vec_to_scalar, stmt_info, 0,
5406 : : vect_epilogue);
5407 : : }
5408 : : }
5409 : 10241 : else if (reduction_type == COND_REDUCTION)
5410 : : {
5411 : 213 : unsigned estimated_nunits = vect_nunits_for_cost (vectype);
5412 : : /* Extraction of scalar elements. */
5413 : 426 : epilogue_cost += record_stmt_cost (cost_vec,
5414 : 213 : 2 * estimated_nunits,
5415 : : vec_to_scalar, stmt_info, 0,
5416 : : vect_epilogue);
5417 : : /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
5418 : 213 : epilogue_cost += record_stmt_cost (cost_vec,
5419 : 213 : 2 * estimated_nunits - 3,
5420 : : scalar_stmt, stmt_info, 0,
5421 : : vect_epilogue);
5422 : : }
5423 : 10028 : else if (reduction_type == EXTRACT_LAST_REDUCTION
5424 : 10028 : || reduction_type == FOLD_LEFT_REDUCTION)
5425 : : /* No extra instructions need in the epilogue. */
5426 : : ;
5427 : : else
5428 : : {
5429 : 7649 : int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5430 : 7649 : tree bitsize = TYPE_SIZE (op.type);
5431 : 7649 : int element_bitsize = tree_to_uhwi (bitsize);
5432 : 7649 : int nelements = vec_size_in_bits / element_bitsize;
5433 : :
5434 : 7649 : if (op.code == COND_EXPR)
5435 : 22 : op.code = MAX_EXPR;
5436 : :
5437 : : /* We have a whole vector shift available. */
5438 : 744 : if (VECTOR_MODE_P (mode)
5439 : 7649 : && directly_supported_p (op.code, vectype)
5440 : 12838 : && have_whole_vector_shift (mode))
5441 : : {
5442 : : /* Final reduction via vector shifts and the reduction operator.
5443 : : Also requires scalar extract. */
5444 : 15567 : epilogue_cost += record_stmt_cost (cost_vec,
5445 : 10378 : exact_log2 (nelements) * 2,
5446 : : vector_stmt, stmt_info, 0,
5447 : : vect_epilogue);
5448 : 5189 : epilogue_cost += record_stmt_cost (cost_vec, 1,
5449 : : vec_to_scalar, stmt_info, 0,
5450 : : vect_epilogue);
5451 : : }
5452 : : else
5453 : : /* Use extracts and reduction op for final reduction. For N
5454 : : elements, we have N extracts and N-1 reduction ops. */
5455 : 2460 : epilogue_cost += record_stmt_cost (cost_vec,
5456 : 2460 : nelements + nelements - 1,
5457 : : vector_stmt, stmt_info, 0,
5458 : : vect_epilogue);
5459 : : }
5460 : : }
5461 : :
5462 : 52251 : if (dump_enabled_p ())
5463 : 2953 : dump_printf (MSG_NOTE,
5464 : : "vect_model_reduction_cost: inside_cost = %d, "
5465 : : "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
5466 : : prologue_cost, epilogue_cost);
5467 : 52251 : }
5468 : :
5469 : : /* SEQ is a sequence of instructions that initialize the reduction
5470 : : described by REDUC_INFO. Emit them in the appropriate place. */
5471 : :
5472 : : static void
5473 : 443 : vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
5474 : : stmt_vec_info reduc_info, gimple *seq)
5475 : : {
5476 : 443 : if (reduc_info->reused_accumulator)
5477 : : {
5478 : : /* When reusing an accumulator from the main loop, we only need
5479 : : initialization instructions if the main loop can be skipped.
5480 : : In that case, emit the initialization instructions at the end
5481 : : of the guard block that does the skip. */
5482 : 39 : edge skip_edge = loop_vinfo->skip_main_loop_edge;
5483 : 39 : gcc_assert (skip_edge);
5484 : 39 : gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
5485 : 39 : gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5486 : : }
5487 : : else
5488 : : {
5489 : : /* The normal case: emit the initialization instructions on the
5490 : : preheader edge. */
5491 : 404 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5492 : 404 : gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
5493 : : }
5494 : 443 : }
5495 : :
5496 : : /* Function get_initial_def_for_reduction
5497 : :
5498 : : Input:
5499 : : REDUC_INFO - the info_for_reduction
5500 : : INIT_VAL - the initial value of the reduction variable
5501 : : NEUTRAL_OP - a value that has no effect on the reduction, as per
5502 : : neutral_op_for_reduction
5503 : :
5504 : : Output:
5505 : : Return a vector variable, initialized according to the operation that
5506 : : STMT_VINFO performs. This vector will be used as the initial value
5507 : : of the vector of partial results.
5508 : :
5509 : : The value we need is a vector in which element 0 has value INIT_VAL
5510 : : and every other element has value NEUTRAL_OP. */
5511 : :
5512 : : static tree
5513 : 15741 : get_initial_def_for_reduction (loop_vec_info loop_vinfo,
5514 : : stmt_vec_info reduc_info,
5515 : : tree init_val, tree neutral_op)
5516 : : {
5517 : 15741 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5518 : 15741 : tree scalar_type = TREE_TYPE (init_val);
5519 : 15741 : tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
5520 : 15741 : tree init_def;
5521 : 15741 : gimple_seq stmts = NULL;
5522 : :
5523 : 15741 : gcc_assert (vectype);
5524 : :
5525 : 15741 : gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
5526 : : || SCALAR_FLOAT_TYPE_P (scalar_type));
5527 : :
5528 : 15826 : gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
5529 : : || loop == (gimple_bb (reduc_info->stmt))->loop_father);
5530 : :
5531 : 15741 : if (operand_equal_p (init_val, neutral_op))
5532 : : {
5533 : : /* If both elements are equal then the vector described above is
5534 : : just a splat. */
5535 : 15720 : neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5536 : 15720 : init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
5537 : : }
5538 : : else
5539 : : {
5540 : 21 : neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5541 : 21 : init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
5542 : 21 : if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
5543 : : {
5544 : : /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
5545 : : element 0. */
5546 : : init_def = gimple_build_vector_from_val (&stmts, vectype,
5547 : : neutral_op);
5548 : : init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
5549 : : vectype, init_def, init_val);
5550 : : }
5551 : : else
5552 : : {
5553 : : /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */
5554 : 21 : tree_vector_builder elts (vectype, 1, 2);
5555 : 21 : elts.quick_push (init_val);
5556 : 21 : elts.quick_push (neutral_op);
5557 : 21 : init_def = gimple_build_vector (&stmts, &elts);
5558 : 21 : }
5559 : : }
5560 : :
5561 : 15741 : if (stmts)
5562 : 248 : vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
5563 : 15741 : return init_def;
5564 : : }
5565 : :
5566 : : /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
5567 : : which performs a reduction involving GROUP_SIZE scalar statements.
5568 : : NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
5569 : : is nonnull, introducing extra elements of that value will not change the
5570 : : result. */
5571 : :
5572 : : static void
5573 : 632 : get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
5574 : : stmt_vec_info reduc_info,
5575 : : vec<tree> *vec_oprnds,
5576 : : unsigned int number_of_vectors,
5577 : : unsigned int group_size, tree neutral_op)
5578 : : {
5579 : 632 : vec<tree> &initial_values = reduc_info->reduc_initial_values;
5580 : 632 : unsigned HOST_WIDE_INT nunits;
5581 : 632 : unsigned j, number_of_places_left_in_vector;
5582 : 632 : tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
5583 : 632 : unsigned int i;
5584 : :
5585 : 1264 : gcc_assert (group_size == initial_values.length () || neutral_op);
5586 : :
5587 : : /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5588 : : created vectors. It is greater than 1 if unrolling is performed.
5589 : :
5590 : : For example, we have two scalar operands, s1 and s2 (e.g., group of
5591 : : strided accesses of size two), while NUNITS is four (i.e., four scalars
5592 : : of this type can be packed in a vector). The output vector will contain
5593 : : two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
5594 : : will be 2).
5595 : :
5596 : : If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
5597 : : vectors containing the operands.
5598 : :
5599 : : For example, NUNITS is four as before, and the group size is 8
5600 : : (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
5601 : : {s5, s6, s7, s8}. */
5602 : :
5603 : 632 : if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5604 : : nunits = group_size;
5605 : :
5606 : 632 : number_of_places_left_in_vector = nunits;
5607 : 632 : bool constant_p = true;
5608 : 632 : tree_vector_builder elts (vector_type, nunits, 1);
5609 : 632 : elts.quick_grow (nunits);
5610 : 632 : gimple_seq ctor_seq = NULL;
5611 : 4796 : for (j = 0; j < nunits * number_of_vectors; ++j)
5612 : : {
5613 : 4164 : tree op;
5614 : 4164 : i = j % group_size;
5615 : :
5616 : : /* Get the def before the loop. In reduction chain we have only
5617 : : one initial value. Else we have as many as PHIs in the group. */
5618 : 4164 : if (i >= initial_values.length () || (j > i && neutral_op))
5619 : : op = neutral_op;
5620 : : else
5621 : 1350 : op = initial_values[i];
5622 : :
5623 : : /* Create 'vect_ = {op0,op1,...,opn}'. */
5624 : 4164 : number_of_places_left_in_vector--;
5625 : 4164 : elts[nunits - number_of_places_left_in_vector - 1] = op;
5626 : 4164 : if (!CONSTANT_CLASS_P (op))
5627 : 434 : constant_p = false;
5628 : :
5629 : 4164 : if (number_of_places_left_in_vector == 0)
5630 : : {
5631 : 1013 : tree init;
5632 : 2026 : if (constant_p && !neutral_op
5633 : 2026 : ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5634 : 1013 : : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5635 : : /* Build the vector directly from ELTS. */
5636 : 1013 : init = gimple_build_vector (&ctor_seq, &elts);
5637 : 0 : else if (neutral_op)
5638 : : {
5639 : : /* Build a vector of the neutral value and shift the
5640 : : other elements into place. */
5641 : 0 : init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5642 : : neutral_op);
5643 : 0 : int k = nunits;
5644 : 0 : while (k > 0 && elts[k - 1] == neutral_op)
5645 : : k -= 1;
5646 : 0 : while (k > 0)
5647 : : {
5648 : 0 : k -= 1;
5649 : 0 : init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5650 : 0 : vector_type, init, elts[k]);
5651 : : }
5652 : : }
5653 : : else
5654 : : {
5655 : : /* First time round, duplicate ELTS to fill the
5656 : : required number of vectors. */
5657 : 0 : duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5658 : : elts, number_of_vectors, *vec_oprnds);
5659 : 0 : break;
5660 : : }
5661 : 1013 : vec_oprnds->quick_push (init);
5662 : :
5663 : 1013 : number_of_places_left_in_vector = nunits;
5664 : 1013 : elts.new_vector (vector_type, nunits, 1);
5665 : 1013 : elts.quick_grow (nunits);
5666 : 1013 : constant_p = true;
5667 : : }
5668 : : }
5669 : 632 : if (ctor_seq != NULL)
5670 : 195 : vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
5671 : 632 : }
5672 : :
5673 : : /* For a statement STMT_INFO taking part in a reduction operation return
5674 : : the stmt_vec_info the meta information is stored on. */
5675 : :
5676 : : stmt_vec_info
5677 : 114544 : info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
5678 : : {
5679 : 114544 : stmt_info = vect_orig_stmt (stmt_info);
5680 : 114544 : gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
5681 : 114544 : if (!is_a <gphi *> (stmt_info->stmt)
5682 : 114544 : || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
5683 : 41212 : stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5684 : 114544 : gphi *phi = as_a <gphi *> (stmt_info->stmt);
5685 : 114544 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5686 : : {
5687 : 371 : if (gimple_phi_num_args (phi) == 1)
5688 : 110 : stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5689 : : }
5690 : 114173 : else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5691 : : {
5692 : 1887 : stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
5693 : 1887 : if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
5694 : 114544 : stmt_info = info;
5695 : : }
5696 : 114544 : return stmt_info;
5697 : : }
5698 : :
5699 : : /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5700 : : REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
5701 : : return false. */
5702 : :
5703 : : static bool
5704 : 16328 : vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5705 : : stmt_vec_info reduc_info)
5706 : : {
5707 : 16328 : loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5708 : 16328 : if (!main_loop_vinfo)
5709 : : return false;
5710 : :
5711 : 4445 : if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5712 : : return false;
5713 : :
5714 : 4445 : unsigned int num_phis = reduc_info->reduc_initial_values.length ();
5715 : 4445 : auto_vec<tree, 16> main_loop_results (num_phis);
5716 : 4445 : auto_vec<tree, 16> initial_values (num_phis);
5717 : 4445 : if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5718 : : {
5719 : : /* The epilogue loop can be entered either from the main loop or
5720 : : from an earlier guard block. */
5721 : 4271 : edge skip_edge = loop_vinfo->skip_main_loop_edge;
5722 : 17220 : for (tree incoming_value : reduc_info->reduc_initial_values)
5723 : : {
5724 : : /* Look for:
5725 : :
5726 : : INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5727 : : INITIAL_VALUE(guard block)>. */
5728 : 4407 : gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5729 : :
5730 : 4407 : gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5731 : 4407 : gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5732 : :
5733 : 4407 : tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5734 : 4407 : tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5735 : :
5736 : 4407 : main_loop_results.quick_push (from_main_loop);
5737 : 4407 : initial_values.quick_push (from_skip);
5738 : : }
5739 : : }
5740 : : else
5741 : : /* The main loop dominates the epilogue loop. */
5742 : 174 : main_loop_results.splice (reduc_info->reduc_initial_values);
5743 : :
5744 : : /* See if the main loop has the kind of accumulator we need. */
5745 : 4445 : vect_reusable_accumulator *accumulator
5746 : 4445 : = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5747 : 4445 : if (!accumulator
5748 : 8876 : || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
5749 : 13321 : || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5750 : : accumulator->reduc_info->reduc_scalar_results.begin ()))
5751 : : return false;
5752 : :
5753 : : /* Handle the case where we can reduce wider vectors to narrower ones. */
5754 : 4434 : tree vectype = STMT_VINFO_VECTYPE (reduc_info);
5755 : 4434 : tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5756 : 4434 : unsigned HOST_WIDE_INT m;
5757 : 4434 : if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5758 : 4434 : TYPE_VECTOR_SUBPARTS (vectype), &m))
5759 : 0 : return false;
5760 : : /* Check the intermediate vector types and operations are available. */
5761 : 4434 : tree prev_vectype = old_vectype;
5762 : 4434 : poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5763 : 12896 : while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5764 : : {
5765 : 4444 : intermediate_nunits = exact_div (intermediate_nunits, 2);
5766 : 4444 : tree intermediate_vectype = get_related_vectype_for_scalar_type
5767 : 4444 : (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5768 : 4444 : if (!intermediate_vectype
5769 : 4444 : || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
5770 : : intermediate_vectype)
5771 : 8472 : || !can_vec_extract (TYPE_MODE (prev_vectype),
5772 : 4028 : TYPE_MODE (intermediate_vectype)))
5773 : : return false;
5774 : : prev_vectype = intermediate_vectype;
5775 : : }
5776 : :
5777 : : /* Non-SLP reductions might apply an adjustment after the reduction
5778 : : operation, in order to simplify the initialization of the accumulator.
5779 : : If the epilogue loop carries on from where the main loop left off,
5780 : : it should apply the same adjustment to the final reduction result.
5781 : :
5782 : : If the epilogue loop can also be entered directly (rather than via
5783 : : the main loop), we need to be able to handle that case in the same way,
5784 : : with the same adjustment. (In principle we could add a PHI node
5785 : : to select the correct adjustment, but in practice that shouldn't be
5786 : : necessary.) */
5787 : 4018 : tree main_adjustment
5788 : 4018 : = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5789 : 4018 : if (loop_vinfo->main_loop_edge && main_adjustment)
5790 : : {
5791 : 3489 : gcc_assert (num_phis == 1);
5792 : 3489 : tree initial_value = initial_values[0];
5793 : : /* Check that we can use INITIAL_VALUE as the adjustment and
5794 : : initialize the accumulator with a neutral value instead. */
5795 : 3489 : if (!operand_equal_p (initial_value, main_adjustment))
5796 : 0 : return false;
5797 : 3489 : code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5798 : 3489 : initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5799 : : code, initial_value);
5800 : : }
5801 : 4018 : STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5802 : 4018 : reduc_info->reduc_initial_values.truncate (0);
5803 : 4018 : reduc_info->reduc_initial_values.splice (initial_values);
5804 : 4018 : reduc_info->reused_accumulator = accumulator;
5805 : 4018 : return true;
5806 : 4445 : }
5807 : :
5808 : : /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5809 : : CODE emitting stmts before GSI. Returns a vector def of VECTYPE. */
5810 : :
5811 : : static tree
5812 : 5916 : vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5813 : : gimple_seq *seq)
5814 : : {
5815 : 5916 : unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5816 : 5916 : unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5817 : 5916 : tree stype = TREE_TYPE (vectype);
5818 : 5916 : tree new_temp = vec_def;
5819 : 10167 : while (nunits > nunits1)
5820 : : {
5821 : 4251 : nunits /= 2;
5822 : 4251 : tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5823 : : stype, nunits);
5824 : 4251 : unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5825 : :
5826 : : /* The target has to make sure we support lowpart/highpart
5827 : : extraction, either via direct vector extract or through
5828 : : an integer mode punning. */
5829 : 4251 : tree dst1, dst2;
5830 : 4251 : gimple *epilog_stmt;
5831 : 4251 : if (convert_optab_handler (vec_extract_optab,
5832 : 4251 : TYPE_MODE (TREE_TYPE (new_temp)),
5833 : 4251 : TYPE_MODE (vectype1))
5834 : : != CODE_FOR_nothing)
5835 : : {
5836 : : /* Extract sub-vectors directly once vec_extract becomes
5837 : : a conversion optab. */
5838 : 2898 : dst1 = make_ssa_name (vectype1);
5839 : 2898 : epilog_stmt
5840 : 5796 : = gimple_build_assign (dst1, BIT_FIELD_REF,
5841 : : build3 (BIT_FIELD_REF, vectype1,
5842 : 2898 : new_temp, TYPE_SIZE (vectype1),
5843 : 2898 : bitsize_int (0)));
5844 : 2898 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5845 : 2898 : dst2 = make_ssa_name (vectype1);
5846 : 2898 : epilog_stmt
5847 : 5796 : = gimple_build_assign (dst2, BIT_FIELD_REF,
5848 : : build3 (BIT_FIELD_REF, vectype1,
5849 : 2898 : new_temp, TYPE_SIZE (vectype1),
5850 : : bitsize_int (bitsize)));
5851 : 2898 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5852 : : }
5853 : : else
5854 : : {
5855 : : /* Extract via punning to appropriately sized integer mode
5856 : : vector. */
5857 : 1353 : tree eltype = build_nonstandard_integer_type (bitsize, 1);
5858 : 1353 : tree etype = build_vector_type (eltype, 2);
5859 : 2706 : gcc_assert (convert_optab_handler (vec_extract_optab,
5860 : : TYPE_MODE (etype),
5861 : : TYPE_MODE (eltype))
5862 : : != CODE_FOR_nothing);
5863 : 1353 : tree tem = make_ssa_name (etype);
5864 : 1353 : epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5865 : : build1 (VIEW_CONVERT_EXPR,
5866 : : etype, new_temp));
5867 : 1353 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5868 : 1353 : new_temp = tem;
5869 : 1353 : tem = make_ssa_name (eltype);
5870 : 1353 : epilog_stmt
5871 : 2706 : = gimple_build_assign (tem, BIT_FIELD_REF,
5872 : : build3 (BIT_FIELD_REF, eltype,
5873 : 1353 : new_temp, TYPE_SIZE (eltype),
5874 : 1353 : bitsize_int (0)));
5875 : 1353 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5876 : 1353 : dst1 = make_ssa_name (vectype1);
5877 : 1353 : epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5878 : : build1 (VIEW_CONVERT_EXPR,
5879 : : vectype1, tem));
5880 : 1353 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5881 : 1353 : tem = make_ssa_name (eltype);
5882 : 1353 : epilog_stmt
5883 : 2706 : = gimple_build_assign (tem, BIT_FIELD_REF,
5884 : : build3 (BIT_FIELD_REF, eltype,
5885 : 1353 : new_temp, TYPE_SIZE (eltype),
5886 : : bitsize_int (bitsize)));
5887 : 1353 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5888 : 1353 : dst2 = make_ssa_name (vectype1);
5889 : 1353 : epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5890 : : build1 (VIEW_CONVERT_EXPR,
5891 : : vectype1, tem));
5892 : 1353 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5893 : : }
5894 : :
5895 : 4251 : new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5896 : : }
5897 : :
5898 : 5916 : return new_temp;
5899 : : }
5900 : :
5901 : : /* Function vect_create_epilog_for_reduction
5902 : :
5903 : : Create code at the loop-epilog to finalize the result of a reduction
5904 : : computation.
5905 : :
5906 : : STMT_INFO is the scalar reduction stmt that is being vectorized.
5907 : : SLP_NODE is an SLP node containing a group of reduction statements. The
5908 : : first one in this group is STMT_INFO.
5909 : : SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5910 : : REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5911 : : (counting from 0)
5912 : : LOOP_EXIT is the edge to update in the merge block. In the case of a single
5913 : : exit this edge is always the main loop exit.
5914 : :
5915 : : This function:
5916 : : 1. Completes the reduction def-use cycles.
5917 : : 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5918 : : by calling the function specified by REDUC_FN if available, or by
5919 : : other means (whole-vector shifts or a scalar loop).
5920 : : The function also creates a new phi node at the loop exit to preserve
5921 : : loop-closed form, as illustrated below.
5922 : :
5923 : : The flow at the entry to this function:
5924 : :
5925 : : loop:
5926 : : vec_def = phi <vec_init, null> # REDUCTION_PHI
5927 : : VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5928 : : s_loop = scalar_stmt # (scalar) STMT_INFO
5929 : : loop_exit:
5930 : : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5931 : : use <s_out0>
5932 : : use <s_out0>
5933 : :
5934 : : The above is transformed by this function into:
5935 : :
5936 : : loop:
5937 : : vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5938 : : VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5939 : : s_loop = scalar_stmt # (scalar) STMT_INFO
5940 : : loop_exit:
5941 : : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5942 : : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5943 : : v_out2 = reduce <v_out1>
5944 : : s_out3 = extract_field <v_out2, 0>
5945 : : s_out4 = adjust_result <s_out3>
5946 : : use <s_out4>
5947 : : use <s_out4>
5948 : : */
5949 : :
5950 : : static void
5951 : 16669 : vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5952 : : stmt_vec_info stmt_info,
5953 : : slp_tree slp_node,
5954 : : slp_instance slp_node_instance,
5955 : : edge loop_exit)
5956 : : {
5957 : 16669 : stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5958 : 16669 : gcc_assert (reduc_info->is_reduc_info);
5959 : : /* For double reductions we need to get at the inner loop reduction
5960 : : stmt which has the meta info attached. Our stmt_info is that of the
5961 : : loop-closed PHI of the inner loop which we remember as
5962 : : def for the reduction PHI generation. */
5963 : 16669 : bool double_reduc = false;
5964 : 16669 : stmt_vec_info rdef_info = stmt_info;
5965 : 16669 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5966 : : {
5967 : 55 : gcc_assert (!slp_node);
5968 : 55 : double_reduc = true;
5969 : 55 : stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5970 : 55 : (stmt_info->stmt, 0));
5971 : 55 : stmt_info = vect_stmt_to_vectorize (stmt_info);
5972 : : }
5973 : 16669 : code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5974 : 16669 : internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5975 : 16669 : tree vectype;
5976 : 16669 : machine_mode mode;
5977 : 16669 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5978 : 16669 : basic_block exit_bb;
5979 : 16669 : tree scalar_dest;
5980 : 16669 : tree scalar_type;
5981 : 16669 : gimple *new_phi = NULL, *phi = NULL;
5982 : 16669 : gimple_stmt_iterator exit_gsi;
5983 : 16669 : tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5984 : 16669 : gimple *epilog_stmt = NULL;
5985 : 16669 : gimple *exit_phi;
5986 : 16669 : tree bitsize;
5987 : 16669 : tree def;
5988 : 16669 : tree orig_name, scalar_result;
5989 : 16669 : imm_use_iterator imm_iter, phi_imm_iter;
5990 : 16669 : use_operand_p use_p, phi_use_p;
5991 : 16669 : gimple *use_stmt;
5992 : 16669 : auto_vec<tree> reduc_inputs;
5993 : 16669 : int j, i;
5994 : 16669 : vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5995 : 16669 : unsigned int group_size = 1, k;
5996 : : /* SLP reduction without reduction chain, e.g.,
5997 : : # a1 = phi <a2, a0>
5998 : : # b1 = phi <b2, b0>
5999 : : a2 = operation (a1)
6000 : : b2 = operation (b1) */
6001 : 16669 : bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
6002 : 659 : bool direct_slp_reduc;
6003 : 17328 : tree induction_index = NULL_TREE;
6004 : :
6005 : 659 : if (slp_node)
6006 : 659 : group_size = SLP_TREE_LANES (slp_node);
6007 : :
6008 : 16669 : if (nested_in_vect_loop_p (loop, stmt_info))
6009 : : {
6010 : 55 : outer_loop = loop;
6011 : 55 : loop = loop->inner;
6012 : 55 : gcc_assert (!slp_node && double_reduc);
6013 : : }
6014 : :
6015 : 16669 : vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
6016 : 16669 : gcc_assert (vectype);
6017 : 16669 : mode = TYPE_MODE (vectype);
6018 : :
6019 : 16669 : tree induc_val = NULL_TREE;
6020 : 16669 : tree adjustment_def = NULL;
6021 : 16669 : if (slp_node)
6022 : : ;
6023 : : else
6024 : : {
6025 : : /* Optimize: for induction condition reduction, if we can't use zero
6026 : : for induc_val, use initial_def. */
6027 : 16010 : if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6028 : 66 : induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
6029 : 15944 : else if (double_reduc)
6030 : : ;
6031 : : else
6032 : 15889 : adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
6033 : : }
6034 : :
6035 : 16669 : stmt_vec_info single_live_out_stmt[] = { stmt_info };
6036 : 16669 : array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
6037 : 16669 : if (slp_reduc)
6038 : : /* All statements produce live-out values. */
6039 : 1026 : live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
6040 : :
6041 : 16669 : unsigned vec_num;
6042 : 16669 : int ncopies;
6043 : 16669 : if (slp_node)
6044 : : {
6045 : 659 : vec_num = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length ();
6046 : : ncopies = 1;
6047 : : }
6048 : : else
6049 : : {
6050 : 16010 : vec_num = 1;
6051 : 32020 : ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
6052 : : }
6053 : :
6054 : : /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
6055 : : which is updated with the current index of the loop for every match of
6056 : : the original loop's cond_expr (VEC_STMT). This results in a vector
6057 : : containing the last time the condition passed for that vector lane.
6058 : : The first match will be a 1 to allow 0 to be used for non-matching
6059 : : indexes. If there are no matches at all then the vector will be all
6060 : : zeroes.
6061 : :
6062 : : PR92772: This algorithm is broken for architectures that support
6063 : : masked vectors, but do not provide fold_extract_last. */
6064 : 16669 : if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
6065 : : {
6066 : 63 : auto_vec<std::pair<tree, bool>, 2> ccompares;
6067 : 63 : stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
6068 : 63 : cond_info = vect_stmt_to_vectorize (cond_info);
6069 : 135 : while (cond_info != reduc_info)
6070 : : {
6071 : 72 : if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
6072 : : {
6073 : 72 : gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
6074 : 72 : gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
6075 : 72 : ccompares.safe_push
6076 : 72 : (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
6077 : 72 : STMT_VINFO_REDUC_IDX (cond_info) == 2));
6078 : : }
6079 : 72 : cond_info
6080 : 72 : = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
6081 : 72 : 1 + STMT_VINFO_REDUC_IDX
6082 : : (cond_info)));
6083 : 207 : cond_info = vect_stmt_to_vectorize (cond_info);
6084 : : }
6085 : 63 : gcc_assert (ccompares.length () != 0);
6086 : :
6087 : 63 : tree indx_before_incr, indx_after_incr;
6088 : 63 : poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
6089 : 63 : int scalar_precision
6090 : 63 : = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
6091 : 63 : tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
6092 : 63 : tree cr_index_vector_type = get_related_vectype_for_scalar_type
6093 : 63 : (TYPE_MODE (vectype), cr_index_scalar_type,
6094 : : TYPE_VECTOR_SUBPARTS (vectype));
6095 : :
6096 : : /* First we create a simple vector induction variable which starts
6097 : : with the values {1,2,3,...} (SERIES_VECT) and increments by the
6098 : : vector size (STEP). */
6099 : :
6100 : : /* Create a {1,2,3,...} vector. */
6101 : 63 : tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
6102 : :
6103 : : /* Create a vector of the step value. */
6104 : 63 : tree step = build_int_cst (cr_index_scalar_type, nunits_out);
6105 : 63 : tree vec_step = build_vector_from_val (cr_index_vector_type, step);
6106 : :
6107 : : /* Create an induction variable. */
6108 : 63 : gimple_stmt_iterator incr_gsi;
6109 : 63 : bool insert_after;
6110 : 63 : vect_iv_increment_position (loop_exit, &incr_gsi, &insert_after);
6111 : 63 : create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
6112 : : insert_after, &indx_before_incr, &indx_after_incr);
6113 : :
6114 : : /* Next create a new phi node vector (NEW_PHI_TREE) which starts
6115 : : filled with zeros (VEC_ZERO). */
6116 : :
6117 : : /* Create a vector of 0s. */
6118 : 63 : tree zero = build_zero_cst (cr_index_scalar_type);
6119 : 63 : tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
6120 : :
6121 : : /* Create a vector phi node. */
6122 : 63 : tree new_phi_tree = make_ssa_name (cr_index_vector_type);
6123 : 63 : new_phi = create_phi_node (new_phi_tree, loop->header);
6124 : 63 : add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
6125 : : loop_preheader_edge (loop), UNKNOWN_LOCATION);
6126 : :
6127 : : /* Now take the condition from the loops original cond_exprs
6128 : : and produce a new cond_exprs (INDEX_COND_EXPR) which for
6129 : : every match uses values from the induction variable
6130 : : (INDEX_BEFORE_INCR) otherwise uses values from the phi node
6131 : : (NEW_PHI_TREE).
6132 : : Finally, we update the phi (NEW_PHI_TREE) to take the value of
6133 : : the new cond_expr (INDEX_COND_EXPR). */
6134 : 63 : gimple_seq stmts = NULL;
6135 : 198 : for (int i = ccompares.length () - 1; i != -1; --i)
6136 : : {
6137 : 72 : tree ccompare = ccompares[i].first;
6138 : 72 : if (ccompares[i].second)
6139 : 65 : new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6140 : : cr_index_vector_type,
6141 : : ccompare,
6142 : : indx_before_incr, new_phi_tree);
6143 : : else
6144 : 7 : new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6145 : : cr_index_vector_type,
6146 : : ccompare,
6147 : : new_phi_tree, indx_before_incr);
6148 : : }
6149 : 63 : gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
6150 : :
6151 : : /* Update the phi with the vec cond. */
6152 : 63 : induction_index = new_phi_tree;
6153 : 63 : add_phi_arg (as_a <gphi *> (new_phi), induction_index,
6154 : : loop_latch_edge (loop), UNKNOWN_LOCATION);
6155 : 63 : }
6156 : :
6157 : : /* 2. Create epilog code.
6158 : : The reduction epilog code operates across the elements of the vector
6159 : : of partial results computed by the vectorized loop.
6160 : : The reduction epilog code consists of:
6161 : :
6162 : : step 1: compute the scalar result in a vector (v_out2)
6163 : : step 2: extract the scalar result (s_out3) from the vector (v_out2)
6164 : : step 3: adjust the scalar result (s_out3) if needed.
6165 : :
6166 : : Step 1 can be accomplished using one the following three schemes:
6167 : : (scheme 1) using reduc_fn, if available.
6168 : : (scheme 2) using whole-vector shifts, if available.
6169 : : (scheme 3) using a scalar loop. In this case steps 1+2 above are
6170 : : combined.
6171 : :
6172 : : The overall epilog code looks like this:
6173 : :
6174 : : s_out0 = phi <s_loop> # original EXIT_PHI
6175 : : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6176 : : v_out2 = reduce <v_out1> # step 1
6177 : : s_out3 = extract_field <v_out2, 0> # step 2
6178 : : s_out4 = adjust_result <s_out3> # step 3
6179 : :
6180 : : (step 3 is optional, and steps 1 and 2 may be combined).
6181 : : Lastly, the uses of s_out0 are replaced by s_out4. */
6182 : :
6183 : :
6184 : : /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
6185 : : v_out1 = phi <VECT_DEF>
6186 : : Store them in NEW_PHIS. */
6187 : 16669 : if (double_reduc)
6188 : 55 : loop = outer_loop;
6189 : : /* We need to reduce values in all exits. */
6190 : 16669 : exit_bb = loop_exit->dest;
6191 : 16669 : exit_gsi = gsi_after_labels (exit_bb);
6192 : 16669 : reduc_inputs.create (slp_node ? vec_num : ncopies);
6193 : 33721 : for (unsigned i = 0; i < vec_num; i++)
6194 : : {
6195 : 17052 : gimple_seq stmts = NULL;
6196 : 17052 : if (slp_node)
6197 : 1042 : def = vect_get_slp_vect_def (slp_node, i);
6198 : : else
6199 : 16010 : def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
6200 : 34159 : for (j = 0; j < ncopies; j++)
6201 : : {
6202 : 17107 : tree new_def = copy_ssa_name (def);
6203 : 17107 : phi = create_phi_node (new_def, exit_bb);
6204 : 17107 : if (j)
6205 : 55 : def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
6206 : 17107 : if (LOOP_VINFO_IV_EXIT (loop_vinfo) == loop_exit)
6207 : 17091 : SET_PHI_ARG_DEF (phi, loop_exit->dest_idx, def);
6208 : : else
6209 : : {
6210 : 34 : for (unsigned k = 0; k < gimple_phi_num_args (phi); k++)
6211 : 18 : SET_PHI_ARG_DEF (phi, k, def);
6212 : : }
6213 : 17107 : new_def = gimple_convert (&stmts, vectype, new_def);
6214 : 17107 : reduc_inputs.quick_push (new_def);
6215 : : }
6216 : 17052 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6217 : : }
6218 : :
6219 : : /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
6220 : : (i.e. when reduc_fn is not available) and in the final adjustment
6221 : : code (if needed). Also get the original scalar reduction variable as
6222 : : defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
6223 : : represents a reduction pattern), the tree-code and scalar-def are
6224 : : taken from the original stmt that the pattern-stmt (STMT) replaces.
6225 : : Otherwise (it is a regular reduction) - the tree-code and scalar-def
6226 : : are taken from STMT. */
6227 : :
6228 : 16669 : stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6229 : 16669 : if (orig_stmt_info != stmt_info)
6230 : : {
6231 : : /* Reduction pattern */
6232 : 499 : gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6233 : 499 : gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
6234 : : }
6235 : :
6236 : 16669 : scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
6237 : 16669 : scalar_type = TREE_TYPE (scalar_dest);
6238 : 16669 : scalar_results.truncate (0);
6239 : 16669 : scalar_results.reserve_exact (group_size);
6240 : 16669 : new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
6241 : 16669 : bitsize = TYPE_SIZE (scalar_type);
6242 : :
6243 : : /* True if we should implement SLP_REDUC using native reduction operations
6244 : : instead of scalar operations. */
6245 : 33338 : direct_slp_reduc = (reduc_fn != IFN_LAST
6246 : 16669 : && slp_reduc
6247 : 16669 : && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
6248 : :
6249 : : /* In case of reduction chain, e.g.,
6250 : : # a1 = phi <a3, a0>
6251 : : a2 = operation (a1)
6252 : : a3 = operation (a2),
6253 : :
6254 : : we may end up with more than one vector result. Here we reduce them
6255 : : to one vector.
6256 : :
6257 : : The same is true for a SLP reduction, e.g.,
6258 : : # a1 = phi <a2, a0>
6259 : : # b1 = phi <b2, b0>
6260 : : a2 = operation (a1)
6261 : : b2 = operation (a2),
6262 : :
6263 : : where we can end up with more than one vector as well. We can
6264 : : easily accumulate vectors when the number of vector elements is
6265 : : a multiple of the SLP group size.
6266 : :
6267 : : The same is true if we couldn't use a single defuse cycle. */
6268 : 16669 : if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6269 : : || direct_slp_reduc
6270 : 16523 : || (slp_reduc
6271 : 16513 : && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size))
6272 : 32712 : || ncopies > 1)
6273 : : {
6274 : 669 : gimple_seq stmts = NULL;
6275 : 669 : tree single_input = reduc_inputs[0];
6276 : 1032 : for (k = 1; k < reduc_inputs.length (); k++)
6277 : 726 : single_input = gimple_build (&stmts, code, vectype,
6278 : 363 : single_input, reduc_inputs[k]);
6279 : 669 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6280 : :
6281 : 669 : reduc_inputs.truncate (0);
6282 : 669 : reduc_inputs.safe_push (single_input);
6283 : : }
6284 : :
6285 : 16669 : tree orig_reduc_input = reduc_inputs[0];
6286 : :
6287 : : /* If this loop is an epilogue loop that can be skipped after the
6288 : : main loop, we can only share a reduction operation between the
6289 : : main loop and the epilogue if we put it at the target of the
6290 : : skip edge.
6291 : :
6292 : : We can still reuse accumulators if this check fails. Doing so has
6293 : : the minor(?) benefit of making the epilogue loop's scalar result
6294 : : independent of the main loop's scalar result. */
6295 : 16669 : bool unify_with_main_loop_p = false;
6296 : 16669 : if (reduc_info->reused_accumulator
6297 : 4018 : && loop_vinfo->skip_this_loop_edge
6298 : 3831 : && single_succ_p (exit_bb)
6299 : 16726 : && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
6300 : : {
6301 : 57 : unify_with_main_loop_p = true;
6302 : :
6303 : 57 : basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
6304 : 57 : reduc_inputs[0] = make_ssa_name (vectype);
6305 : 57 : gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
6306 : 57 : add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
6307 : : UNKNOWN_LOCATION);
6308 : 57 : add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
6309 : : loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
6310 : 57 : exit_gsi = gsi_after_labels (reduc_block);
6311 : : }
6312 : :
6313 : : /* Shouldn't be used beyond this point. */
6314 : 16669 : exit_bb = nullptr;
6315 : :
6316 : 16669 : if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6317 : 63 : && reduc_fn != IFN_LAST)
6318 : : {
6319 : : /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
6320 : : various data values where the condition matched and another vector
6321 : : (INDUCTION_INDEX) containing all the indexes of those matches. We
6322 : : need to extract the last matching index (which will be the index with
6323 : : highest value) and use this to index into the data vector.
6324 : : For the case where there were no matches, the data vector will contain
6325 : : all default values and the index vector will be all zeros. */
6326 : :
6327 : : /* Get various versions of the type of the vector of indexes. */
6328 : 4 : tree index_vec_type = TREE_TYPE (induction_index);
6329 : 4 : gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
6330 : 4 : tree index_scalar_type = TREE_TYPE (index_vec_type);
6331 : 4 : tree index_vec_cmp_type = truth_type_for (index_vec_type);
6332 : :
6333 : : /* Get an unsigned integer version of the type of the data vector. */
6334 : 4 : int scalar_precision
6335 : 4 : = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6336 : 4 : tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
6337 : 4 : tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
6338 : : vectype);
6339 : :
6340 : : /* First we need to create a vector (ZERO_VEC) of zeros and another
6341 : : vector (MAX_INDEX_VEC) filled with the last matching index, which we
6342 : : can create using a MAX reduction and then expanding.
6343 : : In the case where the loop never made any matches, the max index will
6344 : : be zero. */
6345 : :
6346 : : /* Vector of {0, 0, 0,...}. */
6347 : 4 : tree zero_vec = build_zero_cst (vectype);
6348 : :
6349 : : /* Find maximum value from the vector of found indexes. */
6350 : 4 : tree max_index = make_ssa_name (index_scalar_type);
6351 : 4 : gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6352 : : 1, induction_index);
6353 : 4 : gimple_call_set_lhs (max_index_stmt, max_index);
6354 : 4 : gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
6355 : :
6356 : : /* Vector of {max_index, max_index, max_index,...}. */
6357 : 4 : tree max_index_vec = make_ssa_name (index_vec_type);
6358 : 4 : tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
6359 : : max_index);
6360 : 4 : gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
6361 : : max_index_vec_rhs);
6362 : 4 : gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
6363 : :
6364 : : /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
6365 : : with the vector (INDUCTION_INDEX) of found indexes, choosing values
6366 : : from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
6367 : : otherwise. Only one value should match, resulting in a vector
6368 : : (VEC_COND) with one data value and the rest zeros.
6369 : : In the case where the loop never made any matches, every index will
6370 : : match, resulting in a vector with all data values (which will all be
6371 : : the default value). */
6372 : :
6373 : : /* Compare the max index vector to the vector of found indexes to find
6374 : : the position of the max value. */
6375 : 4 : tree vec_compare = make_ssa_name (index_vec_cmp_type);
6376 : 4 : gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
6377 : : induction_index,
6378 : : max_index_vec);
6379 : 4 : gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
6380 : :
6381 : : /* Use the compare to choose either values from the data vector or
6382 : : zero. */
6383 : 4 : tree vec_cond = make_ssa_name (vectype);
6384 : 4 : gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
6385 : : vec_compare,
6386 : 4 : reduc_inputs[0],
6387 : : zero_vec);
6388 : 4 : gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
6389 : :
6390 : : /* Finally we need to extract the data value from the vector (VEC_COND)
6391 : : into a scalar (MATCHED_DATA_REDUC). Logically we want to do a OR
6392 : : reduction, but because this doesn't exist, we can use a MAX reduction
6393 : : instead. The data value might be signed or a float so we need to cast
6394 : : it first.
6395 : : In the case where the loop never made any matches, the data values are
6396 : : all identical, and so will reduce down correctly. */
6397 : :
6398 : : /* Make the matched data values unsigned. */
6399 : 4 : tree vec_cond_cast = make_ssa_name (vectype_unsigned);
6400 : 4 : tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
6401 : : vec_cond);
6402 : 4 : gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
6403 : : VIEW_CONVERT_EXPR,
6404 : : vec_cond_cast_rhs);
6405 : 4 : gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
6406 : :
6407 : : /* Reduce down to a scalar value. */
6408 : 4 : tree data_reduc = make_ssa_name (scalar_type_unsigned);
6409 : 4 : gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6410 : : 1, vec_cond_cast);
6411 : 4 : gimple_call_set_lhs (data_reduc_stmt, data_reduc);
6412 : 4 : gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
6413 : :
6414 : : /* Convert the reduced value back to the result type and set as the
6415 : : result. */
6416 : 4 : gimple_seq stmts = NULL;
6417 : 4 : new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
6418 : : data_reduc);
6419 : 4 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6420 : 4 : scalar_results.safe_push (new_temp);
6421 : 4 : }
6422 : 16665 : else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6423 : 59 : && reduc_fn == IFN_LAST)
6424 : : {
6425 : : /* Condition reduction without supported IFN_REDUC_MAX. Generate
6426 : : idx = 0;
6427 : : idx_val = induction_index[0];
6428 : : val = data_reduc[0];
6429 : : for (idx = 0, val = init, i = 0; i < nelts; ++i)
6430 : : if (induction_index[i] > idx_val)
6431 : : val = data_reduc[i], idx_val = induction_index[i];
6432 : : return val; */
6433 : :
6434 : 59 : tree data_eltype = TREE_TYPE (vectype);
6435 : 59 : tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
6436 : 59 : unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
6437 : 59 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
6438 : : /* Enforced by vectorizable_reduction, which ensures we have target
6439 : : support before allowing a conditional reduction on variable-length
6440 : : vectors. */
6441 : 59 : unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
6442 : 59 : tree idx_val = NULL_TREE, val = NULL_TREE;
6443 : 399 : for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
6444 : : {
6445 : 340 : tree old_idx_val = idx_val;
6446 : 340 : tree old_val = val;
6447 : 340 : idx_val = make_ssa_name (idx_eltype);
6448 : 340 : epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
6449 : : build3 (BIT_FIELD_REF, idx_eltype,
6450 : : induction_index,
6451 : : bitsize_int (el_size),
6452 : : bitsize_int (off)));
6453 : 340 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6454 : 340 : val = make_ssa_name (data_eltype);
6455 : 680 : epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
6456 : : build3 (BIT_FIELD_REF,
6457 : : data_eltype,
6458 : 340 : reduc_inputs[0],
6459 : : bitsize_int (el_size),
6460 : : bitsize_int (off)));
6461 : 340 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6462 : 340 : if (off != 0)
6463 : : {
6464 : 281 : tree new_idx_val = idx_val;
6465 : 281 : if (off != v_size - el_size)
6466 : : {
6467 : 222 : new_idx_val = make_ssa_name (idx_eltype);
6468 : 222 : epilog_stmt = gimple_build_assign (new_idx_val,
6469 : : MAX_EXPR, idx_val,
6470 : : old_idx_val);
6471 : 222 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6472 : : }
6473 : 281 : tree cond = make_ssa_name (boolean_type_node);
6474 : 281 : epilog_stmt = gimple_build_assign (cond, GT_EXPR,
6475 : : idx_val, old_idx_val);
6476 : 281 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6477 : 281 : tree new_val = make_ssa_name (data_eltype);
6478 : 281 : epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
6479 : : cond, val, old_val);
6480 : 281 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6481 : 281 : idx_val = new_idx_val;
6482 : 281 : val = new_val;
6483 : : }
6484 : : }
6485 : : /* Convert the reduced value back to the result type and set as the
6486 : : result. */
6487 : 59 : gimple_seq stmts = NULL;
6488 : 59 : val = gimple_convert (&stmts, scalar_type, val);
6489 : 59 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6490 : 59 : scalar_results.safe_push (val);
6491 : 59 : }
6492 : :
6493 : : /* 2.3 Create the reduction code, using one of the three schemes described
6494 : : above. In SLP we simply need to extract all the elements from the
6495 : : vector (without reducing them), so we use scalar shifts. */
6496 : 16606 : else if (reduc_fn != IFN_LAST && !slp_reduc)
6497 : : {
6498 : 14708 : tree tmp;
6499 : 14708 : tree vec_elem_type;
6500 : :
6501 : : /* Case 1: Create:
6502 : : v_out2 = reduc_expr <v_out1> */
6503 : :
6504 : 14708 : if (dump_enabled_p ())
6505 : 1092 : dump_printf_loc (MSG_NOTE, vect_location,
6506 : : "Reduce using direct vector reduction.\n");
6507 : :
6508 : 14708 : gimple_seq stmts = NULL;
6509 : 14708 : vec_elem_type = TREE_TYPE (vectype);
6510 : 14708 : new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
6511 : 14708 : vec_elem_type, reduc_inputs[0]);
6512 : 14708 : new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6513 : 14708 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6514 : :
6515 : 14708 : if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6516 : 66 : && induc_val)
6517 : : {
6518 : : /* Earlier we set the initial value to be a vector if induc_val
6519 : : values. Check the result and if it is induc_val then replace
6520 : : with the original initial value, unless induc_val is
6521 : : the same as initial_def already. */
6522 : 63 : tree zcompare = make_ssa_name (boolean_type_node);
6523 : 63 : epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6524 : : new_temp, induc_val);
6525 : 63 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6526 : 63 : tree initial_def = reduc_info->reduc_initial_values[0];
6527 : 63 : tmp = make_ssa_name (new_scalar_dest);
6528 : 63 : epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6529 : : initial_def, new_temp);
6530 : 63 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6531 : 63 : new_temp = tmp;
6532 : : }
6533 : :
6534 : 14708 : scalar_results.safe_push (new_temp);
6535 : : }
6536 : 1898 : else if (direct_slp_reduc)
6537 : : {
6538 : : /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
6539 : : with the elements for other SLP statements replaced with the
6540 : : neutral value. We can then do a normal reduction on each vector. */
6541 : :
6542 : : /* Enforced by vectorizable_reduction. */
6543 : : gcc_assert (reduc_inputs.length () == 1);
6544 : : gcc_assert (pow2p_hwi (group_size));
6545 : :
6546 : : gimple_seq seq = NULL;
6547 : :
6548 : : /* Build a vector {0, 1, 2, ...}, with the same number of elements
6549 : : and the same element size as VECTYPE. */
6550 : : tree index = build_index_vector (vectype, 0, 1);
6551 : : tree index_type = TREE_TYPE (index);
6552 : : tree index_elt_type = TREE_TYPE (index_type);
6553 : : tree mask_type = truth_type_for (index_type);
6554 : :
6555 : : /* Create a vector that, for each element, identifies which of
6556 : : the REDUC_GROUP_SIZE results should use it. */
6557 : : tree index_mask = build_int_cst (index_elt_type, group_size - 1);
6558 : : index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
6559 : : build_vector_from_val (index_type, index_mask));
6560 : :
6561 : : /* Get a neutral vector value. This is simply a splat of the neutral
6562 : : scalar value if we have one, otherwise the initial scalar value
6563 : : is itself a neutral value. */
6564 : : tree vector_identity = NULL_TREE;
6565 : : tree neutral_op = NULL_TREE;
6566 : : if (slp_node)
6567 : : {
6568 : : tree initial_value = NULL_TREE;
6569 : : if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6570 : : initial_value = reduc_info->reduc_initial_values[0];
6571 : : neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
6572 : : initial_value, false);
6573 : : }
6574 : : if (neutral_op)
6575 : : vector_identity = gimple_build_vector_from_val (&seq, vectype,
6576 : : neutral_op);
6577 : : for (unsigned int i = 0; i < group_size; ++i)
6578 : : {
6579 : : /* If there's no univeral neutral value, we can use the
6580 : : initial scalar value from the original PHI. This is used
6581 : : for MIN and MAX reduction, for example. */
6582 : : if (!neutral_op)
6583 : : {
6584 : : tree scalar_value = reduc_info->reduc_initial_values[i];
6585 : : scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
6586 : : scalar_value);
6587 : : vector_identity = gimple_build_vector_from_val (&seq, vectype,
6588 : : scalar_value);
6589 : : }
6590 : :
6591 : : /* Calculate the equivalent of:
6592 : :
6593 : : sel[j] = (index[j] == i);
6594 : :
6595 : : which selects the elements of REDUC_INPUTS[0] that should
6596 : : be included in the result. */
6597 : : tree compare_val = build_int_cst (index_elt_type, i);
6598 : : compare_val = build_vector_from_val (index_type, compare_val);
6599 : : tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
6600 : : index, compare_val);
6601 : :
6602 : : /* Calculate the equivalent of:
6603 : :
6604 : : vec = seq ? reduc_inputs[0] : vector_identity;
6605 : :
6606 : : VEC is now suitable for a full vector reduction. */
6607 : : tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
6608 : : sel, reduc_inputs[0], vector_identity);
6609 : :
6610 : : /* Do the reduction and convert it to the appropriate type. */
6611 : : tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
6612 : : TREE_TYPE (vectype), vec);
6613 : : scalar = gimple_convert (&seq, scalar_type, scalar);
6614 : : scalar_results.safe_push (scalar);
6615 : : }
6616 : : gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
6617 : : }
6618 : : else
6619 : : {
6620 : 1898 : bool reduce_with_shift;
6621 : 1898 : tree vec_temp;
6622 : :
6623 : 1898 : gcc_assert (slp_reduc || reduc_inputs.length () == 1);
6624 : :
6625 : : /* See if the target wants to do the final (shift) reduction
6626 : : in a vector mode of smaller size and first reduce upper/lower
6627 : : halves against each other. */
6628 : 1898 : enum machine_mode mode1 = mode;
6629 : 1898 : tree stype = TREE_TYPE (vectype);
6630 : 1898 : unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6631 : 1898 : unsigned nunits1 = nunits;
6632 : 1898 : if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
6633 : 1898 : && reduc_inputs.length () == 1)
6634 : : {
6635 : 187 : nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6636 : : /* For SLP reductions we have to make sure lanes match up, but
6637 : : since we're doing individual element final reduction reducing
6638 : : vector width here is even more important.
6639 : : ??? We can also separate lanes with permutes, for the common
6640 : : case of power-of-two group-size odd/even extracts would work. */
6641 : 187 : if (slp_reduc && nunits != nunits1)
6642 : : {
6643 : 166 : nunits1 = least_common_multiple (nunits1, group_size);
6644 : 332 : gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6645 : : }
6646 : : }
6647 : 1898 : if (!slp_reduc
6648 : 1898 : && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6649 : 42 : nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6650 : :
6651 : 1898 : tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6652 : : stype, nunits1);
6653 : 1898 : reduce_with_shift = have_whole_vector_shift (mode1);
6654 : 937 : if (!VECTOR_MODE_P (mode1)
6655 : 2835 : || !directly_supported_p (code, vectype1))
6656 : : reduce_with_shift = false;
6657 : :
6658 : : /* First reduce the vector to the desired vector size we should
6659 : : do shift reduction on by combining upper and lower halves. */
6660 : 1898 : gimple_seq stmts = NULL;
6661 : 1898 : new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6662 : : code, &stmts);
6663 : 1898 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6664 : 1898 : reduc_inputs[0] = new_temp;
6665 : :
6666 : 1898 : if (reduce_with_shift && !slp_reduc)
6667 : : {
6668 : 1361 : int element_bitsize = tree_to_uhwi (bitsize);
6669 : : /* Enforced by vectorizable_reduction, which disallows SLP reductions
6670 : : for variable-length vectors and also requires direct target support
6671 : : for loop reductions. */
6672 : 1361 : int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6673 : 1361 : int nelements = vec_size_in_bits / element_bitsize;
6674 : 1361 : vec_perm_builder sel;
6675 : 1361 : vec_perm_indices indices;
6676 : :
6677 : 1361 : int elt_offset;
6678 : :
6679 : 1361 : tree zero_vec = build_zero_cst (vectype1);
6680 : : /* Case 2: Create:
6681 : : for (offset = nelements/2; offset >= 1; offset/=2)
6682 : : {
6683 : : Create: va' = vec_shift <va, offset>
6684 : : Create: va = vop <va, va'>
6685 : : } */
6686 : :
6687 : 1361 : tree rhs;
6688 : :
6689 : 1361 : if (dump_enabled_p ())
6690 : 299 : dump_printf_loc (MSG_NOTE, vect_location,
6691 : : "Reduce using vector shifts\n");
6692 : :
6693 : 1361 : gimple_seq stmts = NULL;
6694 : 1361 : new_temp = gimple_convert (&stmts, vectype1, new_temp);
6695 : 1361 : for (elt_offset = nelements / 2;
6696 : 2988 : elt_offset >= 1;
6697 : 1627 : elt_offset /= 2)
6698 : : {
6699 : 1627 : calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6700 : 1627 : indices.new_vector (sel, 2, nelements);
6701 : 1627 : tree mask = vect_gen_perm_mask_any (vectype1, indices);
6702 : 1627 : new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6703 : : new_temp, zero_vec, mask);
6704 : 1627 : new_temp = gimple_build (&stmts, code,
6705 : : vectype1, new_name, new_temp);
6706 : : }
6707 : 1361 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6708 : :
6709 : : /* 2.4 Extract the final scalar result. Create:
6710 : : s_out3 = extract_field <v_out2, bitpos> */
6711 : :
6712 : 1361 : if (dump_enabled_p ())
6713 : 299 : dump_printf_loc (MSG_NOTE, vect_location,
6714 : : "extract scalar result\n");
6715 : :
6716 : 1361 : rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6717 : : bitsize, bitsize_zero_node);
6718 : 1361 : epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
6719 : 1361 : new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
6720 : 1361 : gimple_assign_set_lhs (epilog_stmt, new_temp);
6721 : 1361 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6722 : 1361 : scalar_results.safe_push (new_temp);
6723 : 1361 : }
6724 : : else
6725 : : {
6726 : : /* Case 3: Create:
6727 : : s = extract_field <v_out2, 0>
6728 : : for (offset = element_size;
6729 : : offset < vector_size;
6730 : : offset += element_size;)
6731 : : {
6732 : : Create: s' = extract_field <v_out2, offset>
6733 : : Create: s = op <s, s'> // For non SLP cases
6734 : : } */
6735 : :
6736 : 537 : if (dump_enabled_p ())
6737 : 106 : dump_printf_loc (MSG_NOTE, vect_location,
6738 : : "Reduce using scalar code.\n");
6739 : :
6740 : 537 : int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6741 : 537 : int element_bitsize = tree_to_uhwi (bitsize);
6742 : 537 : tree compute_type = TREE_TYPE (vectype);
6743 : 537 : gimple_seq stmts = NULL;
6744 : 1149 : FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6745 : : {
6746 : 612 : int bit_offset;
6747 : 612 : new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6748 : : vec_temp, bitsize, bitsize_zero_node);
6749 : :
6750 : : /* In SLP we don't need to apply reduction operation, so we just
6751 : : collect s' values in SCALAR_RESULTS. */
6752 : 612 : if (slp_reduc)
6753 : 588 : scalar_results.safe_push (new_temp);
6754 : :
6755 : 1233 : for (bit_offset = element_bitsize;
6756 : 1845 : bit_offset < vec_size_in_bits;
6757 : 1233 : bit_offset += element_bitsize)
6758 : : {
6759 : 1233 : tree bitpos = bitsize_int (bit_offset);
6760 : 1233 : new_name = gimple_build (&stmts, BIT_FIELD_REF,
6761 : : compute_type, vec_temp,
6762 : : bitsize, bitpos);
6763 : 1233 : if (slp_reduc)
6764 : : {
6765 : : /* In SLP we don't need to apply reduction operation, so
6766 : : we just collect s' values in SCALAR_RESULTS. */
6767 : 1210 : new_temp = new_name;
6768 : 1210 : scalar_results.safe_push (new_name);
6769 : : }
6770 : : else
6771 : 23 : new_temp = gimple_build (&stmts, code, compute_type,
6772 : : new_name, new_temp);
6773 : : }
6774 : : }
6775 : :
6776 : : /* The only case where we need to reduce scalar results in SLP, is
6777 : : unrolling. If the size of SCALAR_RESULTS is greater than
6778 : : REDUC_GROUP_SIZE, we reduce them combining elements modulo
6779 : : REDUC_GROUP_SIZE. */
6780 : 537 : if (slp_reduc)
6781 : : {
6782 : 513 : tree res, first_res, new_res;
6783 : :
6784 : : /* Reduce multiple scalar results in case of SLP unrolling. */
6785 : 1095 : for (j = group_size; scalar_results.iterate (j, &res);
6786 : : j++)
6787 : : {
6788 : 582 : first_res = scalar_results[j % group_size];
6789 : 582 : new_res = gimple_build (&stmts, code, compute_type,
6790 : : first_res, res);
6791 : 582 : scalar_results[j % group_size] = new_res;
6792 : : }
6793 : 513 : scalar_results.truncate (group_size);
6794 : 2242 : for (k = 0; k < group_size; k++)
6795 : 2432 : scalar_results[k] = gimple_convert (&stmts, scalar_type,
6796 : 1216 : scalar_results[k]);
6797 : : }
6798 : : else
6799 : : {
6800 : : /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
6801 : 24 : new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6802 : 24 : scalar_results.safe_push (new_temp);
6803 : : }
6804 : :
6805 : 537 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6806 : : }
6807 : :
6808 : 1898 : if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6809 : 0 : && induc_val)
6810 : : {
6811 : : /* Earlier we set the initial value to be a vector if induc_val
6812 : : values. Check the result and if it is induc_val then replace
6813 : : with the original initial value, unless induc_val is
6814 : : the same as initial_def already. */
6815 : 0 : tree zcompare = make_ssa_name (boolean_type_node);
6816 : 0 : epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR, new_temp,
6817 : : induc_val);
6818 : 0 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6819 : 0 : tree initial_def = reduc_info->reduc_initial_values[0];
6820 : 0 : tree tmp = make_ssa_name (new_scalar_dest);
6821 : 0 : epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6822 : : initial_def, new_temp);
6823 : 0 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6824 : 0 : scalar_results[0] = tmp;
6825 : : }
6826 : : }
6827 : :
6828 : : /* 2.5 Adjust the final result by the initial value of the reduction
6829 : : variable. (When such adjustment is not needed, then
6830 : : 'adjustment_def' is zero). For example, if code is PLUS we create:
6831 : : new_temp = loop_exit_def + adjustment_def */
6832 : :
6833 : 16669 : if (adjustment_def)
6834 : : {
6835 : 11944 : gcc_assert (!slp_reduc);
6836 : 11944 : gimple_seq stmts = NULL;
6837 : 11944 : if (double_reduc)
6838 : : {
6839 : 0 : gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6840 : 0 : adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6841 : 0 : new_temp = gimple_build (&stmts, code, vectype,
6842 : 0 : reduc_inputs[0], adjustment_def);
6843 : : }
6844 : : else
6845 : : {
6846 : 11944 : new_temp = scalar_results[0];
6847 : 11944 : gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6848 : 11944 : adjustment_def = gimple_convert (&stmts, TREE_TYPE (vectype),
6849 : : adjustment_def);
6850 : 11944 : new_temp = gimple_convert (&stmts, TREE_TYPE (vectype), new_temp);
6851 : 11944 : new_temp = gimple_build (&stmts, code, TREE_TYPE (vectype),
6852 : : new_temp, adjustment_def);
6853 : 11944 : new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6854 : : }
6855 : :
6856 : 11944 : epilog_stmt = gimple_seq_last_stmt (stmts);
6857 : 11944 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6858 : 11944 : scalar_results[0] = new_temp;
6859 : : }
6860 : :
6861 : : /* Record this operation if it could be reused by the epilogue loop. */
6862 : 16669 : if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
6863 : 16669 : && reduc_inputs.length () == 1)
6864 : 16501 : loop_vinfo->reusable_accumulators.put (scalar_results[0],
6865 : : { orig_reduc_input, reduc_info });
6866 : :
6867 : 16669 : if (double_reduc)
6868 : 55 : loop = outer_loop;
6869 : :
6870 : : /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6871 : : phis with new adjusted scalar results, i.e., replace use <s_out0>
6872 : : with use <s_out4>.
6873 : :
6874 : : Transform:
6875 : : loop_exit:
6876 : : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6877 : : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6878 : : v_out2 = reduce <v_out1>
6879 : : s_out3 = extract_field <v_out2, 0>
6880 : : s_out4 = adjust_result <s_out3>
6881 : : use <s_out0>
6882 : : use <s_out0>
6883 : :
6884 : : into:
6885 : :
6886 : : loop_exit:
6887 : : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6888 : : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6889 : : v_out2 = reduce <v_out1>
6890 : : s_out3 = extract_field <v_out2, 0>
6891 : : s_out4 = adjust_result <s_out3>
6892 : : use <s_out4>
6893 : : use <s_out4> */
6894 : :
6895 : 33338 : gcc_assert (live_out_stmts.size () == scalar_results.length ());
6896 : 16669 : auto_vec<gimple *> phis;
6897 : 34041 : for (k = 0; k < live_out_stmts.size (); k++)
6898 : : {
6899 : 17372 : stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6900 : 17372 : scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6901 : :
6902 : : /* Find the loop-closed-use at the loop exit of the original scalar
6903 : : result. (The reduction result is expected to have two immediate uses,
6904 : : one at the latch block, and one at the loop exit). For double
6905 : : reductions we are looking for exit phis of the outer loop. */
6906 : 69164 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6907 : : {
6908 : 51792 : if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6909 : : {
6910 : 17308 : if (!is_gimple_debug (USE_STMT (use_p))
6911 : 17308 : && gimple_bb (USE_STMT (use_p)) == loop_exit->dest)
6912 : 17304 : phis.safe_push (USE_STMT (use_p));
6913 : : }
6914 : : else
6915 : : {
6916 : 34484 : if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6917 : : {
6918 : 110 : tree phi_res = PHI_RESULT (USE_STMT (use_p));
6919 : :
6920 : 277 : FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6921 : : {
6922 : 167 : if (!flow_bb_inside_loop_p (loop,
6923 : 167 : gimple_bb (USE_STMT (phi_use_p)))
6924 : 167 : && !is_gimple_debug (USE_STMT (phi_use_p)))
6925 : 55 : phis.safe_push (USE_STMT (phi_use_p));
6926 : : }
6927 : : }
6928 : : }
6929 : : }
6930 : :
6931 : 34731 : FOR_EACH_VEC_ELT (phis, i, exit_phi)
6932 : : {
6933 : : /* Replace the uses: */
6934 : 17359 : orig_name = PHI_RESULT (exit_phi);
6935 : :
6936 : : /* Look for a single use at the target of the skip edge. */
6937 : 17359 : if (unify_with_main_loop_p)
6938 : : {
6939 : 117 : use_operand_p use_p;
6940 : 117 : gimple *user;
6941 : 117 : if (!single_imm_use (orig_name, &use_p, &user))
6942 : 0 : gcc_unreachable ();
6943 : 117 : orig_name = gimple_get_lhs (user);
6944 : : }
6945 : :
6946 : 17359 : scalar_result = scalar_results[k];
6947 : 46468 : FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6948 : : {
6949 : 87341 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6950 : 29116 : SET_USE (use_p, scalar_result);
6951 : 29109 : update_stmt (use_stmt);
6952 : 17359 : }
6953 : : }
6954 : :
6955 : 17372 : phis.truncate (0);
6956 : : }
6957 : 16669 : }
6958 : :
6959 : : /* Return a vector of type VECTYPE that is equal to the vector select
6960 : : operation "MASK ? VEC : IDENTITY". Insert the select statements
6961 : : before GSI. */
6962 : :
6963 : : static tree
6964 : 0 : merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6965 : : tree vec, tree identity)
6966 : : {
6967 : 0 : tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6968 : 0 : gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6969 : : mask, vec, identity);
6970 : 0 : gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6971 : 0 : return cond;
6972 : : }
6973 : :
6974 : : /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6975 : : order, starting with LHS. Insert the extraction statements before GSI and
6976 : : associate the new scalar SSA names with variable SCALAR_DEST.
6977 : : If MASK is nonzero mask the input and then operate on it unconditionally.
6978 : : Return the SSA name for the result. */
6979 : :
6980 : : static tree
6981 : 750 : vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6982 : : tree_code code, tree lhs, tree vector_rhs,
6983 : : tree mask)
6984 : : {
6985 : 750 : tree vectype = TREE_TYPE (vector_rhs);
6986 : 750 : tree scalar_type = TREE_TYPE (vectype);
6987 : 750 : tree bitsize = TYPE_SIZE (scalar_type);
6988 : 750 : unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6989 : 750 : unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6990 : :
6991 : : /* Re-create a VEC_COND_EXPR to mask the input here in order to be able
6992 : : to perform an unconditional element-wise reduction of it. */
6993 : 750 : if (mask)
6994 : : {
6995 : 4 : tree masked_vector_rhs = make_temp_ssa_name (vectype, NULL,
6996 : : "masked_vector_rhs");
6997 : 4 : tree neutral_op = neutral_op_for_reduction (scalar_type, code, NULL_TREE,
6998 : : false);
6999 : 4 : tree vector_identity = build_vector_from_val (vectype, neutral_op);
7000 : 4 : gassign *select = gimple_build_assign (masked_vector_rhs, VEC_COND_EXPR,
7001 : : mask, vector_rhs, vector_identity);
7002 : 4 : gsi_insert_before (gsi, select, GSI_SAME_STMT);
7003 : 4 : vector_rhs = masked_vector_rhs;
7004 : : }
7005 : :
7006 : 750 : for (unsigned HOST_WIDE_INT bit_offset = 0;
7007 : 3278 : bit_offset < vec_size_in_bits;
7008 : 2528 : bit_offset += element_bitsize)
7009 : : {
7010 : 2528 : tree bitpos = bitsize_int (bit_offset);
7011 : 2528 : tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
7012 : : bitsize, bitpos);
7013 : :
7014 : 2528 : gassign *stmt = gimple_build_assign (scalar_dest, rhs);
7015 : 2528 : rhs = make_ssa_name (scalar_dest, stmt);
7016 : 2528 : gimple_assign_set_lhs (stmt, rhs);
7017 : 2528 : gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
7018 : :
7019 : 2528 : stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
7020 : 2528 : tree new_name = make_ssa_name (scalar_dest, stmt);
7021 : 2528 : gimple_assign_set_lhs (stmt, new_name);
7022 : 2528 : gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
7023 : 2528 : lhs = new_name;
7024 : : }
7025 : 750 : return lhs;
7026 : : }
7027 : :
7028 : : /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
7029 : : type of the vector input. */
7030 : :
7031 : : static internal_fn
7032 : 647 : get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
7033 : : {
7034 : 647 : internal_fn mask_reduc_fn;
7035 : 647 : internal_fn mask_len_reduc_fn;
7036 : :
7037 : 647 : switch (reduc_fn)
7038 : : {
7039 : 0 : case IFN_FOLD_LEFT_PLUS:
7040 : 0 : mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
7041 : 0 : mask_len_reduc_fn = IFN_MASK_LEN_FOLD_LEFT_PLUS;
7042 : 0 : break;
7043 : :
7044 : : default:
7045 : : return IFN_LAST;
7046 : : }
7047 : :
7048 : 0 : if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
7049 : : OPTIMIZE_FOR_SPEED))
7050 : : return mask_reduc_fn;
7051 : 0 : if (direct_internal_fn_supported_p (mask_len_reduc_fn, vectype_in,
7052 : : OPTIMIZE_FOR_SPEED))
7053 : : return mask_len_reduc_fn;
7054 : : return IFN_LAST;
7055 : : }
7056 : :
7057 : : /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
7058 : : statement that sets the live-out value. REDUC_DEF_STMT is the phi
7059 : : statement. CODE is the operation performed by STMT_INFO and OPS are
7060 : : its scalar operands. REDUC_INDEX is the index of the operand in
7061 : : OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
7062 : : implements in-order reduction, or IFN_LAST if we should open-code it.
7063 : : VECTYPE_IN is the type of the vector input. MASKS specifies the masks
7064 : : that should be used to control the operation in a fully-masked loop. */
7065 : :
7066 : : static bool
7067 : 641 : vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
7068 : : stmt_vec_info stmt_info,
7069 : : gimple_stmt_iterator *gsi,
7070 : : gimple **vec_stmt, slp_tree slp_node,
7071 : : gimple *reduc_def_stmt,
7072 : : code_helper code, internal_fn reduc_fn,
7073 : : tree *ops, int num_ops, tree vectype_in,
7074 : : int reduc_index, vec_loop_masks *masks,
7075 : : vec_loop_lens *lens)
7076 : : {
7077 : 641 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7078 : 641 : tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7079 : 641 : internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
7080 : :
7081 : 641 : int ncopies;
7082 : 641 : if (slp_node)
7083 : : ncopies = 1;
7084 : : else
7085 : 592 : ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7086 : :
7087 : 641 : gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
7088 : 641 : gcc_assert (ncopies == 1);
7089 : :
7090 : 641 : bool is_cond_op = false;
7091 : 641 : if (!code.is_tree_code ())
7092 : : {
7093 : 4 : code = conditional_internal_fn_code (internal_fn (code));
7094 : 4 : gcc_assert (code != ERROR_MARK);
7095 : : is_cond_op = true;
7096 : : }
7097 : :
7098 : 641 : gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
7099 : :
7100 : 641 : if (slp_node)
7101 : : {
7102 : 49 : if (is_cond_op)
7103 : : {
7104 : 0 : if (dump_enabled_p ())
7105 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7106 : : "fold-left reduction on SLP not supported.\n");
7107 : 0 : return false;
7108 : : }
7109 : :
7110 : 49 : gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
7111 : : TYPE_VECTOR_SUBPARTS (vectype_in)));
7112 : : }
7113 : :
7114 : : /* The operands either come from a binary operation or an IFN_COND operation.
7115 : : The former is a gimple assign with binary rhs and the latter is a
7116 : : gimple call with four arguments. */
7117 : 641 : gcc_assert (num_ops == 2 || num_ops == 4);
7118 : 641 : tree op0, opmask;
7119 : 641 : if (!is_cond_op)
7120 : 637 : op0 = ops[1 - reduc_index];
7121 : : else
7122 : : {
7123 : 4 : op0 = ops[2 + (1 - reduc_index)];
7124 : 4 : opmask = ops[0];
7125 : 4 : gcc_assert (!slp_node);
7126 : : }
7127 : :
7128 : 641 : int group_size = 1;
7129 : 641 : stmt_vec_info scalar_dest_def_info;
7130 : 641 : auto_vec<tree> vec_oprnds0, vec_opmask;
7131 : 641 : if (slp_node)
7132 : : {
7133 : 49 : auto_vec<vec<tree> > vec_defs (2);
7134 : 49 : vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
7135 : 49 : vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
7136 : 49 : vec_defs[0].release ();
7137 : 49 : vec_defs[1].release ();
7138 : 49 : group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7139 : 49 : scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
7140 : 49 : }
7141 : : else
7142 : : {
7143 : 592 : vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7144 : : op0, &vec_oprnds0);
7145 : 592 : scalar_dest_def_info = stmt_info;
7146 : :
7147 : : /* For an IFN_COND_OP we also need the vector mask operand. */
7148 : 592 : if (is_cond_op)
7149 : 4 : vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7150 : : opmask, &vec_opmask);
7151 : : }
7152 : :
7153 : 641 : gimple *sdef = vect_orig_stmt (scalar_dest_def_info)->stmt;
7154 : 641 : tree scalar_dest = gimple_get_lhs (sdef);
7155 : 641 : tree scalar_type = TREE_TYPE (scalar_dest);
7156 : 641 : tree reduc_var = gimple_phi_result (reduc_def_stmt);
7157 : :
7158 : 641 : int vec_num = vec_oprnds0.length ();
7159 : 641 : gcc_assert (vec_num == 1 || slp_node);
7160 : 641 : tree vec_elem_type = TREE_TYPE (vectype_out);
7161 : 641 : gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
7162 : :
7163 : 641 : tree vector_identity = NULL_TREE;
7164 : 641 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7165 : : {
7166 : 0 : vector_identity = build_zero_cst (vectype_out);
7167 : 0 : if (!HONOR_SIGNED_ZEROS (vectype_out))
7168 : : ;
7169 : : else
7170 : : {
7171 : 0 : gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));
7172 : 0 : vector_identity = const_unop (NEGATE_EXPR, vectype_out,
7173 : : vector_identity);
7174 : : }
7175 : : }
7176 : :
7177 : 641 : tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
7178 : 641 : int i;
7179 : 641 : tree def0;
7180 : 1391 : FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7181 : : {
7182 : 750 : gimple *new_stmt;
7183 : 750 : tree mask = NULL_TREE;
7184 : 750 : tree len = NULL_TREE;
7185 : 750 : tree bias = NULL_TREE;
7186 : 750 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7187 : 0 : mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
7188 : 750 : else if (is_cond_op)
7189 : 4 : mask = vec_opmask[0];
7190 : 750 : if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
7191 : : {
7192 : 0 : len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
7193 : : i, 1);
7194 : 0 : signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
7195 : 0 : bias = build_int_cst (intQI_type_node, biasval);
7196 : 0 : if (!is_cond_op)
7197 : 0 : mask = build_minus_one_cst (truth_type_for (vectype_in));
7198 : : }
7199 : :
7200 : : /* Handle MINUS by adding the negative. */
7201 : 750 : if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
7202 : : {
7203 : 0 : tree negated = make_ssa_name (vectype_out);
7204 : 0 : new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
7205 : 0 : gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
7206 : 0 : def0 = negated;
7207 : : }
7208 : :
7209 : 0 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
7210 : 750 : && mask && mask_reduc_fn == IFN_LAST)
7211 : 0 : def0 = merge_with_identity (gsi, mask, vectype_out, def0,
7212 : : vector_identity);
7213 : :
7214 : : /* On the first iteration the input is simply the scalar phi
7215 : : result, and for subsequent iterations it is the output of
7216 : : the preceding operation. */
7217 : 750 : if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
7218 : : {
7219 : 0 : if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
7220 : 0 : new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
7221 : : def0, mask, len, bias);
7222 : 0 : else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
7223 : 0 : new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
7224 : : def0, mask);
7225 : : else
7226 : 0 : new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
7227 : : def0);
7228 : : /* For chained SLP reductions the output of the previous reduction
7229 : : operation serves as the input of the next. For the final statement
7230 : : the output cannot be a temporary - we reuse the original
7231 : : scalar destination of the last statement. */
7232 : 0 : if (i != vec_num - 1)
7233 : : {
7234 : 0 : gimple_set_lhs (new_stmt, scalar_dest_var);
7235 : 0 : reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
7236 : 0 : gimple_set_lhs (new_stmt, reduc_var);
7237 : : }
7238 : : }
7239 : : else
7240 : : {
7241 : 750 : reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
7242 : : tree_code (code), reduc_var, def0,
7243 : : mask);
7244 : 750 : new_stmt = SSA_NAME_DEF_STMT (reduc_var);
7245 : : /* Remove the statement, so that we can use the same code paths
7246 : : as for statements that we've just created. */
7247 : 750 : gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
7248 : 750 : gsi_remove (&tmp_gsi, true);
7249 : : }
7250 : :
7251 : 750 : if (i == vec_num - 1)
7252 : : {
7253 : 641 : gimple_set_lhs (new_stmt, scalar_dest);
7254 : 641 : vect_finish_replace_stmt (loop_vinfo,
7255 : : scalar_dest_def_info,
7256 : : new_stmt);
7257 : : }
7258 : : else
7259 : 109 : vect_finish_stmt_generation (loop_vinfo,
7260 : : scalar_dest_def_info,
7261 : : new_stmt, gsi);
7262 : :
7263 : 750 : if (slp_node)
7264 : 158 : slp_node->push_vec_def (new_stmt);
7265 : : else
7266 : : {
7267 : 592 : STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7268 : 592 : *vec_stmt = new_stmt;
7269 : : }
7270 : : }
7271 : :
7272 : 641 : return true;
7273 : 641 : }
7274 : :
7275 : : /* Function is_nonwrapping_integer_induction.
7276 : :
7277 : : Check if STMT_VINO (which is part of loop LOOP) both increments and
7278 : : does not cause overflow. */
7279 : :
7280 : : static bool
7281 : 236 : is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
7282 : : {
7283 : 236 : gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
7284 : 236 : tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
7285 : 236 : tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
7286 : 236 : tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
7287 : 236 : widest_int ni, max_loop_value, lhs_max;
7288 : 236 : wi::overflow_type overflow = wi::OVF_NONE;
7289 : :
7290 : : /* Make sure the loop is integer based. */
7291 : 236 : if (TREE_CODE (base) != INTEGER_CST
7292 : 106 : || TREE_CODE (step) != INTEGER_CST)
7293 : : return false;
7294 : :
7295 : : /* Check that the max size of the loop will not wrap. */
7296 : :
7297 : 106 : if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
7298 : : return true;
7299 : :
7300 : 12 : if (! max_stmt_executions (loop, &ni))
7301 : : return false;
7302 : :
7303 : 12 : max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
7304 : 12 : &overflow);
7305 : 12 : if (overflow)
7306 : : return false;
7307 : :
7308 : 12 : max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
7309 : 24 : TYPE_SIGN (lhs_type), &overflow);
7310 : 12 : if (overflow)
7311 : : return false;
7312 : :
7313 : 12 : return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
7314 : 12 : <= TYPE_PRECISION (lhs_type));
7315 : 236 : }
7316 : :
7317 : : /* Check if masking can be supported by inserting a conditional expression.
7318 : : CODE is the code for the operation. COND_FN is the conditional internal
7319 : : function, if it exists. VECTYPE_IN is the type of the vector input. */
7320 : : static bool
7321 : 1650 : use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
7322 : : tree vectype_in)
7323 : : {
7324 : 1650 : if (cond_fn != IFN_LAST
7325 : 1650 : && direct_internal_fn_supported_p (cond_fn, vectype_in,
7326 : : OPTIMIZE_FOR_SPEED))
7327 : : return false;
7328 : :
7329 : 1620 : if (code.is_tree_code ())
7330 : 1382 : switch (tree_code (code))
7331 : : {
7332 : : case DOT_PROD_EXPR:
7333 : : case SAD_EXPR:
7334 : : return true;
7335 : :
7336 : : default:
7337 : : break;
7338 : : }
7339 : : return false;
7340 : : }
7341 : :
7342 : : /* Insert a conditional expression to enable masked vectorization. CODE is the
7343 : : code for the operation. VOP is the array of operands. MASK is the loop
7344 : : mask. GSI is a statement iterator used to place the new conditional
7345 : : expression. */
7346 : : static void
7347 : 0 : build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
7348 : : gimple_stmt_iterator *gsi)
7349 : : {
7350 : 0 : switch (tree_code (code))
7351 : : {
7352 : 0 : case DOT_PROD_EXPR:
7353 : 0 : {
7354 : 0 : tree vectype = TREE_TYPE (vop[1]);
7355 : 0 : tree zero = build_zero_cst (vectype);
7356 : 0 : tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7357 : 0 : gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7358 : : mask, vop[1], zero);
7359 : 0 : gsi_insert_before (gsi, select, GSI_SAME_STMT);
7360 : 0 : vop[1] = masked_op1;
7361 : 0 : break;
7362 : : }
7363 : :
7364 : 0 : case SAD_EXPR:
7365 : 0 : {
7366 : 0 : tree vectype = TREE_TYPE (vop[1]);
7367 : 0 : tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7368 : 0 : gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7369 : : mask, vop[1], vop[0]);
7370 : 0 : gsi_insert_before (gsi, select, GSI_SAME_STMT);
7371 : 0 : vop[1] = masked_op1;
7372 : 0 : break;
7373 : : }
7374 : :
7375 : 0 : default:
7376 : 0 : gcc_unreachable ();
7377 : : }
7378 : 0 : }
7379 : :
7380 : : /* Function vectorizable_reduction.
7381 : :
7382 : : Check if STMT_INFO performs a reduction operation that can be vectorized.
7383 : : If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
7384 : : stmt to replace it, put it in VEC_STMT, and insert it at GSI.
7385 : : Return true if STMT_INFO is vectorizable in this way.
7386 : :
7387 : : This function also handles reduction idioms (patterns) that have been
7388 : : recognized in advance during vect_pattern_recog. In this case, STMT_INFO
7389 : : may be of this form:
7390 : : X = pattern_expr (arg0, arg1, ..., X)
7391 : : and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
7392 : : sequence that had been detected and replaced by the pattern-stmt
7393 : : (STMT_INFO).
7394 : :
7395 : : This function also handles reduction of condition expressions, for example:
7396 : : for (int i = 0; i < N; i++)
7397 : : if (a[i] < value)
7398 : : last = a[i];
7399 : : This is handled by vectorising the loop and creating an additional vector
7400 : : containing the loop indexes for which "a[i] < value" was true. In the
7401 : : function epilogue this is reduced to a single max value and then used to
7402 : : index into the vector of results.
7403 : :
7404 : : In some cases of reduction patterns, the type of the reduction variable X is
7405 : : different than the type of the other arguments of STMT_INFO.
7406 : : In such cases, the vectype that is used when transforming STMT_INFO into
7407 : : a vector stmt is different than the vectype that is used to determine the
7408 : : vectorization factor, because it consists of a different number of elements
7409 : : than the actual number of elements that are being operated upon in parallel.
7410 : :
7411 : : For example, consider an accumulation of shorts into an int accumulator.
7412 : : On some targets it's possible to vectorize this pattern operating on 8
7413 : : shorts at a time (hence, the vectype for purposes of determining the
7414 : : vectorization factor should be V8HI); on the other hand, the vectype that
7415 : : is used to create the vector form is actually V4SI (the type of the result).
7416 : :
7417 : : Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
7418 : : indicates what is the actual level of parallelism (V8HI in the example), so
7419 : : that the right vectorization factor would be derived. This vectype
7420 : : corresponds to the type of arguments to the reduction stmt, and should *NOT*
7421 : : be used to create the vectorized stmt. The right vectype for the vectorized
7422 : : stmt is obtained from the type of the result X:
7423 : : get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7424 : :
7425 : : This means that, contrary to "regular" reductions (or "regular" stmts in
7426 : : general), the following equation:
7427 : : STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7428 : : does *NOT* necessarily hold for reduction patterns. */
7429 : :
7430 : : bool
7431 : 164054 : vectorizable_reduction (loop_vec_info loop_vinfo,
7432 : : stmt_vec_info stmt_info, slp_tree slp_node,
7433 : : slp_instance slp_node_instance,
7434 : : stmt_vector_for_cost *cost_vec)
7435 : : {
7436 : 164054 : tree vectype_in = NULL_TREE;
7437 : 164054 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7438 : 164054 : enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
7439 : 164054 : stmt_vec_info cond_stmt_vinfo = NULL;
7440 : 164054 : int i;
7441 : 164054 : int ncopies;
7442 : 164054 : bool single_defuse_cycle = false;
7443 : 164054 : bool nested_cycle = false;
7444 : 164054 : bool double_reduc = false;
7445 : 164054 : int vec_num;
7446 : 164054 : tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
7447 : 164054 : tree cond_reduc_val = NULL_TREE;
7448 : :
7449 : : /* Make sure it was already recognized as a reduction computation. */
7450 : 164054 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
7451 : : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
7452 : 164054 : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
7453 : : return false;
7454 : :
7455 : : /* The stmt we store reduction analysis meta on. */
7456 : 60131 : stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7457 : 60131 : reduc_info->is_reduc_info = true;
7458 : :
7459 : 60131 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7460 : : {
7461 : 1213 : if (is_a <gphi *> (stmt_info->stmt))
7462 : : {
7463 : 1213 : if (slp_node)
7464 : : {
7465 : : /* We eventually need to set a vector type on invariant
7466 : : arguments. */
7467 : : unsigned j;
7468 : : slp_tree child;
7469 : 66 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7470 : 44 : if (!vect_maybe_update_slp_op_vectype
7471 : 44 : (child, SLP_TREE_VECTYPE (slp_node)))
7472 : : {
7473 : 0 : if (dump_enabled_p ())
7474 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7475 : : "incompatible vector types for "
7476 : : "invariants\n");
7477 : 0 : return false;
7478 : : }
7479 : : }
7480 : : /* Analysis for double-reduction is done on the outer
7481 : : loop PHI, nested cycles have no further restrictions. */
7482 : 1213 : STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
7483 : : }
7484 : : else
7485 : 0 : STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7486 : 1213 : return true;
7487 : : }
7488 : :
7489 : 58918 : stmt_vec_info orig_stmt_of_analysis = stmt_info;
7490 : 58918 : stmt_vec_info phi_info = stmt_info;
7491 : 58918 : if (!is_a <gphi *> (stmt_info->stmt))
7492 : : {
7493 : 4600 : STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7494 : 4600 : return true;
7495 : : }
7496 : 54318 : if (slp_node)
7497 : : {
7498 : 1222 : slp_node_instance->reduc_phis = slp_node;
7499 : : /* ??? We're leaving slp_node to point to the PHIs, we only
7500 : : need it to get at the number of vector stmts which wasn't
7501 : : yet initialized for the instance root. */
7502 : : }
7503 : 54318 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7504 : : {
7505 : 206 : use_operand_p use_p;
7506 : 206 : gimple *use_stmt;
7507 : 206 : bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
7508 : : &use_p, &use_stmt);
7509 : 206 : gcc_assert (res);
7510 : 206 : phi_info = loop_vinfo->lookup_stmt (use_stmt);
7511 : : }
7512 : :
7513 : : /* PHIs should not participate in patterns. */
7514 : 54318 : gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7515 : 54318 : gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7516 : :
7517 : : /* Verify following REDUC_IDX from the latch def leads us back to the PHI
7518 : : and compute the reduction chain length. Discover the real
7519 : : reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
7520 : 54318 : tree reduc_def
7521 : 54318 : = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
7522 : : loop_latch_edge
7523 : : (gimple_bb (reduc_def_phi)->loop_father));
7524 : 54318 : unsigned reduc_chain_length = 0;
7525 : 54318 : bool only_slp_reduc_chain = true;
7526 : 54318 : stmt_info = NULL;
7527 : 54318 : slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
7528 : 120732 : while (reduc_def != PHI_RESULT (reduc_def_phi))
7529 : : {
7530 : 66429 : stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
7531 : 66429 : stmt_vec_info vdef = vect_stmt_to_vectorize (def);
7532 : 66429 : if (STMT_VINFO_REDUC_IDX (vdef) == -1)
7533 : : {
7534 : 0 : if (dump_enabled_p ())
7535 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7536 : : "reduction chain broken by patterns.\n");
7537 : 15 : return false;
7538 : : }
7539 : 66429 : if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
7540 : 64761 : only_slp_reduc_chain = false;
7541 : : /* For epilogue generation live members of the chain need
7542 : : to point back to the PHI via their original stmt for
7543 : : info_for_reduction to work. For SLP we need to look at
7544 : : all lanes here - even though we only will vectorize from
7545 : : the SLP node with live lane zero the other live lanes also
7546 : : need to be identified as part of a reduction to be able
7547 : : to skip code generation for them. */
7548 : 66429 : if (slp_for_stmt_info)
7549 : : {
7550 : 22689 : for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
7551 : 20016 : if (STMT_VINFO_LIVE_P (s))
7552 : 3870 : STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
7553 : : }
7554 : 63756 : else if (STMT_VINFO_LIVE_P (vdef))
7555 : 52944 : STMT_VINFO_REDUC_DEF (def) = phi_info;
7556 : 66429 : gimple_match_op op;
7557 : 66429 : if (!gimple_extract_op (vdef->stmt, &op))
7558 : : {
7559 : 0 : if (dump_enabled_p ())
7560 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7561 : : "reduction chain includes unsupported"
7562 : : " statement type.\n");
7563 : 0 : return false;
7564 : : }
7565 : 66429 : if (CONVERT_EXPR_CODE_P (op.code))
7566 : : {
7567 : 9897 : if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
7568 : : {
7569 : 15 : if (dump_enabled_p ())
7570 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7571 : : "conversion in the reduction chain.\n");
7572 : 15 : return false;
7573 : : }
7574 : : }
7575 : 56532 : else if (!stmt_info)
7576 : : /* First non-conversion stmt. */
7577 : 66414 : stmt_info = vdef;
7578 : 66414 : reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
7579 : 66414 : reduc_chain_length++;
7580 : 66414 : if (!stmt_info && slp_node)
7581 : 60 : slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
7582 : : }
7583 : : /* PHIs should not participate in patterns. */
7584 : 54303 : gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7585 : :
7586 : 54303 : if (nested_in_vect_loop_p (loop, stmt_info))
7587 : : {
7588 : 203 : loop = loop->inner;
7589 : 203 : nested_cycle = true;
7590 : : }
7591 : :
7592 : : /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
7593 : : element. */
7594 : 54303 : if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7595 : : {
7596 : 326 : gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
7597 : : stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7598 : : }
7599 : 54303 : if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7600 : 326 : gcc_assert (slp_node
7601 : : && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
7602 : :
7603 : : /* 1. Is vectorizable reduction? */
7604 : : /* Not supportable if the reduction variable is used in the loop, unless
7605 : : it's a reduction chain. */
7606 : 54303 : if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
7607 : 54303 : && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7608 : : return false;
7609 : :
7610 : : /* Reductions that are not used even in an enclosing outer-loop,
7611 : : are expected to be "live" (used out of the loop). */
7612 : 54303 : if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
7613 : 0 : && !STMT_VINFO_LIVE_P (stmt_info))
7614 : : return false;
7615 : :
7616 : : /* 2. Has this been recognized as a reduction pattern?
7617 : :
7618 : : Check if STMT represents a pattern that has been recognized
7619 : : in earlier analysis stages. For stmts that represent a pattern,
7620 : : the STMT_VINFO_RELATED_STMT field records the last stmt in
7621 : : the original sequence that constitutes the pattern. */
7622 : :
7623 : 54303 : stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7624 : 54303 : if (orig_stmt_info)
7625 : : {
7626 : 2619 : gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
7627 : 2619 : gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
7628 : : }
7629 : :
7630 : : /* 3. Check the operands of the operation. The first operands are defined
7631 : : inside the loop body. The last operand is the reduction variable,
7632 : : which is defined by the loop-header-phi. */
7633 : :
7634 : 54303 : tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7635 : 54303 : STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
7636 : 54303 : gimple_match_op op;
7637 : 54303 : if (!gimple_extract_op (stmt_info->stmt, &op))
7638 : 0 : gcc_unreachable ();
7639 : 54303 : bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
7640 : 54137 : || op.code == WIDEN_SUM_EXPR
7641 : 108440 : || op.code == SAD_EXPR);
7642 : :
7643 : 54303 : if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7644 : 14037 : && !SCALAR_FLOAT_TYPE_P (op.type))
7645 : : return false;
7646 : :
7647 : : /* Do not try to vectorize bit-precision reductions. */
7648 : 54303 : if (!type_has_mode_precision_p (op.type))
7649 : : return false;
7650 : :
7651 : : /* For lane-reducing ops we're reducing the number of reduction PHIs
7652 : : which means the only use of that may be in the lane-reducing operation. */
7653 : 53286 : if (lane_reduc_code_p
7654 : 53286 : && reduc_chain_length != 1
7655 : 12 : && !only_slp_reduc_chain)
7656 : : {
7657 : 0 : if (dump_enabled_p ())
7658 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7659 : : "lane-reducing reduction with extra stmts.\n");
7660 : 0 : return false;
7661 : : }
7662 : :
7663 : : /* All uses but the last are expected to be defined in the loop.
7664 : : The last use is the reduction variable. In case of nested cycle this
7665 : : assumption is not true: we use reduc_index to record the index of the
7666 : : reduction variable. */
7667 : 53286 : slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7668 : 53286 : tree *vectype_op = XALLOCAVEC (tree, op.num_ops);
7669 : : /* We need to skip an extra operand for COND_EXPRs with embedded
7670 : : comparison. */
7671 : 53286 : unsigned opno_adjust = 0;
7672 : 53286 : if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
7673 : 53286 : opno_adjust = 1;
7674 : 166645 : for (i = 0; i < (int) op.num_ops; i++)
7675 : : {
7676 : : /* The condition of COND_EXPR is checked in vectorizable_condition(). */
7677 : 113389 : if (i == 0 && op.code == COND_EXPR)
7678 : 56904 : continue;
7679 : :
7680 : 112718 : stmt_vec_info def_stmt_info;
7681 : 112718 : enum vect_def_type dt;
7682 : 112718 : if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
7683 : 112718 : i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
7684 : 112718 : &vectype_op[i], &def_stmt_info))
7685 : : {
7686 : 0 : if (dump_enabled_p ())
7687 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7688 : : "use not simple.\n");
7689 : 30 : return false;
7690 : : }
7691 : 112718 : if (i == STMT_VINFO_REDUC_IDX (stmt_info))
7692 : 53262 : continue;
7693 : :
7694 : : /* For an IFN_COND_OP we might hit the reduction definition operand
7695 : : twice (once as definition, once as else). */
7696 : 59456 : if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
7697 : 2971 : continue;
7698 : :
7699 : : /* There should be only one cycle def in the stmt, the one
7700 : : leading to reduc_def. */
7701 : 56485 : if (VECTORIZABLE_CYCLE_DEF (dt))
7702 : : return false;
7703 : :
7704 : 56455 : if (!vectype_op[i])
7705 : 3357 : vectype_op[i]
7706 : 3357 : = get_vectype_for_scalar_type (loop_vinfo,
7707 : 3357 : TREE_TYPE (op.ops[i]), slp_op[i]);
7708 : :
7709 : : /* To properly compute ncopies we are interested in the widest
7710 : : non-reduction input type in case we're looking at a widening
7711 : : accumulation that we later handle in vect_transform_reduction. */
7712 : 56455 : if (lane_reduc_code_p
7713 : 468 : && vectype_op[i]
7714 : 56923 : && (!vectype_in
7715 : 234 : || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7716 : 56455 : < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i]))))))
7717 : 234 : vectype_in = vectype_op[i];
7718 : :
7719 : : /* Record how the non-reduction-def value of COND_EXPR is defined.
7720 : : ??? For a chain of multiple CONDs we'd have to match them up all. */
7721 : 56455 : if (op.code == COND_EXPR && reduc_chain_length == 1)
7722 : : {
7723 : 642 : if (dt == vect_constant_def)
7724 : : {
7725 : 73 : cond_reduc_dt = dt;
7726 : 73 : cond_reduc_val = op.ops[i];
7727 : : }
7728 : 569 : else if (dt == vect_induction_def
7729 : 236 : && def_stmt_info
7730 : 805 : && is_nonwrapping_integer_induction (def_stmt_info, loop))
7731 : : {
7732 : 106 : cond_reduc_dt = dt;
7733 : 106 : cond_stmt_vinfo = def_stmt_info;
7734 : : }
7735 : : }
7736 : : }
7737 : 53256 : if (!vectype_in)
7738 : 53022 : vectype_in = STMT_VINFO_VECTYPE (phi_info);
7739 : 53256 : STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
7740 : :
7741 : 53256 : enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
7742 : 53256 : STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
7743 : : /* If we have a condition reduction, see if we can simplify it further. */
7744 : 53256 : if (v_reduc_type == COND_REDUCTION)
7745 : : {
7746 : 659 : if (slp_node)
7747 : : return false;
7748 : :
7749 : : /* When the condition uses the reduction value in the condition, fail. */
7750 : 659 : if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
7751 : : {
7752 : 0 : if (dump_enabled_p ())
7753 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7754 : : "condition depends on previous iteration\n");
7755 : 0 : return false;
7756 : : }
7757 : :
7758 : 659 : if (reduc_chain_length == 1
7759 : 659 : && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in,
7760 : : OPTIMIZE_FOR_SPEED)
7761 : 630 : || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
7762 : : vectype_in,
7763 : : OPTIMIZE_FOR_SPEED)))
7764 : : {
7765 : 0 : if (dump_enabled_p ())
7766 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7767 : : "optimizing condition reduction with"
7768 : : " FOLD_EXTRACT_LAST.\n");
7769 : 0 : STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7770 : : }
7771 : 659 : else if (cond_reduc_dt == vect_induction_def)
7772 : : {
7773 : 103 : tree base
7774 : : = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7775 : 103 : tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7776 : :
7777 : 103 : gcc_assert (TREE_CODE (base) == INTEGER_CST
7778 : : && TREE_CODE (step) == INTEGER_CST);
7779 : 103 : cond_reduc_val = NULL_TREE;
7780 : 103 : enum tree_code cond_reduc_op_code = ERROR_MARK;
7781 : 103 : tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7782 : 103 : if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7783 : : ;
7784 : : /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
7785 : : above base; punt if base is the minimum value of the type for
7786 : : MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
7787 : 97 : else if (tree_int_cst_sgn (step) == -1)
7788 : : {
7789 : 20 : cond_reduc_op_code = MIN_EXPR;
7790 : 20 : if (tree_int_cst_sgn (base) == -1)
7791 : 0 : cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7792 : 20 : else if (tree_int_cst_lt (base,
7793 : 20 : TYPE_MAX_VALUE (TREE_TYPE (base))))
7794 : 20 : cond_reduc_val
7795 : 20 : = int_const_binop (PLUS_EXPR, base, integer_one_node);
7796 : : }
7797 : : else
7798 : : {
7799 : 77 : cond_reduc_op_code = MAX_EXPR;
7800 : 77 : if (tree_int_cst_sgn (base) == 1)
7801 : 12 : cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7802 : 65 : else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7803 : : base))
7804 : 65 : cond_reduc_val
7805 : 65 : = int_const_binop (MINUS_EXPR, base, integer_one_node);
7806 : : }
7807 : 97 : if (cond_reduc_val)
7808 : : {
7809 : 97 : if (dump_enabled_p ())
7810 : 64 : dump_printf_loc (MSG_NOTE, vect_location,
7811 : : "condition expression based on "
7812 : : "integer induction.\n");
7813 : 97 : STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
7814 : 97 : STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
7815 : 97 : = cond_reduc_val;
7816 : 97 : STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7817 : : }
7818 : : }
7819 : 556 : else if (cond_reduc_dt == vect_constant_def)
7820 : : {
7821 : 73 : enum vect_def_type cond_initial_dt;
7822 : 73 : tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7823 : 73 : vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7824 : 73 : if (cond_initial_dt == vect_constant_def
7825 : 85 : && types_compatible_p (TREE_TYPE (cond_initial_val),
7826 : 12 : TREE_TYPE (cond_reduc_val)))
7827 : : {
7828 : 12 : tree e = fold_binary (LE_EXPR, boolean_type_node,
7829 : : cond_initial_val, cond_reduc_val);
7830 : 12 : if (e && (integer_onep (e) || integer_zerop (e)))
7831 : : {
7832 : 12 : if (dump_enabled_p ())
7833 : 10 : dump_printf_loc (MSG_NOTE, vect_location,
7834 : : "condition expression based on "
7835 : : "compile time constant.\n");
7836 : : /* Record reduction code at analysis stage. */
7837 : 12 : STMT_VINFO_REDUC_CODE (reduc_info)
7838 : 20 : = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7839 : 12 : STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
7840 : : }
7841 : : }
7842 : : }
7843 : : }
7844 : :
7845 : 53256 : if (STMT_VINFO_LIVE_P (phi_info))
7846 : : return false;
7847 : :
7848 : 53256 : if (slp_node)
7849 : : ncopies = 1;
7850 : : else
7851 : 52059 : ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7852 : :
7853 : 52059 : gcc_assert (ncopies >= 1);
7854 : :
7855 : 53256 : poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7856 : :
7857 : 53256 : if (nested_cycle)
7858 : : {
7859 : 185 : gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
7860 : : == vect_double_reduction_def);
7861 : : double_reduc = true;
7862 : : }
7863 : :
7864 : : /* 4.2. Check support for the epilog operation.
7865 : :
7866 : : If STMT represents a reduction pattern, then the type of the
7867 : : reduction variable may be different than the type of the rest
7868 : : of the arguments. For example, consider the case of accumulation
7869 : : of shorts into an int accumulator; The original code:
7870 : : S1: int_a = (int) short_a;
7871 : : orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
7872 : :
7873 : : was replaced with:
7874 : : STMT: int_acc = widen_sum <short_a, int_acc>
7875 : :
7876 : : This means that:
7877 : : 1. The tree-code that is used to create the vector operation in the
7878 : : epilog code (that reduces the partial results) is not the
7879 : : tree-code of STMT, but is rather the tree-code of the original
7880 : : stmt from the pattern that STMT is replacing. I.e, in the example
7881 : : above we want to use 'widen_sum' in the loop, but 'plus' in the
7882 : : epilog.
7883 : : 2. The type (mode) we use to check available target support
7884 : : for the vector operation to be created in the *epilog*, is
7885 : : determined by the type of the reduction variable (in the example
7886 : : above we'd check this: optab_handler (plus_optab, vect_int_mode])).
7887 : : However the type (mode) we use to check available target support
7888 : : for the vector operation to be created *inside the loop*, is
7889 : : determined by the type of the other arguments to STMT (in the
7890 : : example we'd check this: optab_handler (widen_sum_optab,
7891 : : vect_short_mode)).
7892 : :
7893 : : This is contrary to "regular" reductions, in which the types of all
7894 : : the arguments are the same as the type of the reduction variable.
7895 : : For "regular" reductions we can therefore use the same vector type
7896 : : (and also the same tree-code) when generating the epilog code and
7897 : : when generating the code inside the loop. */
7898 : :
7899 : 53256 : code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
7900 : :
7901 : : /* If conversion might have created a conditional operation like
7902 : : IFN_COND_ADD already. Use the internal code for the following checks. */
7903 : 53256 : if (orig_code.is_internal_fn ())
7904 : : {
7905 : 3011 : tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
7906 : 3011 : orig_code = new_code != ERROR_MARK ? new_code : orig_code;
7907 : : }
7908 : :
7909 : 53256 : STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
7910 : :
7911 : 53256 : vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7912 : 53256 : if (reduction_type == TREE_CODE_REDUCTION)
7913 : : {
7914 : : /* Check whether it's ok to change the order of the computation.
7915 : : Generally, when vectorizing a reduction we change the order of the
7916 : : computation. This may change the behavior of the program in some
7917 : : cases, so we need to check that this is ok. One exception is when
7918 : : vectorizing an outer-loop: the inner-loop is executed sequentially,
7919 : : and therefore vectorizing reductions in the inner-loop during
7920 : : outer-loop vectorization is safe. Likewise when we are vectorizing
7921 : : a series of reductions using SLP and the VF is one the reductions
7922 : : are performed in scalar order. */
7923 : 52572 : if (slp_node
7924 : 1197 : && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7925 : 53443 : && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7926 : : ;
7927 : 52327 : else if (needs_fold_left_reduction_p (op.type, orig_code))
7928 : : {
7929 : : /* When vectorizing a reduction chain w/o SLP the reduction PHI
7930 : : is not directy used in stmt. */
7931 : 2921 : if (!only_slp_reduc_chain
7932 : 2921 : && reduc_chain_length != 1)
7933 : : {
7934 : 32 : if (dump_enabled_p ())
7935 : 10 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7936 : : "in-order reduction chain without SLP.\n");
7937 : 32 : return false;
7938 : : }
7939 : 2889 : STMT_VINFO_REDUC_TYPE (reduc_info)
7940 : 2889 : = reduction_type = FOLD_LEFT_REDUCTION;
7941 : : }
7942 : 49406 : else if (!commutative_binary_op_p (orig_code, op.type)
7943 : 49406 : || !associative_binary_op_p (orig_code, op.type))
7944 : : {
7945 : 71 : if (dump_enabled_p ())
7946 : 9 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7947 : : "reduction: not commutative/associative\n");
7948 : 71 : return false;
7949 : : }
7950 : : }
7951 : :
7952 : 53153 : if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7953 : 3702 : && ncopies > 1)
7954 : : {
7955 : 443 : if (dump_enabled_p ())
7956 : 87 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7957 : : "multiple types in double reduction or condition "
7958 : : "reduction or fold-left reduction.\n");
7959 : 443 : return false;
7960 : : }
7961 : :
7962 : 52710 : internal_fn reduc_fn = IFN_LAST;
7963 : 52710 : if (reduction_type == TREE_CODE_REDUCTION
7964 : 52710 : || reduction_type == FOLD_LEFT_REDUCTION
7965 : : || reduction_type == INTEGER_INDUC_COND_REDUCTION
7966 : 375 : || reduction_type == CONST_COND_REDUCTION)
7967 : : {
7968 : 52440 : if (reduction_type == FOLD_LEFT_REDUCTION
7969 : 52335 : ? fold_left_reduction_fn (orig_code, &reduc_fn)
7970 : 49650 : : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7971 : : {
7972 : 52118 : if (reduc_fn != IFN_LAST
7973 : 52118 : && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7974 : : OPTIMIZE_FOR_SPEED))
7975 : : {
7976 : 8404 : if (dump_enabled_p ())
7977 : 960 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7978 : : "reduc op not supported by target.\n");
7979 : :
7980 : 8404 : reduc_fn = IFN_LAST;
7981 : : }
7982 : : }
7983 : : else
7984 : : {
7985 : 322 : if (!nested_cycle || double_reduc)
7986 : : {
7987 : 322 : if (dump_enabled_p ())
7988 : 24 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7989 : : "no reduc code for scalar code.\n");
7990 : :
7991 : 322 : return false;
7992 : : }
7993 : : }
7994 : : }
7995 : 270 : else if (reduction_type == COND_REDUCTION)
7996 : : {
7997 : 270 : int scalar_precision
7998 : 270 : = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
7999 : 270 : cr_index_scalar_type = make_unsigned_type (scalar_precision);
8000 : 270 : cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
8001 : : vectype_out);
8002 : :
8003 : 270 : if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
8004 : : OPTIMIZE_FOR_SPEED))
8005 : 9 : reduc_fn = IFN_REDUC_MAX;
8006 : : }
8007 : 52388 : STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
8008 : :
8009 : 52388 : if (reduction_type != EXTRACT_LAST_REDUCTION
8010 : 52388 : && (!nested_cycle || double_reduc)
8011 : 52388 : && reduc_fn == IFN_LAST
8012 : 62799 : && !nunits_out.is_constant ())
8013 : : {
8014 : 0 : if (dump_enabled_p ())
8015 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8016 : : "missing target support for reduction on"
8017 : : " variable-length vectors.\n");
8018 : 0 : return false;
8019 : : }
8020 : :
8021 : : /* For SLP reductions, see if there is a neutral value we can use. */
8022 : 52388 : tree neutral_op = NULL_TREE;
8023 : 52388 : if (slp_node)
8024 : : {
8025 : 1191 : tree initial_value = NULL_TREE;
8026 : 1191 : if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
8027 : 320 : initial_value = vect_phi_initial_value (reduc_def_phi);
8028 : 1191 : neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8029 : : orig_code, initial_value);
8030 : : }
8031 : :
8032 : 52388 : if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
8033 : : {
8034 : : /* We can't support in-order reductions of code such as this:
8035 : :
8036 : : for (int i = 0; i < n1; ++i)
8037 : : for (int j = 0; j < n2; ++j)
8038 : : l += a[j];
8039 : :
8040 : : since GCC effectively transforms the loop when vectorizing:
8041 : :
8042 : : for (int i = 0; i < n1 / VF; ++i)
8043 : : for (int j = 0; j < n2; ++j)
8044 : : for (int k = 0; k < VF; ++k)
8045 : : l += a[j];
8046 : :
8047 : : which is a reassociation of the original operation. */
8048 : 32 : if (dump_enabled_p ())
8049 : 10 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8050 : : "in-order double reduction not supported.\n");
8051 : :
8052 : 32 : return false;
8053 : : }
8054 : :
8055 : 52356 : if (reduction_type == FOLD_LEFT_REDUCTION
8056 : 52356 : && slp_node
8057 : 52356 : && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8058 : : {
8059 : : /* We cannot use in-order reductions in this case because there is
8060 : : an implicit reassociation of the operations involved. */
8061 : 57 : if (dump_enabled_p ())
8062 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8063 : : "in-order unchained SLP reductions not supported.\n");
8064 : 57 : return false;
8065 : : }
8066 : :
8067 : : /* For double reductions, and for SLP reductions with a neutral value,
8068 : : we construct a variable-length initial vector by loading a vector
8069 : : full of the neutral value and then shift-and-inserting the start
8070 : : values into the low-numbered elements. */
8071 : 52299 : if ((double_reduc || neutral_op)
8072 : 1195 : && !nunits_out.is_constant ()
8073 : 52299 : && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
8074 : : vectype_out, OPTIMIZE_FOR_SPEED))
8075 : : {
8076 : 0 : if (dump_enabled_p ())
8077 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8078 : : "reduction on variable-length vectors requires"
8079 : : " target support for a vector-shift-and-insert"
8080 : : " operation.\n");
8081 : 0 : return false;
8082 : : }
8083 : :
8084 : : /* Check extra constraints for variable-length unchained SLP reductions. */
8085 : 52299 : if (slp_node
8086 : 1134 : && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
8087 : 53113 : && !nunits_out.is_constant ())
8088 : : {
8089 : : /* We checked above that we could build the initial vector when
8090 : : there's a neutral element value. Check here for the case in
8091 : : which each SLP statement has its own initial value and in which
8092 : : that value needs to be repeated for every instance of the
8093 : : statement within the initial vector. */
8094 : 0 : unsigned int group_size = SLP_TREE_LANES (slp_node);
8095 : 0 : if (!neutral_op
8096 : 0 : && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
8097 : 0 : TREE_TYPE (vectype_out)))
8098 : : {
8099 : 0 : if (dump_enabled_p ())
8100 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8101 : : "unsupported form of SLP reduction for"
8102 : : " variable-length vectors: cannot build"
8103 : : " initial vector.\n");
8104 : 0 : return false;
8105 : : }
8106 : : /* The epilogue code relies on the number of elements being a multiple
8107 : : of the group size. The duplicate-and-interleave approach to setting
8108 : : up the initial vector does too. */
8109 : 0 : if (!multiple_p (nunits_out, group_size))
8110 : : {
8111 : 0 : if (dump_enabled_p ())
8112 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8113 : : "unsupported form of SLP reduction for"
8114 : : " variable-length vectors: the vector size"
8115 : : " is not a multiple of the number of results.\n");
8116 : 0 : return false;
8117 : : }
8118 : : }
8119 : :
8120 : 52299 : if (reduction_type == COND_REDUCTION)
8121 : : {
8122 : 270 : widest_int ni;
8123 : :
8124 : 270 : if (! max_loop_iterations (loop, &ni))
8125 : : {
8126 : 0 : if (dump_enabled_p ())
8127 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
8128 : : "loop count not known, cannot create cond "
8129 : : "reduction.\n");
8130 : 0 : return false;
8131 : : }
8132 : : /* Convert backedges to iterations. */
8133 : 270 : ni += 1;
8134 : :
8135 : : /* The additional index will be the same type as the condition. Check
8136 : : that the loop can fit into this less one (because we'll use up the
8137 : : zero slot for when there are no matches). */
8138 : 270 : tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
8139 : 270 : if (wi::geu_p (ni, wi::to_widest (max_index)))
8140 : : {
8141 : 48 : if (dump_enabled_p ())
8142 : 21 : dump_printf_loc (MSG_NOTE, vect_location,
8143 : : "loop size is greater than data size.\n");
8144 : 48 : return false;
8145 : : }
8146 : 270 : }
8147 : :
8148 : : /* In case the vectorization factor (VF) is bigger than the number
8149 : : of elements that we can fit in a vectype (nunits), we have to generate
8150 : : more than one vector stmt - i.e - we need to "unroll" the
8151 : : vector stmt by a factor VF/nunits. For more details see documentation
8152 : : in vectorizable_operation. */
8153 : :
8154 : : /* If the reduction is used in an outer loop we need to generate
8155 : : VF intermediate results, like so (e.g. for ncopies=2):
8156 : : r0 = phi (init, r0)
8157 : : r1 = phi (init, r1)
8158 : : r0 = x0 + r0;
8159 : : r1 = x1 + r1;
8160 : : (i.e. we generate VF results in 2 registers).
8161 : : In this case we have a separate def-use cycle for each copy, and therefore
8162 : : for each copy we get the vector def for the reduction variable from the
8163 : : respective phi node created for this copy.
8164 : :
8165 : : Otherwise (the reduction is unused in the loop nest), we can combine
8166 : : together intermediate results, like so (e.g. for ncopies=2):
8167 : : r = phi (init, r)
8168 : : r = x0 + r;
8169 : : r = x1 + r;
8170 : : (i.e. we generate VF/2 results in a single register).
8171 : : In this case for each copy we get the vector def for the reduction variable
8172 : : from the vectorized reduction operation generated in the previous iteration.
8173 : :
8174 : : This only works when we see both the reduction PHI and its only consumer
8175 : : in vectorizable_reduction and there are no intermediate stmts
8176 : : participating. When unrolling we want each unrolled iteration to have its
8177 : : own reduction accumulator since one of the main goals of unrolling a
8178 : : reduction is to reduce the aggregate loop-carried latency. */
8179 : 52251 : if (ncopies > 1
8180 : 5482 : && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
8181 : 5482 : && reduc_chain_length == 1
8182 : 5272 : && loop_vinfo->suggested_unroll_factor == 1)
8183 : 52251 : single_defuse_cycle = true;
8184 : :
8185 : 52251 : if (single_defuse_cycle || lane_reduc_code_p)
8186 : : {
8187 : 5491 : gcc_assert (op.code != COND_EXPR);
8188 : :
8189 : : /* 4. Supportable by target? */
8190 : 5491 : bool ok = true;
8191 : :
8192 : : /* 4.1. check support for the operation in the loop
8193 : :
8194 : : This isn't necessary for the lane reduction codes, since they
8195 : : can only be produced by pattern matching, and it's up to the
8196 : : pattern matcher to test for support. The main reason for
8197 : : specifically skipping this step is to avoid rechecking whether
8198 : : mixed-sign dot-products can be implemented using signed
8199 : : dot-products. */
8200 : 5491 : machine_mode vec_mode = TYPE_MODE (vectype_in);
8201 : 5491 : if (!lane_reduc_code_p
8202 : 5491 : && !directly_supported_p (op.code, vectype_in, optab_vector))
8203 : : {
8204 : 2044 : if (dump_enabled_p ())
8205 : 160 : dump_printf (MSG_NOTE, "op not supported by target.\n");
8206 : 3354 : if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
8207 : 2044 : || !vect_can_vectorize_without_simd_p (op.code))
8208 : : ok = false;
8209 : : else
8210 : 886 : if (dump_enabled_p ())
8211 : 21 : dump_printf (MSG_NOTE, "proceeding using word mode.\n");
8212 : : }
8213 : :
8214 : 5491 : if (vect_emulated_vector_p (vectype_in)
8215 : 5491 : && !vect_can_vectorize_without_simd_p (op.code))
8216 : : {
8217 : 0 : if (dump_enabled_p ())
8218 : 0 : dump_printf (MSG_NOTE, "using word mode not possible.\n");
8219 : 0 : return false;
8220 : : }
8221 : :
8222 : : /* lane-reducing operations have to go through vect_transform_reduction.
8223 : : For the other cases try without the single cycle optimization. */
8224 : 5491 : if (!ok)
8225 : : {
8226 : 1158 : if (lane_reduc_code_p)
8227 : : return false;
8228 : : else
8229 : : single_defuse_cycle = false;
8230 : : }
8231 : : }
8232 : 52251 : STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
8233 : :
8234 : : /* If the reduction stmt is one of the patterns that have lane
8235 : : reduction embedded we cannot handle the case of ! single_defuse_cycle. */
8236 : 52251 : if ((ncopies > 1 && ! single_defuse_cycle)
8237 : 1368 : && lane_reduc_code_p)
8238 : : {
8239 : 0 : if (dump_enabled_p ())
8240 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8241 : : "multi def-use cycle not possible for lane-reducing "
8242 : : "reduction operation\n");
8243 : 0 : return false;
8244 : : }
8245 : :
8246 : 52251 : if (slp_node
8247 : 1122 : && !(!single_defuse_cycle
8248 : 1134 : && !lane_reduc_code_p
8249 : : && reduction_type != FOLD_LEFT_REDUCTION))
8250 : 264 : for (i = 0; i < (int) op.num_ops; i++)
8251 : 180 : if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
8252 : : {
8253 : 0 : if (dump_enabled_p ())
8254 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8255 : : "incompatible vector types for invariants\n");
8256 : 0 : return false;
8257 : : }
8258 : :
8259 : 1134 : if (slp_node)
8260 : 1134 : vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8261 : : else
8262 : : vec_num = 1;
8263 : :
8264 : 52251 : vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
8265 : : reduction_type, ncopies, cost_vec);
8266 : : /* Cost the reduction op inside the loop if transformed via
8267 : : vect_transform_reduction. Otherwise this is costed by the
8268 : : separate vectorizable_* routines. */
8269 : 52251 : if (single_defuse_cycle || lane_reduc_code_p)
8270 : : {
8271 : 4333 : int factor = 1;
8272 : 4333 : if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info))
8273 : : /* Three dot-products and a subtraction. */
8274 : 36 : factor = 4;
8275 : 4333 : record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
8276 : : stmt_info, 0, vect_body);
8277 : : }
8278 : :
8279 : 52251 : if (dump_enabled_p ()
8280 : 52251 : && reduction_type == FOLD_LEFT_REDUCTION)
8281 : 220 : dump_printf_loc (MSG_NOTE, vect_location,
8282 : : "using an in-order (fold-left) reduction.\n");
8283 : 52251 : STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
8284 : : /* All but single defuse-cycle optimized, lane-reducing and fold-left
8285 : : reductions go through their own vectorizable_* routines. */
8286 : 52251 : if (!single_defuse_cycle
8287 : : && !lane_reduc_code_p
8288 : 47918 : && reduction_type != FOLD_LEFT_REDUCTION)
8289 : : {
8290 : 45539 : stmt_vec_info tem
8291 : 45539 : = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
8292 : 45539 : if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
8293 : : {
8294 : 236 : gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
8295 : : tem = REDUC_GROUP_FIRST_ELEMENT (tem);
8296 : : }
8297 : 45539 : STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
8298 : 45539 : STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
8299 : 45539 : }
8300 : 6712 : else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8301 : : {
8302 : 6 : vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8303 : 6 : vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8304 : 6 : internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
8305 : :
8306 : 6 : if (reduction_type != FOLD_LEFT_REDUCTION
8307 : 0 : && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
8308 : 6 : && (cond_fn == IFN_LAST
8309 : 0 : || !direct_internal_fn_supported_p (cond_fn, vectype_in,
8310 : : OPTIMIZE_FOR_SPEED)))
8311 : : {
8312 : 0 : if (dump_enabled_p ())
8313 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8314 : : "can't operate on partial vectors because"
8315 : : " no conditional operation is available.\n");
8316 : 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8317 : : }
8318 : 6 : else if (reduction_type == FOLD_LEFT_REDUCTION
8319 : 6 : && reduc_fn == IFN_LAST
8320 : 12 : && !expand_vec_cond_expr_p (vectype_in,
8321 : : truth_type_for (vectype_in),
8322 : : SSA_NAME))
8323 : : {
8324 : 0 : if (dump_enabled_p ())
8325 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8326 : : "can't operate on partial vectors because"
8327 : : " no conditional operation is available.\n");
8328 : 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8329 : : }
8330 : 6 : else if (reduction_type == FOLD_LEFT_REDUCTION
8331 : 6 : && internal_fn_mask_index (reduc_fn) == -1
8332 : 6 : && FLOAT_TYPE_P (vectype_in)
8333 : 12 : && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
8334 : : {
8335 : 0 : if (dump_enabled_p ())
8336 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8337 : : "can't operate on partial vectors because"
8338 : : " signed zeros cannot be preserved.\n");
8339 : 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8340 : : }
8341 : : else
8342 : : {
8343 : 6 : internal_fn mask_reduc_fn
8344 : 6 : = get_masked_reduction_fn (reduc_fn, vectype_in);
8345 : :
8346 : 6 : if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
8347 : 0 : vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num,
8348 : : vectype_in, 1);
8349 : : else
8350 : 6 : vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
8351 : : vectype_in, NULL);
8352 : : }
8353 : : }
8354 : : return true;
8355 : : }
8356 : :
8357 : : /* STMT_INFO is a dot-product reduction whose multiplication operands
8358 : : have different signs. Emit a sequence to emulate the operation
8359 : : using a series of signed DOT_PROD_EXPRs and return the last
8360 : : statement generated. VEC_DEST is the result of the vector operation
8361 : : and VOP lists its inputs. */
8362 : :
8363 : : static gassign *
8364 : 18 : vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8365 : : gimple_stmt_iterator *gsi, tree vec_dest,
8366 : : tree vop[3])
8367 : : {
8368 : 18 : tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
8369 : 18 : tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
8370 : 18 : tree narrow_elttype = TREE_TYPE (narrow_vectype);
8371 : 18 : gimple *new_stmt;
8372 : :
8373 : : /* Make VOP[0] the unsigned operand VOP[1] the signed operand. */
8374 : 18 : if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
8375 : 12 : std::swap (vop[0], vop[1]);
8376 : :
8377 : : /* Convert all inputs to signed types. */
8378 : 72 : for (int i = 0; i < 3; ++i)
8379 : 54 : if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
8380 : : {
8381 : 30 : tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
8382 : 30 : new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
8383 : 30 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8384 : 30 : vop[i] = tmp;
8385 : : }
8386 : :
8387 : : /* In the comments below we assume 8-bit inputs for simplicity,
8388 : : but the approach works for any full integer type. */
8389 : :
8390 : : /* Create a vector of -128. */
8391 : 18 : tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
8392 : 18 : tree min_narrow = build_vector_from_val (narrow_vectype,
8393 : : min_narrow_elttype);
8394 : :
8395 : : /* Create a vector of 64. */
8396 : 18 : auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
8397 : 18 : tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
8398 : 18 : half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
8399 : :
8400 : : /* Emit: SUB_RES = VOP[0] - 128. */
8401 : 18 : tree sub_res = make_ssa_name (narrow_vectype);
8402 : 18 : new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
8403 : 18 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8404 : :
8405 : : /* Emit:
8406 : :
8407 : : STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
8408 : : STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
8409 : : STAGE3 = DOT_PROD_EXPR <SUB_RES, -128, STAGE2>;
8410 : :
8411 : : on the basis that x * y == (x - 128) * y + 64 * y + 64 * y
8412 : : Doing the two 64 * y steps first allows more time to compute x. */
8413 : 18 : tree stage1 = make_ssa_name (wide_vectype);
8414 : 18 : new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
8415 : : vop[1], half_narrow, vop[2]);
8416 : 18 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8417 : :
8418 : 18 : tree stage2 = make_ssa_name (wide_vectype);
8419 : 18 : new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
8420 : : vop[1], half_narrow, stage1);
8421 : 18 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8422 : :
8423 : 18 : tree stage3 = make_ssa_name (wide_vectype);
8424 : 18 : new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
8425 : : sub_res, vop[1], stage2);
8426 : 18 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8427 : :
8428 : : /* Convert STAGE3 to the reduction type. */
8429 : 18 : return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
8430 : 18 : }
8431 : :
8432 : : /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
8433 : : value. */
8434 : :
8435 : : bool
8436 : 1650 : vect_transform_reduction (loop_vec_info loop_vinfo,
8437 : : stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
8438 : : gimple **vec_stmt, slp_tree slp_node)
8439 : : {
8440 : 1650 : tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8441 : 1650 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8442 : 1650 : int i;
8443 : 1650 : int ncopies;
8444 : 1650 : int vec_num;
8445 : :
8446 : 1650 : stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8447 : 1650 : gcc_assert (reduc_info->is_reduc_info);
8448 : :
8449 : 1650 : if (nested_in_vect_loop_p (loop, stmt_info))
8450 : : {
8451 : 0 : loop = loop->inner;
8452 : 0 : gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
8453 : : }
8454 : :
8455 : 1650 : gimple_match_op op;
8456 : 1650 : if (!gimple_extract_op (stmt_info->stmt, &op))
8457 : 0 : gcc_unreachable ();
8458 : :
8459 : : /* All uses but the last are expected to be defined in the loop.
8460 : : The last use is the reduction variable. In case of nested cycle this
8461 : : assumption is not true: we use reduc_index to record the index of the
8462 : : reduction variable. */
8463 : 1650 : stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
8464 : 1650 : gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
8465 : 1650 : int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
8466 : 1650 : tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8467 : :
8468 : 1650 : if (slp_node)
8469 : : {
8470 : 60 : ncopies = 1;
8471 : 60 : vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8472 : : }
8473 : : else
8474 : : {
8475 : 1590 : ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8476 : 1590 : vec_num = 1;
8477 : : }
8478 : :
8479 : 1650 : code_helper code = canonicalize_code (op.code, op.type);
8480 : 1650 : internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
8481 : :
8482 : 1650 : vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8483 : 1650 : vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8484 : 1650 : bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
8485 : :
8486 : : /* Transform. */
8487 : 1650 : tree new_temp = NULL_TREE;
8488 : 1650 : auto_vec<tree> vec_oprnds0;
8489 : 1650 : auto_vec<tree> vec_oprnds1;
8490 : 1650 : auto_vec<tree> vec_oprnds2;
8491 : 1650 : tree def0;
8492 : :
8493 : 1650 : if (dump_enabled_p ())
8494 : 508 : dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
8495 : :
8496 : : /* FORNOW: Multiple types are not supported for condition. */
8497 : 1650 : if (code == COND_EXPR)
8498 : 0 : gcc_assert (ncopies == 1);
8499 : :
8500 : : /* A binary COND_OP reduction must have the same definition and else
8501 : : value. */
8502 : 1888 : bool cond_fn_p = code.is_internal_fn ()
8503 : 238 : && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
8504 : 238 : if (cond_fn_p)
8505 : : {
8506 : 238 : gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
8507 : : || code == IFN_COND_MUL || code == IFN_COND_AND
8508 : : || code == IFN_COND_IOR || code == IFN_COND_XOR);
8509 : 238 : gcc_assert (op.num_ops == 4
8510 : : && (op.ops[reduc_index]
8511 : : == op.ops[internal_fn_else_index ((internal_fn) code)]));
8512 : : }
8513 : :
8514 : 1650 : bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8515 : :
8516 : 1650 : vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
8517 : 1650 : if (reduction_type == FOLD_LEFT_REDUCTION)
8518 : : {
8519 : 641 : internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
8520 : 641 : gcc_assert (code.is_tree_code () || cond_fn_p);
8521 : 641 : return vectorize_fold_left_reduction
8522 : 641 : (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
8523 : 641 : code, reduc_fn, op.ops, op.num_ops, vectype_in,
8524 : 641 : reduc_index, masks, lens);
8525 : : }
8526 : :
8527 : 1009 : bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
8528 : 1009 : gcc_assert (single_defuse_cycle
8529 : : || code == DOT_PROD_EXPR
8530 : : || code == WIDEN_SUM_EXPR
8531 : : || code == SAD_EXPR);
8532 : :
8533 : : /* Create the destination vector */
8534 : 1009 : tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
8535 : 1009 : tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
8536 : :
8537 : : /* Get NCOPIES vector definitions for all operands except the reduction
8538 : : definition. */
8539 : 1009 : if (!cond_fn_p)
8540 : : {
8541 : 1293 : vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8542 : 775 : single_defuse_cycle && reduc_index == 0
8543 : : ? NULL_TREE : op.ops[0], &vec_oprnds0,
8544 : 775 : single_defuse_cycle && reduc_index == 1
8545 : : ? NULL_TREE : op.ops[1], &vec_oprnds1,
8546 : 775 : op.num_ops == 3
8547 : 134 : && !(single_defuse_cycle && reduc_index == 2)
8548 : : ? op.ops[2] : NULL_TREE, &vec_oprnds2);
8549 : : }
8550 : : else
8551 : : {
8552 : : /* For a conditional operation pass the truth type as mask
8553 : : vectype. */
8554 : 234 : gcc_assert (single_defuse_cycle
8555 : : && (reduc_index == 1 || reduc_index == 2));
8556 : 234 : vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8557 : : op.ops[0], truth_type_for (vectype_in), &vec_oprnds0,
8558 : : reduc_index == 1 ? NULL_TREE : op.ops[1],
8559 : : NULL_TREE, &vec_oprnds1,
8560 : : reduc_index == 2 ? NULL_TREE : op.ops[2],
8561 : : NULL_TREE, &vec_oprnds2);
8562 : : }
8563 : :
8564 : : /* For single def-use cycles get one copy of the vectorized reduction
8565 : : definition. */
8566 : 1009 : if (single_defuse_cycle)
8567 : : {
8568 : 881 : gcc_assert (!slp_node);
8569 : 887 : vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8570 : : op.ops[reduc_index],
8571 : : reduc_index == 0 ? &vec_oprnds0
8572 : : : (reduc_index == 1 ? &vec_oprnds1
8573 : : : &vec_oprnds2));
8574 : : }
8575 : :
8576 : 1009 : bool emulated_mixed_dot_prod
8577 : 1009 : = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
8578 : 3757 : FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
8579 : : {
8580 : 2748 : gimple *new_stmt;
8581 : 2748 : tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
8582 : 2748 : if (masked_loop_p && !mask_by_cond_expr)
8583 : : {
8584 : : /* No conditional ifns have been defined for dot-product yet. */
8585 : 0 : gcc_assert (code != DOT_PROD_EXPR);
8586 : :
8587 : : /* Make sure that the reduction accumulator is vop[0]. */
8588 : 0 : if (reduc_index == 1)
8589 : : {
8590 : 0 : gcc_assert (commutative_binary_op_p (code, op.type));
8591 : 0 : std::swap (vop[0], vop[1]);
8592 : : }
8593 : 0 : tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8594 : 0 : vec_num * ncopies, vectype_in, i);
8595 : 0 : gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
8596 : : vop[0], vop[1], vop[0]);
8597 : 0 : new_temp = make_ssa_name (vec_dest, call);
8598 : 0 : gimple_call_set_lhs (call, new_temp);
8599 : 0 : gimple_call_set_nothrow (call, true);
8600 : 0 : vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
8601 : 0 : new_stmt = call;
8602 : : }
8603 : : else
8604 : : {
8605 : 2748 : if (op.num_ops >= 3)
8606 : 752 : vop[2] = vec_oprnds2[i];
8607 : :
8608 : 2748 : if (masked_loop_p && mask_by_cond_expr)
8609 : : {
8610 : 0 : tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8611 : 0 : vec_num * ncopies, vectype_in, i);
8612 : 0 : build_vect_cond_expr (code, vop, mask, gsi);
8613 : : }
8614 : :
8615 : 2748 : if (emulated_mixed_dot_prod)
8616 : 18 : new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
8617 : : vec_dest, vop);
8618 : :
8619 : 3342 : else if (code.is_internal_fn () && !cond_fn_p)
8620 : 0 : new_stmt = gimple_build_call_internal (internal_fn (code),
8621 : : op.num_ops,
8622 : : vop[0], vop[1], vop[2]);
8623 : 3342 : else if (code.is_internal_fn () && cond_fn_p)
8624 : 612 : new_stmt = gimple_build_call_internal (internal_fn (code),
8625 : : op.num_ops,
8626 : : vop[0], vop[1], vop[2],
8627 : : vop[1]);
8628 : : else
8629 : 2118 : new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
8630 : : vop[0], vop[1], vop[2]);
8631 : 2748 : new_temp = make_ssa_name (vec_dest, new_stmt);
8632 : 2748 : gimple_set_lhs (new_stmt, new_temp);
8633 : 2748 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8634 : : }
8635 : :
8636 : 2748 : if (slp_node)
8637 : 11 : slp_node->push_vec_def (new_stmt);
8638 : 2737 : else if (single_defuse_cycle
8639 : 2620 : && i < ncopies - 1)
8640 : : {
8641 : 1739 : if (reduc_index == 0)
8642 : 538 : vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
8643 : 1201 : else if (reduc_index == 1)
8644 : 1195 : vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
8645 : 6 : else if (reduc_index == 2)
8646 : 6 : vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
8647 : : }
8648 : : else
8649 : 998 : STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8650 : : }
8651 : :
8652 : 1009 : if (!slp_node)
8653 : 998 : *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8654 : :
8655 : : return true;
8656 : 1650 : }
8657 : :
8658 : : /* Transform phase of a cycle PHI. */
8659 : :
8660 : : bool
8661 : 17675 : vect_transform_cycle_phi (loop_vec_info loop_vinfo,
8662 : : stmt_vec_info stmt_info, gimple **vec_stmt,
8663 : : slp_tree slp_node, slp_instance slp_node_instance)
8664 : : {
8665 : 17675 : tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8666 : 17675 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8667 : 17675 : int i;
8668 : 17675 : int ncopies;
8669 : 17675 : int j;
8670 : 17675 : bool nested_cycle = false;
8671 : 17675 : int vec_num;
8672 : :
8673 : 17760 : if (nested_in_vect_loop_p (loop, stmt_info))
8674 : : {
8675 : : loop = loop->inner;
8676 : : nested_cycle = true;
8677 : : }
8678 : :
8679 : 17675 : stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
8680 : 17675 : reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
8681 : 17675 : stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8682 : 17675 : gcc_assert (reduc_info->is_reduc_info);
8683 : :
8684 : 17675 : if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
8685 : 17675 : || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
8686 : : /* Leave the scalar phi in place. */
8687 : : return true;
8688 : :
8689 : 17034 : tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8690 : : /* For a nested cycle we do not fill the above. */
8691 : 17034 : if (!vectype_in)
8692 : 366 : vectype_in = STMT_VINFO_VECTYPE (stmt_info);
8693 : 366 : gcc_assert (vectype_in);
8694 : :
8695 : 17034 : if (slp_node)
8696 : : {
8697 : : /* The size vect_schedule_slp_instance computes is off for us. */
8698 : 1296 : vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
8699 : 648 : * SLP_TREE_LANES (slp_node), vectype_in);
8700 : 648 : ncopies = 1;
8701 : : }
8702 : : else
8703 : : {
8704 : 16386 : vec_num = 1;
8705 : 16386 : ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8706 : : }
8707 : :
8708 : : /* Check whether we should use a single PHI node and accumulate
8709 : : vectors to one before the backedge. */
8710 : 17034 : if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
8711 : 881 : ncopies = 1;
8712 : :
8713 : : /* Create the destination vector */
8714 : 17034 : gphi *phi = as_a <gphi *> (stmt_info->stmt);
8715 : 17034 : tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
8716 : : vectype_out);
8717 : :
8718 : : /* Get the loop-entry arguments. */
8719 : 17034 : tree vec_initial_def = NULL_TREE;
8720 : 17034 : auto_vec<tree> vec_initial_defs;
8721 : 17034 : if (slp_node)
8722 : : {
8723 : 648 : vec_initial_defs.reserve (vec_num);
8724 : 648 : if (nested_cycle)
8725 : : {
8726 : 16 : unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
8727 : 16 : vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
8728 : : &vec_initial_defs);
8729 : : }
8730 : : else
8731 : : {
8732 : 632 : gcc_assert (slp_node == slp_node_instance->reduc_phis);
8733 : 632 : vec<tree> &initial_values = reduc_info->reduc_initial_values;
8734 : 632 : vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
8735 : :
8736 : 632 : unsigned int num_phis = stmts.length ();
8737 : 632 : if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
8738 : 124 : num_phis = 1;
8739 : 632 : initial_values.reserve (num_phis);
8740 : 1962 : for (unsigned int i = 0; i < num_phis; ++i)
8741 : : {
8742 : 1330 : gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
8743 : 1330 : initial_values.quick_push (vect_phi_initial_value (this_phi));
8744 : : }
8745 : 632 : if (vec_num == 1)
8746 : 525 : vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8747 : 632 : if (!initial_values.is_empty ())
8748 : : {
8749 : 632 : tree initial_value
8750 : 756 : = (num_phis == 1 ? initial_values[0] : NULL_TREE);
8751 : 632 : code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8752 : 632 : tree neutral_op
8753 : 632 : = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8754 : : code, initial_value);
8755 : 1264 : get_initial_defs_for_reduction (loop_vinfo, reduc_info,
8756 : : &vec_initial_defs, vec_num,
8757 : : stmts.length (), neutral_op);
8758 : : }
8759 : : }
8760 : : }
8761 : : else
8762 : : {
8763 : : /* Get at the scalar def before the loop, that defines the initial
8764 : : value of the reduction variable. */
8765 : 16386 : tree initial_def = vect_phi_initial_value (phi);
8766 : 16386 : reduc_info->reduc_initial_values.safe_push (initial_def);
8767 : : /* Optimize: if initial_def is for REDUC_MAX smaller than the base
8768 : : and we can't use zero for induc_val, use initial_def. Similarly
8769 : : for REDUC_MIN and initial_def larger than the base. */
8770 : 16386 : if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
8771 : : {
8772 : 66 : tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
8773 : 66 : if (TREE_CODE (initial_def) == INTEGER_CST
8774 : 64 : && !integer_zerop (induc_val)
8775 : 130 : && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
8776 : 44 : && tree_int_cst_lt (initial_def, induc_val))
8777 : 61 : || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
8778 : 20 : && tree_int_cst_lt (induc_val, initial_def))))
8779 : : {
8780 : 3 : induc_val = initial_def;
8781 : : /* Communicate we used the initial_def to epilouge
8782 : : generation. */
8783 : 3 : STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
8784 : : }
8785 : 66 : vec_initial_def = build_vector_from_val (vectype_out, induc_val);
8786 : : }
8787 : 16320 : else if (nested_cycle)
8788 : : {
8789 : : /* Do not use an adjustment def as that case is not supported
8790 : : correctly if ncopies is not one. */
8791 : 405 : vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
8792 : : ncopies, initial_def,
8793 : : &vec_initial_defs);
8794 : : }
8795 : 15915 : else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
8796 : 15909 : || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
8797 : : /* Fill the initial vector with the initial scalar value. */
8798 : 69 : vec_initial_def
8799 : 69 : = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
8800 : : initial_def, initial_def);
8801 : : else
8802 : : {
8803 : 15846 : if (ncopies == 1)
8804 : 15803 : vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8805 : 32232 : if (!reduc_info->reduc_initial_values.is_empty ())
8806 : : {
8807 : 15672 : initial_def = reduc_info->reduc_initial_values[0];
8808 : 15672 : code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8809 : 15672 : tree neutral_op
8810 : 15672 : = neutral_op_for_reduction (TREE_TYPE (initial_def),
8811 : : code, initial_def);
8812 : 15672 : gcc_assert (neutral_op);
8813 : : /* Try to simplify the vector initialization by applying an
8814 : : adjustment after the reduction has been performed. */
8815 : 15672 : if (!reduc_info->reused_accumulator
8816 : 11964 : && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8817 : 27581 : && !operand_equal_p (neutral_op, initial_def))
8818 : : {
8819 : 8344 : STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
8820 : 8344 : = initial_def;
8821 : 8344 : initial_def = neutral_op;
8822 : : }
8823 : 15672 : vec_initial_def
8824 : 15672 : = get_initial_def_for_reduction (loop_vinfo, reduc_info,
8825 : : initial_def, neutral_op);
8826 : : }
8827 : : }
8828 : : }
8829 : :
8830 : 17034 : if (vec_initial_def)
8831 : : {
8832 : 15807 : vec_initial_defs.create (ncopies);
8833 : 47476 : for (i = 0; i < ncopies; ++i)
8834 : 15862 : vec_initial_defs.quick_push (vec_initial_def);
8835 : : }
8836 : :
8837 : 17034 : if (auto *accumulator = reduc_info->reused_accumulator)
8838 : : {
8839 : 4018 : tree def = accumulator->reduc_input;
8840 : 4018 : if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8841 : : {
8842 : 4018 : unsigned int nreduc;
8843 : 8036 : bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
8844 : 4018 : (TREE_TYPE (def)),
8845 : 4018 : TYPE_VECTOR_SUBPARTS (vectype_out),
8846 : : &nreduc);
8847 : 4018 : gcc_assert (res);
8848 : 4018 : gimple_seq stmts = NULL;
8849 : : /* Reduce the single vector to a smaller one. */
8850 : 4018 : if (nreduc != 1)
8851 : : {
8852 : : /* Perform the reduction in the appropriate type. */
8853 : 4018 : tree rvectype = vectype_out;
8854 : 4018 : if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
8855 : 4018 : TREE_TYPE (TREE_TYPE (def))))
8856 : 844 : rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
8857 : : TYPE_VECTOR_SUBPARTS
8858 : 1688 : (vectype_out));
8859 : 4018 : def = vect_create_partial_epilog (def, rvectype,
8860 : : STMT_VINFO_REDUC_CODE
8861 : : (reduc_info),
8862 : : &stmts);
8863 : : }
8864 : : /* The epilogue loop might use a different vector mode, like
8865 : : VNx2DI vs. V2DI. */
8866 : 4018 : if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
8867 : : {
8868 : 0 : tree reduc_type = build_vector_type_for_mode
8869 : 0 : (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
8870 : 0 : def = gimple_convert (&stmts, reduc_type, def);
8871 : : }
8872 : : /* Adjust the input so we pick up the partially reduced value
8873 : : for the skip edge in vect_create_epilog_for_reduction. */
8874 : 4018 : accumulator->reduc_input = def;
8875 : : /* And the reduction could be carried out using a different sign. */
8876 : 4018 : if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8877 : 844 : def = gimple_convert (&stmts, vectype_out, def);
8878 : 4018 : if (loop_vinfo->main_loop_edge)
8879 : : {
8880 : : /* While we'd like to insert on the edge this will split
8881 : : blocks and disturb bookkeeping, we also will eventually
8882 : : need this on the skip edge. Rely on sinking to
8883 : : fixup optimal placement and insert in the pred. */
8884 : 3844 : gimple_stmt_iterator gsi
8885 : 3844 : = gsi_last_bb (loop_vinfo->main_loop_edge->src);
8886 : : /* Insert before a cond that eventually skips the
8887 : : epilogue. */
8888 : 3844 : if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
8889 : 3831 : gsi_prev (&gsi);
8890 : 3844 : gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
8891 : : }
8892 : : else
8893 : 174 : gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
8894 : : stmts);
8895 : : }
8896 : 4018 : if (loop_vinfo->main_loop_edge)
8897 : 3844 : vec_initial_defs[0]
8898 : 3844 : = vect_get_main_loop_result (loop_vinfo, def,
8899 : 3844 : vec_initial_defs[0]);
8900 : : else
8901 : 174 : vec_initial_defs.safe_push (def);
8902 : : }
8903 : :
8904 : : /* Generate the reduction PHIs upfront. */
8905 : 34455 : for (i = 0; i < vec_num; i++)
8906 : : {
8907 : 17421 : tree vec_init_def = vec_initial_defs[i];
8908 : 34995 : for (j = 0; j < ncopies; j++)
8909 : : {
8910 : : /* Create the reduction-phi that defines the reduction
8911 : : operand. */
8912 : 17574 : gphi *new_phi = create_phi_node (vec_dest, loop->header);
8913 : :
8914 : : /* Set the loop-entry arg of the reduction-phi. */
8915 : 17574 : if (j != 0 && nested_cycle)
8916 : 98 : vec_init_def = vec_initial_defs[j];
8917 : 17574 : add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
8918 : : UNKNOWN_LOCATION);
8919 : :
8920 : : /* The loop-latch arg is set in epilogue processing. */
8921 : :
8922 : 17574 : if (slp_node)
8923 : 1035 : slp_node->push_vec_def (new_phi);
8924 : : else
8925 : : {
8926 : 16539 : if (j == 0)
8927 : 16386 : *vec_stmt = new_phi;
8928 : 16539 : STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8929 : : }
8930 : : }
8931 : : }
8932 : :
8933 : 17034 : return true;
8934 : 17034 : }
8935 : :
8936 : : /* Vectorizes LC PHIs. */
8937 : :
8938 : : bool
8939 : 46345 : vectorizable_lc_phi (loop_vec_info loop_vinfo,
8940 : : stmt_vec_info stmt_info, gimple **vec_stmt,
8941 : : slp_tree slp_node)
8942 : : {
8943 : 46345 : if (!loop_vinfo
8944 : 46345 : || !is_a <gphi *> (stmt_info->stmt)
8945 : 47288 : || gimple_phi_num_args (stmt_info->stmt) != 1)
8946 : : return false;
8947 : :
8948 : 817 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8949 : 148 : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8950 : : return false;
8951 : :
8952 : 817 : if (!vec_stmt) /* transformation not required. */
8953 : : {
8954 : : /* Deal with copies from externs or constants that disguise as
8955 : : loop-closed PHI nodes (PR97886). */
8956 : 480 : if (slp_node
8957 : 480 : && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
8958 : : SLP_TREE_VECTYPE (slp_node)))
8959 : : {
8960 : 0 : if (dump_enabled_p ())
8961 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8962 : : "incompatible vector types for invariants\n");
8963 : 0 : return false;
8964 : : }
8965 : 480 : STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
8966 : 480 : return true;
8967 : : }
8968 : :
8969 : 337 : tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8970 : 337 : tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8971 : 337 : basic_block bb = gimple_bb (stmt_info->stmt);
8972 : 337 : edge e = single_pred_edge (bb);
8973 : 337 : tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8974 : 337 : auto_vec<tree> vec_oprnds;
8975 : 337 : vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
8976 : 317 : !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
8977 : 337 : gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
8978 : 1516 : for (unsigned i = 0; i < vec_oprnds.length (); i++)
8979 : : {
8980 : : /* Create the vectorized LC PHI node. */
8981 : 421 : gphi *new_phi = create_phi_node (vec_dest, bb);
8982 : 421 : add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
8983 : 421 : if (slp_node)
8984 : 70 : slp_node->push_vec_def (new_phi);
8985 : : else
8986 : 351 : STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8987 : : }
8988 : 337 : if (!slp_node)
8989 : 317 : *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8990 : :
8991 : 337 : return true;
8992 : 337 : }
8993 : :
8994 : : /* Vectorizes PHIs. */
8995 : :
8996 : : bool
8997 : 161504 : vectorizable_phi (vec_info *,
8998 : : stmt_vec_info stmt_info, gimple **vec_stmt,
8999 : : slp_tree slp_node, stmt_vector_for_cost *cost_vec)
9000 : : {
9001 : 161504 : if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
9002 : : return false;
9003 : :
9004 : 80801 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
9005 : : return false;
9006 : :
9007 : 80801 : tree vectype = SLP_TREE_VECTYPE (slp_node);
9008 : :
9009 : 80801 : if (!vec_stmt) /* transformation not required. */
9010 : : {
9011 : : slp_tree child;
9012 : : unsigned i;
9013 : 191855 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
9014 : 131142 : if (!child)
9015 : : {
9016 : 0 : if (dump_enabled_p ())
9017 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9018 : : "PHI node with unvectorized backedge def\n");
9019 : 0 : return false;
9020 : : }
9021 : 131142 : else if (!vect_maybe_update_slp_op_vectype (child, vectype))
9022 : : {
9023 : 19 : if (dump_enabled_p ())
9024 : 2 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9025 : : "incompatible vector types for invariants\n");
9026 : 19 : return false;
9027 : : }
9028 : 131123 : else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
9029 : 131123 : && !useless_type_conversion_p (vectype,
9030 : : SLP_TREE_VECTYPE (child)))
9031 : : {
9032 : : /* With bools we can have mask and non-mask precision vectors
9033 : : or different non-mask precisions. while pattern recog is
9034 : : supposed to guarantee consistency here bugs in it can cause
9035 : : mismatches (PR103489 and PR103800 for example).
9036 : : Deal with them here instead of ICEing later. */
9037 : 13 : if (dump_enabled_p ())
9038 : 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9039 : : "incompatible vector type setup from "
9040 : : "bool pattern detection\n");
9041 : 13 : return false;
9042 : : }
9043 : :
9044 : : /* For single-argument PHIs assume coalescing which means zero cost
9045 : : for the scalar and the vector PHIs. This avoids artificially
9046 : : favoring the vector path (but may pessimize it in some cases). */
9047 : 60713 : if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
9048 : 51896 : record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
9049 : : vector_stmt, stmt_info, vectype, 0, vect_body);
9050 : 60713 : STMT_VINFO_TYPE (stmt_info) = phi_info_type;
9051 : 60713 : return true;
9052 : : }
9053 : :
9054 : 20056 : tree scalar_dest = gimple_phi_result (stmt_info->stmt);
9055 : 20056 : basic_block bb = gimple_bb (stmt_info->stmt);
9056 : 20056 : tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
9057 : 20056 : auto_vec<gphi *> new_phis;
9058 : 67755 : for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
9059 : : {
9060 : 47699 : slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
9061 : :
9062 : : /* Skip not yet vectorized defs. */
9063 : 48190 : if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
9064 : 47699 : && SLP_TREE_VEC_DEFS (child).is_empty ())
9065 : 491 : continue;
9066 : :
9067 : 47208 : auto_vec<tree> vec_oprnds;
9068 : 47208 : vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
9069 : 47208 : if (!new_phis.exists ())
9070 : : {
9071 : 20056 : new_phis.create (vec_oprnds.length ());
9072 : 83234 : for (unsigned j = 0; j < vec_oprnds.length (); j++)
9073 : : {
9074 : : /* Create the vectorized LC PHI node. */
9075 : 21561 : new_phis.quick_push (create_phi_node (vec_dest, bb));
9076 : 21561 : slp_node->push_vec_def (new_phis[j]);
9077 : : }
9078 : : }
9079 : 47208 : edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
9080 : 195040 : for (unsigned j = 0; j < vec_oprnds.length (); j++)
9081 : 50312 : add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
9082 : 47208 : }
9083 : : /* We should have at least one already vectorized child. */
9084 : 20056 : gcc_assert (new_phis.exists ());
9085 : :
9086 : 20056 : return true;
9087 : 20056 : }
9088 : :
9089 : : /* Vectorizes first order recurrences. An overview of the transformation
9090 : : is described below. Suppose we have the following loop.
9091 : :
9092 : : int t = 0;
9093 : : for (int i = 0; i < n; ++i)
9094 : : {
9095 : : b[i] = a[i] - t;
9096 : : t = a[i];
9097 : : }
9098 : :
9099 : : There is a first-order recurrence on 'a'. For this loop, the scalar IR
9100 : : looks (simplified) like:
9101 : :
9102 : : scalar.preheader:
9103 : : init = 0;
9104 : :
9105 : : scalar.body:
9106 : : i = PHI <0(scalar.preheader), i+1(scalar.body)>
9107 : : _2 = PHI <(init(scalar.preheader), <_1(scalar.body)>
9108 : : _1 = a[i]
9109 : : b[i] = _1 - _2
9110 : : if (i < n) goto scalar.body
9111 : :
9112 : : In this example, _2 is a recurrence because it's value depends on the
9113 : : previous iteration. We vectorize this as (VF = 4)
9114 : :
9115 : : vector.preheader:
9116 : : vect_init = vect_cst(..., ..., ..., 0)
9117 : :
9118 : : vector.body
9119 : : i = PHI <0(vector.preheader), i+4(vector.body)>
9120 : : vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
9121 : : vect_2 = a[i, i+1, i+2, i+3];
9122 : : vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
9123 : : b[i, i+1, i+2, i+3] = vect_2 - vect_3
9124 : : if (..) goto vector.body
9125 : :
9126 : : In this function, vectorizable_recurr, we code generate both the
9127 : : vector PHI node and the permute since those together compute the
9128 : : vectorized value of the scalar PHI. We do not yet have the
9129 : : backedge value to fill in there nor into the vec_perm. Those
9130 : : are filled in maybe_set_vectorized_backedge_value and
9131 : : vect_schedule_scc.
9132 : :
9133 : : TODO: Since the scalar loop does not have a use of the recurrence
9134 : : outside of the loop the natural way to implement peeling via
9135 : : vectorizing the live value doesn't work. For now peeling of loops
9136 : : with a recurrence is not implemented. For SLP the supported cases
9137 : : are restricted to those requiring a single vector recurrence PHI. */
9138 : :
9139 : : bool
9140 : 45724 : vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9141 : : gimple **vec_stmt, slp_tree slp_node,
9142 : : stmt_vector_for_cost *cost_vec)
9143 : : {
9144 : 45724 : if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
9145 : : return false;
9146 : :
9147 : 322 : gphi *phi = as_a<gphi *> (stmt_info->stmt);
9148 : :
9149 : : /* So far we only support first-order recurrence auto-vectorization. */
9150 : 322 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
9151 : : return false;
9152 : :
9153 : 208 : tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9154 : 208 : unsigned ncopies;
9155 : 208 : if (slp_node)
9156 : 24 : ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9157 : : else
9158 : 184 : ncopies = vect_get_num_copies (loop_vinfo, vectype);
9159 : 208 : poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9160 : 208 : unsigned dist = slp_node ? SLP_TREE_LANES (slp_node) : 1;
9161 : : /* We need to be able to make progress with a single vector. */
9162 : 208 : if (maybe_gt (dist * 2, nunits))
9163 : : {
9164 : 23 : if (dump_enabled_p ())
9165 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9166 : : "first order recurrence exceeds half of "
9167 : : "a vector\n");
9168 : 23 : return false;
9169 : : }
9170 : :
9171 : : /* First-order recurrence autovectorization needs to handle permutation
9172 : : with indices = [nunits-1, nunits, nunits+1, ...]. */
9173 : 185 : vec_perm_builder sel (nunits, 1, 3);
9174 : 740 : for (int i = 0; i < 3; ++i)
9175 : 555 : sel.quick_push (nunits - dist + i);
9176 : 185 : vec_perm_indices indices (sel, 2, nunits);
9177 : :
9178 : 185 : if (!vec_stmt) /* transformation not required. */
9179 : : {
9180 : 151 : if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
9181 : : indices))
9182 : : return false;
9183 : :
9184 : 111 : if (slp_node)
9185 : : {
9186 : : /* We eventually need to set a vector type on invariant
9187 : : arguments. */
9188 : : unsigned j;
9189 : : slp_tree child;
9190 : 36 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9191 : 24 : if (!vect_maybe_update_slp_op_vectype
9192 : 24 : (child, SLP_TREE_VECTYPE (slp_node)))
9193 : : {
9194 : 0 : if (dump_enabled_p ())
9195 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9196 : : "incompatible vector types for "
9197 : : "invariants\n");
9198 : 0 : return false;
9199 : : }
9200 : : }
9201 : :
9202 : : /* Verify we have set up compatible types. */
9203 : 111 : edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
9204 : 111 : tree latch_vectype = NULL_TREE;
9205 : 111 : if (slp_node)
9206 : : {
9207 : 12 : slp_tree latch_def = SLP_TREE_CHILDREN (slp_node)[le->dest_idx];
9208 : 12 : latch_vectype = SLP_TREE_VECTYPE (latch_def);
9209 : : }
9210 : : else
9211 : : {
9212 : 99 : tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, le);
9213 : 99 : if (TREE_CODE (latch_def) == SSA_NAME)
9214 : : {
9215 : 99 : stmt_vec_info latch_def_info = loop_vinfo->lookup_def (latch_def);
9216 : 99 : latch_def_info = vect_stmt_to_vectorize (latch_def_info);
9217 : 99 : latch_vectype = STMT_VINFO_VECTYPE (latch_def_info);
9218 : : }
9219 : : }
9220 : 111 : if (!types_compatible_p (latch_vectype, vectype))
9221 : : return false;
9222 : :
9223 : : /* The recurrence costs the initialization vector and one permute
9224 : : for each copy. */
9225 : 103 : unsigned prologue_cost = record_stmt_cost (cost_vec, 1, scalar_to_vec,
9226 : : stmt_info, 0, vect_prologue);
9227 : 103 : unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9228 : : stmt_info, 0, vect_body);
9229 : 103 : if (dump_enabled_p ())
9230 : 33 : dump_printf_loc (MSG_NOTE, vect_location,
9231 : : "vectorizable_recurr: inside_cost = %d, "
9232 : : "prologue_cost = %d .\n", inside_cost,
9233 : : prologue_cost);
9234 : :
9235 : 103 : STMT_VINFO_TYPE (stmt_info) = recurr_info_type;
9236 : 103 : return true;
9237 : : }
9238 : :
9239 : 34 : edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
9240 : 34 : basic_block bb = gimple_bb (phi);
9241 : 34 : tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
9242 : 34 : if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader)))
9243 : : {
9244 : 4 : gimple_seq stmts = NULL;
9245 : 4 : preheader = gimple_convert (&stmts, TREE_TYPE (vectype), preheader);
9246 : 4 : gsi_insert_seq_on_edge_immediate (pe, stmts);
9247 : : }
9248 : 34 : tree vec_init = build_vector_from_val (vectype, preheader);
9249 : 34 : vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
9250 : :
9251 : : /* Create the vectorized first-order PHI node. */
9252 : 34 : tree vec_dest = vect_get_new_vect_var (vectype,
9253 : : vect_simple_var, "vec_recur_");
9254 : 34 : gphi *new_phi = create_phi_node (vec_dest, bb);
9255 : 34 : add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
9256 : :
9257 : : /* Insert shuffles the first-order recurrence autovectorization.
9258 : : result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>. */
9259 : 34 : tree perm = vect_gen_perm_mask_checked (vectype, indices);
9260 : :
9261 : : /* Insert the required permute after the latch definition. The
9262 : : second and later operands are tentative and will be updated when we have
9263 : : vectorized the latch definition. */
9264 : 34 : edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
9265 : 34 : gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
9266 : 34 : gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
9267 : 34 : gsi_next (&gsi2);
9268 : :
9269 : 90 : for (unsigned i = 0; i < ncopies; ++i)
9270 : : {
9271 : 56 : vec_dest = make_ssa_name (vectype);
9272 : 56 : gassign *vperm
9273 : 90 : = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
9274 : 34 : i == 0 ? gimple_phi_result (new_phi) : NULL,
9275 : : NULL, perm);
9276 : 56 : vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
9277 : :
9278 : 56 : if (slp_node)
9279 : 18 : slp_node->push_vec_def (vperm);
9280 : : else
9281 : 38 : STMT_VINFO_VEC_STMTS (stmt_info).safe_push (vperm);
9282 : : }
9283 : :
9284 : 34 : if (!slp_node)
9285 : 22 : *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
9286 : : return true;
9287 : 185 : }
9288 : :
9289 : : /* Return true if VECTYPE represents a vector that requires lowering
9290 : : by the vector lowering pass. */
9291 : :
9292 : : bool
9293 : 528229 : vect_emulated_vector_p (tree vectype)
9294 : : {
9295 : 1056458 : return (!VECTOR_MODE_P (TYPE_MODE (vectype))
9296 : 531240 : && (!VECTOR_BOOLEAN_TYPE_P (vectype)
9297 : 2993 : || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
9298 : : }
9299 : :
9300 : : /* Return true if we can emulate CODE on an integer mode representation
9301 : : of a vector. */
9302 : :
9303 : : bool
9304 : 12133 : vect_can_vectorize_without_simd_p (tree_code code)
9305 : : {
9306 : 12133 : switch (code)
9307 : : {
9308 : : case PLUS_EXPR:
9309 : : case MINUS_EXPR:
9310 : : case NEGATE_EXPR:
9311 : : case BIT_AND_EXPR:
9312 : : case BIT_IOR_EXPR:
9313 : : case BIT_XOR_EXPR:
9314 : : case BIT_NOT_EXPR:
9315 : : return true;
9316 : :
9317 : 8031 : default:
9318 : 8031 : return false;
9319 : : }
9320 : : }
9321 : :
9322 : : /* Likewise, but taking a code_helper. */
9323 : :
9324 : : bool
9325 : 1304 : vect_can_vectorize_without_simd_p (code_helper code)
9326 : : {
9327 : 1304 : return (code.is_tree_code ()
9328 : 1304 : && vect_can_vectorize_without_simd_p (tree_code (code)));
9329 : : }
9330 : :
9331 : : /* Create vector init for vectorized iv. */
9332 : : static tree
9333 : 793 : vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9334 : : tree step_expr, poly_uint64 nunits,
9335 : : tree vectype,
9336 : : enum vect_induction_op_type induction_type)
9337 : : {
9338 : 793 : unsigned HOST_WIDE_INT const_nunits;
9339 : 793 : tree vec_shift, vec_init, new_name;
9340 : 793 : unsigned i;
9341 : 793 : tree itype = TREE_TYPE (vectype);
9342 : :
9343 : : /* iv_loop is the loop to be vectorized. Create:
9344 : : vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr). */
9345 : 793 : new_name = gimple_convert (stmts, itype, init_expr);
9346 : 793 : switch (induction_type)
9347 : : {
9348 : 18 : case vect_step_op_shr:
9349 : 18 : case vect_step_op_shl:
9350 : : /* Build the Initial value from shift_expr. */
9351 : 18 : vec_init = gimple_build_vector_from_val (stmts,
9352 : : vectype,
9353 : : new_name);
9354 : 18 : vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
9355 : : build_zero_cst (itype), step_expr);
9356 : 18 : vec_init = gimple_build (stmts,
9357 : : (induction_type == vect_step_op_shr
9358 : : ? RSHIFT_EXPR : LSHIFT_EXPR),
9359 : : vectype, vec_init, vec_shift);
9360 : 18 : break;
9361 : :
9362 : 699 : case vect_step_op_neg:
9363 : 699 : {
9364 : 699 : vec_init = gimple_build_vector_from_val (stmts,
9365 : : vectype,
9366 : : new_name);
9367 : 699 : tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
9368 : : vectype, vec_init);
9369 : : /* The encoding has 2 interleaved stepped patterns. */
9370 : 699 : vec_perm_builder sel (nunits, 2, 3);
9371 : 699 : sel.quick_grow (6);
9372 : 3495 : for (i = 0; i < 3; i++)
9373 : : {
9374 : 2097 : sel[2 * i] = i;
9375 : 2097 : sel[2 * i + 1] = i + nunits;
9376 : : }
9377 : 699 : vec_perm_indices indices (sel, 2, nunits);
9378 : : /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
9379 : : fail when vec_init is const vector. In that situation vec_perm is not
9380 : : really needed. */
9381 : 699 : tree perm_mask_even
9382 : 699 : = vect_gen_perm_mask_any (vectype, indices);
9383 : 699 : vec_init = gimple_build (stmts, VEC_PERM_EXPR,
9384 : : vectype,
9385 : : vec_init, vec_neg,
9386 : : perm_mask_even);
9387 : 699 : }
9388 : 699 : break;
9389 : :
9390 : 76 : case vect_step_op_mul:
9391 : 76 : {
9392 : : /* Use unsigned mult to avoid UD integer overflow. */
9393 : 76 : gcc_assert (nunits.is_constant (&const_nunits));
9394 : 76 : tree utype = unsigned_type_for (itype);
9395 : 76 : tree uvectype = build_vector_type (utype,
9396 : 76 : TYPE_VECTOR_SUBPARTS (vectype));
9397 : 76 : new_name = gimple_convert (stmts, utype, new_name);
9398 : 76 : vec_init = gimple_build_vector_from_val (stmts,
9399 : : uvectype,
9400 : : new_name);
9401 : 76 : tree_vector_builder elts (uvectype, const_nunits, 1);
9402 : 76 : tree elt_step = build_one_cst (utype);
9403 : :
9404 : 76 : elts.quick_push (elt_step);
9405 : 660 : for (i = 1; i < const_nunits; i++)
9406 : : {
9407 : : /* Create: new_name_i = new_name + step_expr. */
9408 : 508 : elt_step = gimple_build (stmts, MULT_EXPR,
9409 : : utype, elt_step, step_expr);
9410 : 508 : elts.quick_push (elt_step);
9411 : : }
9412 : : /* Create a vector from [new_name_0, new_name_1, ...,
9413 : : new_name_nunits-1]. */
9414 : 76 : tree vec_mul = gimple_build_vector (stmts, &elts);
9415 : 76 : vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
9416 : : vec_init, vec_mul);
9417 : 76 : vec_init = gimple_convert (stmts, vectype, vec_init);
9418 : 76 : }
9419 : 76 : break;
9420 : :
9421 : 0 : default:
9422 : 0 : gcc_unreachable ();
9423 : : }
9424 : :
9425 : 793 : return vec_init;
9426 : : }
9427 : :
9428 : : /* Peel init_expr by skip_niter for induction_type. */
9429 : : tree
9430 : 86 : vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9431 : : tree skip_niters, tree step_expr,
9432 : : enum vect_induction_op_type induction_type)
9433 : : {
9434 : 86 : gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
9435 : 86 : tree type = TREE_TYPE (init_expr);
9436 : 86 : unsigned prec = TYPE_PRECISION (type);
9437 : 86 : switch (induction_type)
9438 : : {
9439 : 0 : case vect_step_op_neg:
9440 : 0 : if (TREE_INT_CST_LOW (skip_niters) % 2)
9441 : 0 : init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
9442 : : /* else no change. */
9443 : : break;
9444 : :
9445 : 12 : case vect_step_op_shr:
9446 : 12 : case vect_step_op_shl:
9447 : 12 : skip_niters = gimple_convert (stmts, type, skip_niters);
9448 : 12 : step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
9449 : : /* When shift mount >= precision, need to avoid UD.
9450 : : In the original loop, there's no UD, and according to semantic,
9451 : : init_expr should be 0 for lshr, ashl, and >>= (prec - 1) for ashr. */
9452 : 12 : if (!tree_fits_uhwi_p (step_expr)
9453 : 12 : || tree_to_uhwi (step_expr) >= prec)
9454 : : {
9455 : 6 : if (induction_type == vect_step_op_shl
9456 : 6 : || TYPE_UNSIGNED (type))
9457 : 4 : init_expr = build_zero_cst (type);
9458 : : else
9459 : 2 : init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
9460 : : init_expr,
9461 : 2 : wide_int_to_tree (type, prec - 1));
9462 : : }
9463 : : else
9464 : 8 : init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
9465 : : ? RSHIFT_EXPR : LSHIFT_EXPR),
9466 : : type, init_expr, step_expr);
9467 : : break;
9468 : :
9469 : 74 : case vect_step_op_mul:
9470 : 74 : {
9471 : 74 : tree utype = unsigned_type_for (type);
9472 : 74 : init_expr = gimple_convert (stmts, utype, init_expr);
9473 : 74 : wide_int skipn = wi::to_wide (skip_niters);
9474 : 74 : wide_int begin = wi::to_wide (step_expr);
9475 : 74 : auto_mpz base, exp, mod, res;
9476 : 74 : wi::to_mpz (begin, base, TYPE_SIGN (type));
9477 : 74 : wi::to_mpz (skipn, exp, UNSIGNED);
9478 : 74 : mpz_ui_pow_ui (mod, 2, TYPE_PRECISION (type));
9479 : 74 : mpz_powm (res, base, exp, mod);
9480 : 74 : begin = wi::from_mpz (utype, res, true);
9481 : 74 : tree mult_expr = wide_int_to_tree (utype, begin);
9482 : 74 : init_expr = gimple_build (stmts, MULT_EXPR, utype,
9483 : : init_expr, mult_expr);
9484 : 74 : init_expr = gimple_convert (stmts, type, init_expr);
9485 : 74 : }
9486 : 74 : break;
9487 : :
9488 : 0 : default:
9489 : 0 : gcc_unreachable ();
9490 : : }
9491 : :
9492 : 86 : return init_expr;
9493 : : }
9494 : :
9495 : : /* Create vector step for vectorized iv. */
9496 : : static tree
9497 : 1027 : vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
9498 : : poly_uint64 vf,
9499 : : enum vect_induction_op_type induction_type)
9500 : : {
9501 : 1027 : tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
9502 : 1027 : tree new_name = NULL;
9503 : : /* Step should be pow (step, vf) for mult induction. */
9504 : 1027 : if (induction_type == vect_step_op_mul)
9505 : : {
9506 : 76 : gcc_assert (vf.is_constant ());
9507 : 76 : wide_int begin = wi::to_wide (step_expr);
9508 : :
9509 : 584 : for (unsigned i = 0; i != vf.to_constant () - 1; i++)
9510 : 508 : begin = wi::mul (begin, wi::to_wide (step_expr));
9511 : :
9512 : 76 : new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
9513 : 76 : }
9514 : 951 : else if (induction_type == vect_step_op_neg)
9515 : : /* Do nothing. */
9516 : : ;
9517 : : else
9518 : 18 : new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
9519 : : expr, step_expr);
9520 : 1027 : return new_name;
9521 : : }
9522 : :
9523 : : static tree
9524 : 1027 : vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
9525 : : stmt_vec_info stmt_info,
9526 : : tree new_name, tree vectype,
9527 : : enum vect_induction_op_type induction_type)
9528 : : {
9529 : : /* No step is needed for neg induction. */
9530 : 1027 : if (induction_type == vect_step_op_neg)
9531 : : return NULL;
9532 : :
9533 : 94 : tree t = unshare_expr (new_name);
9534 : 94 : gcc_assert (CONSTANT_CLASS_P (new_name)
9535 : : || TREE_CODE (new_name) == SSA_NAME);
9536 : 94 : tree new_vec = build_vector_from_val (vectype, t);
9537 : 94 : tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
9538 : : new_vec, vectype, NULL);
9539 : 94 : return vec_step;
9540 : : }
9541 : :
9542 : : /* Update vectorized iv with vect_step, induc_def is init. */
9543 : : static tree
9544 : 1209 : vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
9545 : : tree induc_def, tree vec_step,
9546 : : enum vect_induction_op_type induction_type)
9547 : : {
9548 : 1209 : tree vec_def = induc_def;
9549 : 1209 : switch (induction_type)
9550 : : {
9551 : 76 : case vect_step_op_mul:
9552 : 76 : {
9553 : : /* Use unsigned mult to avoid UD integer overflow. */
9554 : 76 : tree uvectype
9555 : 76 : = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)),
9556 : 76 : TYPE_VECTOR_SUBPARTS (vectype));
9557 : 76 : vec_def = gimple_convert (stmts, uvectype, vec_def);
9558 : 76 : vec_step = gimple_convert (stmts, uvectype, vec_step);
9559 : 76 : vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
9560 : : vec_def, vec_step);
9561 : 76 : vec_def = gimple_convert (stmts, vectype, vec_def);
9562 : : }
9563 : 76 : break;
9564 : :
9565 : 12 : case vect_step_op_shr:
9566 : 12 : vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
9567 : : vec_def, vec_step);
9568 : 12 : break;
9569 : :
9570 : 6 : case vect_step_op_shl:
9571 : 6 : vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
9572 : : vec_def, vec_step);
9573 : 6 : break;
9574 : : case vect_step_op_neg:
9575 : : vec_def = induc_def;
9576 : : /* Do nothing. */
9577 : : break;
9578 : 0 : default:
9579 : 0 : gcc_unreachable ();
9580 : : }
9581 : :
9582 : 1209 : return vec_def;
9583 : :
9584 : : }
9585 : :
9586 : : /* Function vectorizable_induction
9587 : :
9588 : : Check if STMT_INFO performs an nonlinear induction computation that can be
9589 : : vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
9590 : : a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
9591 : : basic block.
9592 : : Return true if STMT_INFO is vectorizable in this way. */
9593 : :
9594 : : static bool
9595 : 5120 : vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
9596 : : stmt_vec_info stmt_info,
9597 : : gimple **vec_stmt, slp_tree slp_node,
9598 : : stmt_vector_for_cost *cost_vec)
9599 : : {
9600 : 5120 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9601 : 5120 : unsigned ncopies;
9602 : 5120 : bool nested_in_vect_loop = false;
9603 : 5120 : class loop *iv_loop;
9604 : 5120 : tree vec_def;
9605 : 5120 : edge pe = loop_preheader_edge (loop);
9606 : 5120 : basic_block new_bb;
9607 : 5120 : tree vec_init, vec_step;
9608 : 5120 : tree new_name;
9609 : 5120 : gimple *new_stmt;
9610 : 5120 : gphi *induction_phi;
9611 : 5120 : tree induc_def, vec_dest;
9612 : 5120 : tree init_expr, step_expr;
9613 : 5120 : tree niters_skip;
9614 : 5120 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9615 : 5120 : unsigned i;
9616 : 5120 : gimple_stmt_iterator si;
9617 : :
9618 : 5120 : gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9619 : :
9620 : 5120 : tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9621 : 5120 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9622 : 5120 : enum vect_induction_op_type induction_type
9623 : : = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9624 : :
9625 : 5120 : gcc_assert (induction_type > vect_step_op_add);
9626 : :
9627 : 5120 : if (slp_node)
9628 : : ncopies = 1;
9629 : : else
9630 : 5120 : ncopies = vect_get_num_copies (loop_vinfo, vectype);
9631 : 5120 : gcc_assert (ncopies >= 1);
9632 : :
9633 : : /* FORNOW. Only handle nonlinear induction in the same loop. */
9634 : 5120 : if (nested_in_vect_loop_p (loop, stmt_info))
9635 : : {
9636 : 0 : if (dump_enabled_p ())
9637 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9638 : : "nonlinear induction in nested loop.\n");
9639 : 0 : return false;
9640 : : }
9641 : :
9642 : 5120 : iv_loop = loop;
9643 : 5120 : gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9644 : :
9645 : : /* TODO: Support slp for nonlinear iv. There should be separate vector iv
9646 : : update for each iv and a permutation to generate wanted vector iv. */
9647 : 5120 : if (slp_node)
9648 : : {
9649 : 0 : if (dump_enabled_p ())
9650 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9651 : : "SLP induction not supported for nonlinear"
9652 : : " induction.\n");
9653 : 0 : return false;
9654 : : }
9655 : :
9656 : 5120 : if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
9657 : : {
9658 : 0 : if (dump_enabled_p ())
9659 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9660 : : "floating point nonlinear induction vectorization"
9661 : : " not supported.\n");
9662 : 0 : return false;
9663 : : }
9664 : :
9665 : 5120 : step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9666 : 5120 : init_expr = vect_phi_initial_value (phi);
9667 : 5120 : gcc_assert (step_expr != NULL_TREE && init_expr != NULL
9668 : : && TREE_CODE (step_expr) == INTEGER_CST);
9669 : : /* step_expr should be aligned with init_expr,
9670 : : .i.e. uint64 a >> 1, step is int, but vector<uint64> shift is used. */
9671 : 5120 : step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
9672 : :
9673 : 5120 : if (TREE_CODE (init_expr) == INTEGER_CST)
9674 : 2197 : init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
9675 : 2923 : else if (!tree_nop_conversion_p (TREE_TYPE (vectype), TREE_TYPE (init_expr)))
9676 : : {
9677 : : /* INIT_EXPR could be a bit_field, bail out for such case. */
9678 : 2 : if (dump_enabled_p ())
9679 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9680 : : "nonlinear induction vectorization failed:"
9681 : : " component type of vectype is not a nop conversion"
9682 : : " from type of init_expr.\n");
9683 : 2 : return false;
9684 : : }
9685 : :
9686 : 5118 : switch (induction_type)
9687 : : {
9688 : 2113 : case vect_step_op_neg:
9689 : 2113 : if (maybe_eq (TYPE_VECTOR_SUBPARTS (vectype), 1u))
9690 : : return false;
9691 : 2111 : if (TREE_CODE (init_expr) != INTEGER_CST
9692 : 179 : && TREE_CODE (init_expr) != REAL_CST)
9693 : : {
9694 : : /* Check for backend support of NEGATE_EXPR and vec_perm. */
9695 : 179 : if (!directly_supported_p (NEGATE_EXPR, vectype))
9696 : 0 : return false;
9697 : :
9698 : : /* The encoding has 2 interleaved stepped patterns. */
9699 : 179 : vec_perm_builder sel (nunits, 2, 3);
9700 : 179 : machine_mode mode = TYPE_MODE (vectype);
9701 : 179 : sel.quick_grow (6);
9702 : 895 : for (i = 0; i < 3; i++)
9703 : : {
9704 : 537 : sel[i * 2] = i;
9705 : 537 : sel[i * 2 + 1] = i + nunits;
9706 : : }
9707 : 179 : vec_perm_indices indices (sel, 2, nunits);
9708 : 179 : if (!can_vec_perm_const_p (mode, mode, indices))
9709 : 0 : return false;
9710 : 179 : }
9711 : : break;
9712 : :
9713 : 568 : case vect_step_op_mul:
9714 : 568 : {
9715 : : /* Check for backend support of MULT_EXPR. */
9716 : 568 : if (!directly_supported_p (MULT_EXPR, vectype))
9717 : : return false;
9718 : :
9719 : : /* ?? How to construct vector step for variable number vector.
9720 : : [ 1, step, pow (step, 2), pow (step, 4), .. ]. */
9721 : : if (!vf.is_constant ())
9722 : : return false;
9723 : : }
9724 : : break;
9725 : :
9726 : 2383 : case vect_step_op_shr:
9727 : : /* Check for backend support of RSHIFT_EXPR. */
9728 : 2383 : if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
9729 : : return false;
9730 : :
9731 : : /* Don't shift more than type precision to avoid UD. */
9732 : 29 : if (!tree_fits_uhwi_p (step_expr)
9733 : 29 : || maybe_ge (nunits * tree_to_uhwi (step_expr),
9734 : : TYPE_PRECISION (TREE_TYPE (init_expr))))
9735 : : return false;
9736 : : break;
9737 : :
9738 : 54 : case vect_step_op_shl:
9739 : : /* Check for backend support of RSHIFT_EXPR. */
9740 : 54 : if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
9741 : : return false;
9742 : :
9743 : : /* Don't shift more than type precision to avoid UD. */
9744 : 14 : if (!tree_fits_uhwi_p (step_expr)
9745 : 14 : || maybe_ge (nunits * tree_to_uhwi (step_expr),
9746 : : TYPE_PRECISION (TREE_TYPE (init_expr))))
9747 : : return false;
9748 : :
9749 : : break;
9750 : :
9751 : 0 : default:
9752 : 0 : gcc_unreachable ();
9753 : : }
9754 : :
9755 : 2642 : if (!vec_stmt) /* transformation not required. */
9756 : : {
9757 : 1849 : unsigned inside_cost = 0, prologue_cost = 0;
9758 : : /* loop cost for vec_loop. Neg induction doesn't have any
9759 : : inside_cost. */
9760 : 1849 : inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9761 : : stmt_info, 0, vect_body);
9762 : :
9763 : : /* loop cost for vec_loop. Neg induction doesn't have any
9764 : : inside_cost. */
9765 : 1849 : if (induction_type == vect_step_op_neg)
9766 : 1412 : inside_cost = 0;
9767 : :
9768 : : /* prologue cost for vec_init and vec_step. */
9769 : 1849 : prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9770 : : stmt_info, 0, vect_prologue);
9771 : :
9772 : 1849 : if (dump_enabled_p ())
9773 : 61 : dump_printf_loc (MSG_NOTE, vect_location,
9774 : : "vect_model_induction_cost: inside_cost = %d, "
9775 : : "prologue_cost = %d. \n", inside_cost,
9776 : : prologue_cost);
9777 : :
9778 : 1849 : STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9779 : 1849 : DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9780 : 1849 : return true;
9781 : : }
9782 : :
9783 : : /* Transform. */
9784 : :
9785 : : /* Compute a vector variable, initialized with the first VF values of
9786 : : the induction variable. E.g., for an iv with IV_PHI='X' and
9787 : : evolution S, for a vector of 4 units, we want to compute:
9788 : : [X, X + S, X + 2*S, X + 3*S]. */
9789 : :
9790 : 793 : if (dump_enabled_p ())
9791 : 32 : dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9792 : :
9793 : 793 : pe = loop_preheader_edge (iv_loop);
9794 : : /* Find the first insertion point in the BB. */
9795 : 793 : basic_block bb = gimple_bb (phi);
9796 : 793 : si = gsi_after_labels (bb);
9797 : :
9798 : 793 : gimple_seq stmts = NULL;
9799 : :
9800 : 793 : niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9801 : : /* If we are using the loop mask to "peel" for alignment then we need
9802 : : to adjust the start value here. */
9803 : 793 : if (niters_skip != NULL_TREE)
9804 : 0 : init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
9805 : : step_expr, induction_type);
9806 : :
9807 : 793 : vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
9808 : : step_expr, nunits, vectype,
9809 : : induction_type);
9810 : 793 : if (stmts)
9811 : : {
9812 : 148 : new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9813 : 148 : gcc_assert (!new_bb);
9814 : : }
9815 : :
9816 : 793 : stmts = NULL;
9817 : 793 : new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9818 : : vf, induction_type);
9819 : 793 : if (stmts)
9820 : : {
9821 : 0 : new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9822 : 0 : gcc_assert (!new_bb);
9823 : : }
9824 : :
9825 : 793 : vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9826 : : new_name, vectype,
9827 : : induction_type);
9828 : : /* Create the following def-use cycle:
9829 : : loop prolog:
9830 : : vec_init = ...
9831 : : vec_step = ...
9832 : : loop:
9833 : : vec_iv = PHI <vec_init, vec_loop>
9834 : : ...
9835 : : STMT
9836 : : ...
9837 : : vec_loop = vec_iv + vec_step; */
9838 : :
9839 : : /* Create the induction-phi that defines the induction-operand. */
9840 : 793 : vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9841 : 793 : induction_phi = create_phi_node (vec_dest, iv_loop->header);
9842 : 793 : induc_def = PHI_RESULT (induction_phi);
9843 : :
9844 : : /* Create the iv update inside the loop. */
9845 : 793 : stmts = NULL;
9846 : 793 : vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9847 : : induc_def, vec_step,
9848 : : induction_type);
9849 : :
9850 : 793 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9851 : 793 : new_stmt = SSA_NAME_DEF_STMT (vec_def);
9852 : :
9853 : : /* Set the arguments of the phi node: */
9854 : 793 : add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9855 : 793 : add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9856 : : UNKNOWN_LOCATION);
9857 : :
9858 : 793 : STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
9859 : 793 : *vec_stmt = induction_phi;
9860 : :
9861 : : /* In case that vectorization factor (VF) is bigger than the number
9862 : : of elements that we can fit in a vectype (nunits), we have to generate
9863 : : more than one vector stmt - i.e - we need to "unroll" the
9864 : : vector stmt by a factor VF/nunits. For more details see documentation
9865 : : in vectorizable_operation. */
9866 : :
9867 : 793 : if (ncopies > 1)
9868 : : {
9869 : 234 : stmts = NULL;
9870 : : /* FORNOW. This restriction should be relaxed. */
9871 : 234 : gcc_assert (!nested_in_vect_loop);
9872 : :
9873 : 234 : new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9874 : : nunits, induction_type);
9875 : :
9876 : 234 : vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9877 : : new_name, vectype,
9878 : : induction_type);
9879 : 234 : vec_def = induc_def;
9880 : 884 : for (i = 1; i < ncopies; i++)
9881 : : {
9882 : : /* vec_i = vec_prev + vec_step. */
9883 : 416 : stmts = NULL;
9884 : 416 : vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9885 : : vec_def, vec_step,
9886 : : induction_type);
9887 : 416 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9888 : 416 : new_stmt = SSA_NAME_DEF_STMT (vec_def);
9889 : 416 : STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
9890 : : }
9891 : : }
9892 : :
9893 : 793 : if (dump_enabled_p ())
9894 : 64 : dump_printf_loc (MSG_NOTE, vect_location,
9895 : : "transform induction: created def-use cycle: %G%G",
9896 : 32 : (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9897 : :
9898 : : return true;
9899 : : }
9900 : :
9901 : : /* Function vectorizable_induction
9902 : :
9903 : : Check if STMT_INFO performs an induction computation that can be vectorized.
9904 : : If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9905 : : phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9906 : : Return true if STMT_INFO is vectorizable in this way. */
9907 : :
9908 : : bool
9909 : 201163 : vectorizable_induction (loop_vec_info loop_vinfo,
9910 : : stmt_vec_info stmt_info,
9911 : : gimple **vec_stmt, slp_tree slp_node,
9912 : : stmt_vector_for_cost *cost_vec)
9913 : : {
9914 : 201163 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9915 : 201163 : unsigned ncopies;
9916 : 201163 : bool nested_in_vect_loop = false;
9917 : 201163 : class loop *iv_loop;
9918 : 201163 : tree vec_def;
9919 : 201163 : edge pe = loop_preheader_edge (loop);
9920 : 201163 : basic_block new_bb;
9921 : 201163 : tree new_vec, vec_init, vec_step, t;
9922 : 201163 : tree new_name;
9923 : 201163 : gimple *new_stmt;
9924 : 201163 : gphi *induction_phi;
9925 : 201163 : tree induc_def, vec_dest;
9926 : 201163 : tree init_expr, step_expr;
9927 : 201163 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9928 : 201163 : unsigned i;
9929 : 201163 : tree expr;
9930 : 201163 : gimple_stmt_iterator si;
9931 : 201163 : enum vect_induction_op_type induction_type
9932 : : = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9933 : :
9934 : 209236 : gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9935 : 97820 : if (!phi)
9936 : : return false;
9937 : :
9938 : 97820 : if (!STMT_VINFO_RELEVANT_P (stmt_info))
9939 : : return false;
9940 : :
9941 : : /* Make sure it was recognized as induction computation. */
9942 : 97820 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
9943 : : return false;
9944 : :
9945 : : /* Handle nonlinear induction in a separate place. */
9946 : 97694 : if (induction_type != vect_step_op_add)
9947 : 5120 : return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
9948 : 5120 : vec_stmt, slp_node, cost_vec);
9949 : :
9950 : 92574 : tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9951 : 92574 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9952 : :
9953 : 92574 : if (slp_node)
9954 : : ncopies = 1;
9955 : : else
9956 : 91829 : ncopies = vect_get_num_copies (loop_vinfo, vectype);
9957 : 91829 : gcc_assert (ncopies >= 1);
9958 : :
9959 : : /* FORNOW. These restrictions should be relaxed. */
9960 : 92574 : if (nested_in_vect_loop_p (loop, stmt_info))
9961 : : {
9962 : 458 : imm_use_iterator imm_iter;
9963 : 458 : use_operand_p use_p;
9964 : 458 : gimple *exit_phi;
9965 : 458 : edge latch_e;
9966 : 458 : tree loop_arg;
9967 : :
9968 : 458 : if (ncopies > 1)
9969 : : {
9970 : 82 : if (dump_enabled_p ())
9971 : 3 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9972 : : "multiple types in nested loop.\n");
9973 : 84 : return false;
9974 : : }
9975 : :
9976 : 376 : exit_phi = NULL;
9977 : 376 : latch_e = loop_latch_edge (loop->inner);
9978 : 376 : loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
9979 : 762 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
9980 : : {
9981 : 390 : gimple *use_stmt = USE_STMT (use_p);
9982 : 390 : if (is_gimple_debug (use_stmt))
9983 : 2 : continue;
9984 : :
9985 : 388 : if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
9986 : : {
9987 : : exit_phi = use_stmt;
9988 : : break;
9989 : : }
9990 : : }
9991 : 376 : if (exit_phi)
9992 : : {
9993 : 4 : stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
9994 : 4 : if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
9995 : 2 : && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
9996 : : {
9997 : 2 : if (dump_enabled_p ())
9998 : 2 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9999 : : "inner-loop induction only used outside "
10000 : : "of the outer vectorized loop.\n");
10001 : 2 : return false;
10002 : : }
10003 : : }
10004 : :
10005 : 374 : nested_in_vect_loop = true;
10006 : 374 : iv_loop = loop->inner;
10007 : : }
10008 : : else
10009 : : iv_loop = loop;
10010 : 92490 : gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
10011 : :
10012 : 92490 : if (slp_node && !nunits.is_constant ())
10013 : : {
10014 : : /* The current SLP code creates the step value element-by-element. */
10015 : : if (dump_enabled_p ())
10016 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10017 : : "SLP induction not supported for variable-length"
10018 : : " vectors.\n");
10019 : : return false;
10020 : : }
10021 : :
10022 : 92490 : if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
10023 : : {
10024 : 6 : if (dump_enabled_p ())
10025 : 6 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10026 : : "floating point induction vectorization disabled\n");
10027 : 6 : return false;
10028 : : }
10029 : :
10030 : 92484 : step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
10031 : 92484 : gcc_assert (step_expr != NULL_TREE);
10032 : 184932 : if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
10033 : 184853 : && !type_has_mode_precision_p (TREE_TYPE (step_expr)))
10034 : : {
10035 : 6 : if (dump_enabled_p ())
10036 : 6 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10037 : : "bit-precision induction vectorization not "
10038 : : "supported.\n");
10039 : 6 : return false;
10040 : : }
10041 : 92478 : tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
10042 : :
10043 : : /* Check for backend support of PLUS/MINUS_EXPR. */
10044 : 92478 : if (!directly_supported_p (PLUS_EXPR, step_vectype)
10045 : 92478 : || !directly_supported_p (MINUS_EXPR, step_vectype))
10046 : 7851 : return false;
10047 : :
10048 : 84627 : if (!vec_stmt) /* transformation not required. */
10049 : : {
10050 : 71959 : unsigned inside_cost = 0, prologue_cost = 0;
10051 : 71959 : if (slp_node)
10052 : : {
10053 : : /* We eventually need to set a vector type on invariant
10054 : : arguments. */
10055 : : unsigned j;
10056 : : slp_tree child;
10057 : 1548 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
10058 : 1032 : if (!vect_maybe_update_slp_op_vectype
10059 : 1032 : (child, SLP_TREE_VECTYPE (slp_node)))
10060 : : {
10061 : 0 : if (dump_enabled_p ())
10062 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10063 : : "incompatible vector types for "
10064 : : "invariants\n");
10065 : 0 : return false;
10066 : : }
10067 : : /* loop cost for vec_loop. */
10068 : 516 : inside_cost
10069 : 1032 : = record_stmt_cost (cost_vec,
10070 : 516 : SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
10071 : : vector_stmt, stmt_info, 0, vect_body);
10072 : : /* prologue cost for vec_init (if not nested) and step. */
10073 : 516 : prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
10074 : : scalar_to_vec,
10075 : : stmt_info, 0, vect_prologue);
10076 : : }
10077 : : else /* if (!slp_node) */
10078 : : {
10079 : : /* loop cost for vec_loop. */
10080 : 71443 : inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
10081 : : stmt_info, 0, vect_body);
10082 : : /* prologue cost for vec_init and vec_step. */
10083 : 71443 : prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
10084 : : stmt_info, 0, vect_prologue);
10085 : : }
10086 : 71959 : if (dump_enabled_p ())
10087 : 4498 : dump_printf_loc (MSG_NOTE, vect_location,
10088 : : "vect_model_induction_cost: inside_cost = %d, "
10089 : : "prologue_cost = %d .\n", inside_cost,
10090 : : prologue_cost);
10091 : :
10092 : 71959 : STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
10093 : 71959 : DUMP_VECT_SCOPE ("vectorizable_induction");
10094 : 71959 : return true;
10095 : : }
10096 : :
10097 : : /* Transform. */
10098 : :
10099 : : /* Compute a vector variable, initialized with the first VF values of
10100 : : the induction variable. E.g., for an iv with IV_PHI='X' and
10101 : : evolution S, for a vector of 4 units, we want to compute:
10102 : : [X, X + S, X + 2*S, X + 3*S]. */
10103 : :
10104 : 12668 : if (dump_enabled_p ())
10105 : 2713 : dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
10106 : :
10107 : 12668 : pe = loop_preheader_edge (iv_loop);
10108 : : /* Find the first insertion point in the BB. */
10109 : 12668 : basic_block bb = gimple_bb (phi);
10110 : 12668 : si = gsi_after_labels (bb);
10111 : :
10112 : : /* For SLP induction we have to generate several IVs as for example
10113 : : with group size 3 we need
10114 : : [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
10115 : : [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
10116 : 12668 : if (slp_node)
10117 : : {
10118 : : /* Enforced above. */
10119 : 203 : unsigned int const_nunits = nunits.to_constant ();
10120 : :
10121 : : /* The initial values are vectorized, but any lanes > group_size
10122 : : need adjustment. */
10123 : 203 : slp_tree init_node
10124 : 203 : = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
10125 : :
10126 : : /* Gather steps. Since we do not vectorize inductions as
10127 : : cycles we have to reconstruct the step from SCEV data. */
10128 : 203 : unsigned group_size = SLP_TREE_LANES (slp_node);
10129 : 203 : tree *steps = XALLOCAVEC (tree, group_size);
10130 : 203 : tree *inits = XALLOCAVEC (tree, group_size);
10131 : 203 : stmt_vec_info phi_info;
10132 : 1590 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
10133 : : {
10134 : 1387 : steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
10135 : 1387 : if (!init_node)
10136 : 1363 : inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
10137 : : pe->dest_idx);
10138 : : }
10139 : :
10140 : : /* Now generate the IVs. */
10141 : 203 : unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10142 : 203 : gcc_assert ((const_nunits * nvects) % group_size == 0);
10143 : 203 : unsigned nivs;
10144 : 203 : if (nested_in_vect_loop)
10145 : : nivs = nvects;
10146 : : else
10147 : : {
10148 : : /* Compute the number of distinct IVs we need. First reduce
10149 : : group_size if it is a multiple of const_nunits so we get
10150 : : one IV for a group_size of 4 but const_nunits 2. */
10151 : 191 : unsigned group_sizep = group_size;
10152 : 191 : if (group_sizep % const_nunits == 0)
10153 : 116 : group_sizep = group_sizep / const_nunits;
10154 : 191 : nivs = least_common_multiple (group_sizep,
10155 : 191 : const_nunits) / const_nunits;
10156 : : }
10157 : 203 : tree stept = TREE_TYPE (step_vectype);
10158 : 203 : tree lupdate_mul = NULL_TREE;
10159 : 203 : if (!nested_in_vect_loop)
10160 : : {
10161 : : /* The number of iterations covered in one vector iteration. */
10162 : 191 : unsigned lup_mul = (nvects * const_nunits) / group_size;
10163 : 191 : lupdate_mul
10164 : 191 : = build_vector_from_val (step_vectype,
10165 : 191 : SCALAR_FLOAT_TYPE_P (stept)
10166 : 7 : ? build_real_from_wide (stept, lup_mul,
10167 : : UNSIGNED)
10168 : 375 : : build_int_cstu (stept, lup_mul));
10169 : : }
10170 : 203 : tree peel_mul = NULL_TREE;
10171 : 203 : gimple_seq init_stmts = NULL;
10172 : 203 : if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
10173 : : {
10174 : 0 : if (SCALAR_FLOAT_TYPE_P (stept))
10175 : 0 : peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
10176 : : LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10177 : : else
10178 : 0 : peel_mul = gimple_convert (&init_stmts, stept,
10179 : : LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10180 : 0 : peel_mul = gimple_build_vector_from_val (&init_stmts,
10181 : : step_vectype, peel_mul);
10182 : : }
10183 : 203 : unsigned ivn;
10184 : 203 : auto_vec<tree> vec_steps;
10185 : 890 : for (ivn = 0; ivn < nivs; ++ivn)
10186 : : {
10187 : 687 : tree_vector_builder step_elts (step_vectype, const_nunits, 1);
10188 : 687 : tree_vector_builder init_elts (vectype, const_nunits, 1);
10189 : 687 : tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
10190 : 2925 : for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
10191 : : {
10192 : : /* The scalar steps of the IVs. */
10193 : 2238 : tree elt = steps[(ivn*const_nunits + eltn) % group_size];
10194 : 2238 : elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
10195 : 2238 : step_elts.quick_push (elt);
10196 : 2238 : if (!init_node)
10197 : : {
10198 : : /* The scalar inits of the IVs if not vectorized. */
10199 : 2190 : elt = inits[(ivn*const_nunits + eltn) % group_size];
10200 : 2190 : if (!useless_type_conversion_p (TREE_TYPE (vectype),
10201 : 2190 : TREE_TYPE (elt)))
10202 : 4 : elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
10203 : 4 : TREE_TYPE (vectype), elt);
10204 : 2190 : init_elts.quick_push (elt);
10205 : : }
10206 : : /* The number of steps to add to the initial values. */
10207 : 2238 : unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
10208 : 4476 : mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
10209 : 4458 : ? build_real_from_wide (stept,
10210 : : mul_elt, UNSIGNED)
10211 : 4458 : : build_int_cstu (stept, mul_elt));
10212 : : }
10213 : 687 : vec_step = gimple_build_vector (&init_stmts, &step_elts);
10214 : 687 : vec_steps.safe_push (vec_step);
10215 : 687 : tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
10216 : 687 : if (peel_mul)
10217 : 0 : step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
10218 : : step_mul, peel_mul);
10219 : 687 : if (!init_node)
10220 : 675 : vec_init = gimple_build_vector (&init_stmts, &init_elts);
10221 : :
10222 : : /* Create the induction-phi that defines the induction-operand. */
10223 : 687 : vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
10224 : : "vec_iv_");
10225 : 687 : induction_phi = create_phi_node (vec_dest, iv_loop->header);
10226 : 687 : induc_def = PHI_RESULT (induction_phi);
10227 : :
10228 : : /* Create the iv update inside the loop */
10229 : 687 : tree up = vec_step;
10230 : 687 : if (lupdate_mul)
10231 : 675 : up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10232 : : vec_step, lupdate_mul);
10233 : 687 : gimple_seq stmts = NULL;
10234 : 687 : vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10235 : 687 : vec_def = gimple_build (&stmts,
10236 : : PLUS_EXPR, step_vectype, vec_def, up);
10237 : 687 : vec_def = gimple_convert (&stmts, vectype, vec_def);
10238 : 687 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10239 : 687 : add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10240 : : UNKNOWN_LOCATION);
10241 : :
10242 : 687 : if (init_node)
10243 : 12 : vec_init = vect_get_slp_vect_def (init_node, ivn);
10244 : 687 : if (!nested_in_vect_loop
10245 : 687 : && !integer_zerop (step_mul))
10246 : : {
10247 : 252 : vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
10248 : 252 : up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10249 : : vec_step, step_mul);
10250 : 252 : vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
10251 : : vec_def, up);
10252 : 252 : vec_init = gimple_convert (&init_stmts, vectype, vec_def);
10253 : : }
10254 : :
10255 : : /* Set the arguments of the phi node: */
10256 : 687 : add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10257 : :
10258 : 687 : slp_node->push_vec_def (induction_phi);
10259 : 687 : }
10260 : 203 : if (!nested_in_vect_loop)
10261 : : {
10262 : : /* Fill up to the number of vectors we need for the whole group. */
10263 : 191 : nivs = least_common_multiple (group_size,
10264 : 191 : const_nunits) / const_nunits;
10265 : 191 : vec_steps.reserve (nivs-ivn);
10266 : 409 : for (; ivn < nivs; ++ivn)
10267 : : {
10268 : 27 : slp_node->push_vec_def (SLP_TREE_VEC_DEFS (slp_node)[0]);
10269 : 27 : vec_steps.quick_push (vec_steps[0]);
10270 : : }
10271 : : }
10272 : :
10273 : : /* Re-use IVs when we can. We are generating further vector
10274 : : stmts by adding VF' * stride to the IVs generated above. */
10275 : 203 : if (ivn < nvects)
10276 : : {
10277 : 41 : unsigned vfp
10278 : 41 : = least_common_multiple (group_size, const_nunits) / group_size;
10279 : 41 : tree lupdate_mul
10280 : 41 : = build_vector_from_val (step_vectype,
10281 : 41 : SCALAR_FLOAT_TYPE_P (stept)
10282 : 0 : ? build_real_from_wide (stept,
10283 : : vfp, UNSIGNED)
10284 : 41 : : build_int_cstu (stept, vfp));
10285 : 356 : for (; ivn < nvects; ++ivn)
10286 : : {
10287 : 315 : gimple *iv
10288 : 315 : = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node)[ivn - nivs]);
10289 : 315 : tree def = gimple_get_lhs (iv);
10290 : 315 : if (ivn < 2*nivs)
10291 : 115 : vec_steps[ivn - nivs]
10292 : 115 : = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10293 : 115 : vec_steps[ivn - nivs], lupdate_mul);
10294 : 315 : gimple_seq stmts = NULL;
10295 : 315 : def = gimple_convert (&stmts, step_vectype, def);
10296 : 945 : def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10297 : 315 : def, vec_steps[ivn % nivs]);
10298 : 315 : def = gimple_convert (&stmts, vectype, def);
10299 : 315 : if (gimple_code (iv) == GIMPLE_PHI)
10300 : 115 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10301 : : else
10302 : : {
10303 : 200 : gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
10304 : 200 : gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
10305 : : }
10306 : 315 : slp_node->push_vec_def (def);
10307 : : }
10308 : : }
10309 : :
10310 : 203 : new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
10311 : 203 : gcc_assert (!new_bb);
10312 : :
10313 : 203 : return true;
10314 : 203 : }
10315 : :
10316 : 12465 : init_expr = vect_phi_initial_value (phi);
10317 : :
10318 : 12465 : gimple_seq stmts = NULL;
10319 : 12465 : if (!nested_in_vect_loop)
10320 : : {
10321 : : /* Convert the initial value to the IV update type. */
10322 : 12386 : tree new_type = TREE_TYPE (step_expr);
10323 : 12386 : init_expr = gimple_convert (&stmts, new_type, init_expr);
10324 : :
10325 : : /* If we are using the loop mask to "peel" for alignment then we need
10326 : : to adjust the start value here. */
10327 : 12386 : tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
10328 : 12386 : if (skip_niters != NULL_TREE)
10329 : : {
10330 : 0 : if (FLOAT_TYPE_P (vectype))
10331 : 0 : skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
10332 : : skip_niters);
10333 : : else
10334 : 0 : skip_niters = gimple_convert (&stmts, new_type, skip_niters);
10335 : 0 : tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
10336 : : skip_niters, step_expr);
10337 : 0 : init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
10338 : : init_expr, skip_step);
10339 : : }
10340 : : }
10341 : :
10342 : 12465 : if (stmts)
10343 : : {
10344 : 90 : new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10345 : 90 : gcc_assert (!new_bb);
10346 : : }
10347 : :
10348 : : /* Create the vector that holds the initial_value of the induction. */
10349 : 12465 : if (nested_in_vect_loop)
10350 : : {
10351 : : /* iv_loop is nested in the loop to be vectorized. init_expr had already
10352 : : been created during vectorization of previous stmts. We obtain it
10353 : : from the STMT_VINFO_VEC_STMT of the defining stmt. */
10354 : 79 : auto_vec<tree> vec_inits;
10355 : 79 : vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
10356 : : init_expr, &vec_inits);
10357 : 79 : vec_init = vec_inits[0];
10358 : : /* If the initial value is not of proper type, convert it. */
10359 : 79 : if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
10360 : : {
10361 : 0 : new_stmt
10362 : 0 : = gimple_build_assign (vect_get_new_ssa_name (vectype,
10363 : : vect_simple_var,
10364 : : "vec_iv_"),
10365 : : VIEW_CONVERT_EXPR,
10366 : : build1 (VIEW_CONVERT_EXPR, vectype,
10367 : : vec_init));
10368 : 0 : vec_init = gimple_assign_lhs (new_stmt);
10369 : 0 : new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
10370 : : new_stmt);
10371 : 0 : gcc_assert (!new_bb);
10372 : : }
10373 : 79 : }
10374 : : else
10375 : : {
10376 : : /* iv_loop is the loop to be vectorized. Create:
10377 : : vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
10378 : 12386 : stmts = NULL;
10379 : 12386 : new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
10380 : :
10381 : 12386 : unsigned HOST_WIDE_INT const_nunits;
10382 : 12386 : if (nunits.is_constant (&const_nunits))
10383 : : {
10384 : 12386 : tree_vector_builder elts (step_vectype, const_nunits, 1);
10385 : 12386 : elts.quick_push (new_name);
10386 : 81118 : for (i = 1; i < const_nunits; i++)
10387 : : {
10388 : : /* Create: new_name_i = new_name + step_expr */
10389 : 56346 : new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
10390 : : new_name, step_expr);
10391 : 56346 : elts.quick_push (new_name);
10392 : : }
10393 : : /* Create a vector from [new_name_0, new_name_1, ...,
10394 : : new_name_nunits-1] */
10395 : 12386 : vec_init = gimple_build_vector (&stmts, &elts);
10396 : 12386 : }
10397 : : else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
10398 : : /* Build the initial value directly from a VEC_SERIES_EXPR. */
10399 : : vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
10400 : : new_name, step_expr);
10401 : : else
10402 : : {
10403 : : /* Build:
10404 : : [base, base, base, ...]
10405 : : + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
10406 : : gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
10407 : : gcc_assert (flag_associative_math);
10408 : : tree index = build_index_vector (step_vectype, 0, 1);
10409 : : tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10410 : : new_name);
10411 : : tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10412 : : step_expr);
10413 : : vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
10414 : : vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
10415 : : vec_init, step_vec);
10416 : : vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10417 : : vec_init, base_vec);
10418 : : }
10419 : 12386 : vec_init = gimple_convert (&stmts, vectype, vec_init);
10420 : :
10421 : 12386 : if (stmts)
10422 : : {
10423 : 2582 : new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10424 : 2582 : gcc_assert (!new_bb);
10425 : : }
10426 : : }
10427 : :
10428 : :
10429 : : /* Create the vector that holds the step of the induction. */
10430 : 12465 : gimple_stmt_iterator *step_iv_si = NULL;
10431 : 12465 : if (nested_in_vect_loop)
10432 : : /* iv_loop is nested in the loop to be vectorized. Generate:
10433 : : vec_step = [S, S, S, S] */
10434 : 79 : new_name = step_expr;
10435 : 12386 : else if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
10436 : : {
10437 : : /* When we're using loop_len produced by SELEC_VL, the non-final
10438 : : iterations are not always processing VF elements. So vectorize
10439 : : induction variable instead of
10440 : :
10441 : : _21 = vect_vec_iv_.6_22 + { VF, ... };
10442 : :
10443 : : We should generate:
10444 : :
10445 : : _35 = .SELECT_VL (ivtmp_33, VF);
10446 : : vect_cst__22 = [vec_duplicate_expr] _35;
10447 : : _21 = vect_vec_iv_.6_22 + vect_cst__22; */
10448 : 0 : gcc_assert (!slp_node);
10449 : 0 : gimple_seq seq = NULL;
10450 : 0 : vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
10451 : 0 : tree len = vect_get_loop_len (loop_vinfo, NULL, lens, 1, vectype, 0, 0);
10452 : 0 : expr = force_gimple_operand (fold_convert (TREE_TYPE (step_expr),
10453 : : unshare_expr (len)),
10454 : : &seq, true, NULL_TREE);
10455 : 0 : new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr), expr,
10456 : : step_expr);
10457 : 0 : gsi_insert_seq_before (&si, seq, GSI_SAME_STMT);
10458 : 0 : step_iv_si = &si;
10459 : : }
10460 : : else
10461 : : {
10462 : : /* iv_loop is the loop to be vectorized. Generate:
10463 : : vec_step = [VF*S, VF*S, VF*S, VF*S] */
10464 : 12386 : gimple_seq seq = NULL;
10465 : 12386 : if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10466 : : {
10467 : 20 : expr = build_int_cst (integer_type_node, vf);
10468 : 20 : expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10469 : : }
10470 : : else
10471 : 12366 : expr = build_int_cst (TREE_TYPE (step_expr), vf);
10472 : 12386 : new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10473 : : expr, step_expr);
10474 : 12386 : if (seq)
10475 : : {
10476 : 347 : new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10477 : 347 : gcc_assert (!new_bb);
10478 : : }
10479 : : }
10480 : :
10481 : 12465 : t = unshare_expr (new_name);
10482 : 12465 : gcc_assert (CONSTANT_CLASS_P (new_name)
10483 : : || TREE_CODE (new_name) == SSA_NAME);
10484 : 12465 : new_vec = build_vector_from_val (step_vectype, t);
10485 : 12465 : vec_step = vect_init_vector (loop_vinfo, stmt_info,
10486 : : new_vec, step_vectype, step_iv_si);
10487 : :
10488 : :
10489 : : /* Create the following def-use cycle:
10490 : : loop prolog:
10491 : : vec_init = ...
10492 : : vec_step = ...
10493 : : loop:
10494 : : vec_iv = PHI <vec_init, vec_loop>
10495 : : ...
10496 : : STMT
10497 : : ...
10498 : : vec_loop = vec_iv + vec_step; */
10499 : :
10500 : : /* Create the induction-phi that defines the induction-operand. */
10501 : 12465 : vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
10502 : 12465 : induction_phi = create_phi_node (vec_dest, iv_loop->header);
10503 : 12465 : induc_def = PHI_RESULT (induction_phi);
10504 : :
10505 : : /* Create the iv update inside the loop */
10506 : 12465 : stmts = NULL;
10507 : 12465 : vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10508 : 12465 : vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
10509 : 12465 : vec_def = gimple_convert (&stmts, vectype, vec_def);
10510 : 12465 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10511 : 12465 : new_stmt = SSA_NAME_DEF_STMT (vec_def);
10512 : :
10513 : : /* Set the arguments of the phi node: */
10514 : 12465 : add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10515 : 12465 : add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10516 : : UNKNOWN_LOCATION);
10517 : :
10518 : 12465 : STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
10519 : 12465 : *vec_stmt = induction_phi;
10520 : :
10521 : : /* In case that vectorization factor (VF) is bigger than the number
10522 : : of elements that we can fit in a vectype (nunits), we have to generate
10523 : : more than one vector stmt - i.e - we need to "unroll" the
10524 : : vector stmt by a factor VF/nunits. For more details see documentation
10525 : : in vectorizable_operation. */
10526 : :
10527 : 12465 : if (ncopies > 1)
10528 : : {
10529 : 2643 : gimple_seq seq = NULL;
10530 : : /* FORNOW. This restriction should be relaxed. */
10531 : 2643 : gcc_assert (!nested_in_vect_loop);
10532 : : /* We expect LOOP_VINFO_USING_SELECT_VL_P to be false if ncopies > 1. */
10533 : 2643 : gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
10534 : :
10535 : : /* Create the vector that holds the step of the induction. */
10536 : 2643 : if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10537 : : {
10538 : 8 : expr = build_int_cst (integer_type_node, nunits);
10539 : 8 : expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10540 : : }
10541 : : else
10542 : 2635 : expr = build_int_cst (TREE_TYPE (step_expr), nunits);
10543 : 2643 : new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10544 : : expr, step_expr);
10545 : 2643 : if (seq)
10546 : : {
10547 : 149 : new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10548 : 149 : gcc_assert (!new_bb);
10549 : : }
10550 : :
10551 : 2643 : t = unshare_expr (new_name);
10552 : 2643 : gcc_assert (CONSTANT_CLASS_P (new_name)
10553 : : || TREE_CODE (new_name) == SSA_NAME);
10554 : 2643 : new_vec = build_vector_from_val (step_vectype, t);
10555 : 2643 : vec_step = vect_init_vector (loop_vinfo, stmt_info,
10556 : : new_vec, step_vectype, NULL);
10557 : :
10558 : 2643 : vec_def = induc_def;
10559 : 10617 : for (i = 1; i < ncopies + 1; i++)
10560 : : {
10561 : : /* vec_i = vec_prev + vec_step */
10562 : 7974 : gimple_seq stmts = NULL;
10563 : 7974 : vec_def = gimple_convert (&stmts, step_vectype, vec_def);
10564 : 7974 : vec_def = gimple_build (&stmts,
10565 : : PLUS_EXPR, step_vectype, vec_def, vec_step);
10566 : 7974 : vec_def = gimple_convert (&stmts, vectype, vec_def);
10567 : :
10568 : 7974 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10569 : 7974 : if (i < ncopies)
10570 : : {
10571 : 5331 : new_stmt = SSA_NAME_DEF_STMT (vec_def);
10572 : 5331 : STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
10573 : : }
10574 : : else
10575 : : {
10576 : : /* vec_1 = vec_iv + (VF/n * S)
10577 : : vec_2 = vec_1 + (VF/n * S)
10578 : : ...
10579 : : vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
10580 : :
10581 : : vec_n is used as vec_loop to save the large step register and
10582 : : related operations. */
10583 : 2643 : add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10584 : : UNKNOWN_LOCATION);
10585 : : }
10586 : : }
10587 : : }
10588 : :
10589 : 12465 : if (dump_enabled_p ())
10590 : 5266 : dump_printf_loc (MSG_NOTE, vect_location,
10591 : : "transform induction: created def-use cycle: %G%G",
10592 : 2633 : (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
10593 : :
10594 : : return true;
10595 : : }
10596 : :
10597 : : /* Function vectorizable_live_operation_1.
10598 : :
10599 : : helper function for vectorizable_live_operation. */
10600 : :
10601 : : static tree
10602 : 4659 : vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
10603 : : stmt_vec_info stmt_info, basic_block exit_bb,
10604 : : tree vectype, int ncopies, slp_tree slp_node,
10605 : : tree bitsize, tree bitstart, tree vec_lhs,
10606 : : tree lhs_type, gimple_stmt_iterator *exit_gsi)
10607 : : {
10608 : 9318 : gcc_assert (single_pred_p (exit_bb) || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10609 : :
10610 : 4659 : tree vec_lhs_phi = copy_ssa_name (vec_lhs);
10611 : 4659 : gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
10612 : 9684 : for (unsigned i = 0; i < gimple_phi_num_args (phi); i++)
10613 : 5025 : SET_PHI_ARG_DEF (phi, i, vec_lhs);
10614 : :
10615 : 4659 : gimple_seq stmts = NULL;
10616 : 4659 : tree new_tree;
10617 : :
10618 : : /* If bitstart is 0 then we can use a BIT_FIELD_REF */
10619 : 4659 : if (integer_zerop (bitstart))
10620 : : {
10621 : 2232 : tree scalar_res = gimple_build (&stmts, BIT_FIELD_REF, TREE_TYPE (vectype),
10622 : : vec_lhs_phi, bitsize, bitstart);
10623 : :
10624 : : /* Convert the extracted vector element to the scalar type. */
10625 : 2232 : new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10626 : : }
10627 : 2427 : else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10628 : : {
10629 : : /* Emit:
10630 : :
10631 : : SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
10632 : :
10633 : : where VEC_LHS is the vectorized live-out result and MASK is
10634 : : the loop mask for the final iteration. */
10635 : 0 : gcc_assert (ncopies == 1 && !slp_node);
10636 : 0 : gimple_seq tem = NULL;
10637 : 0 : gimple_stmt_iterator gsi = gsi_last (tem);
10638 : 0 : tree len = vect_get_loop_len (loop_vinfo, &gsi,
10639 : : &LOOP_VINFO_LENS (loop_vinfo),
10640 : : 1, vectype, 0, 0);
10641 : :
10642 : : /* BIAS - 1. */
10643 : 0 : signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
10644 : 0 : tree bias_minus_one
10645 : 0 : = int_const_binop (MINUS_EXPR,
10646 : 0 : build_int_cst (TREE_TYPE (len), biasval),
10647 : 0 : build_one_cst (TREE_TYPE (len)));
10648 : :
10649 : : /* LAST_INDEX = LEN + (BIAS - 1). */
10650 : 0 : tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
10651 : : len, bias_minus_one);
10652 : :
10653 : : /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>. */
10654 : 0 : tree scalar_res
10655 : 0 : = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
10656 : : vec_lhs_phi, last_index);
10657 : :
10658 : : /* Convert the extracted vector element to the scalar type. */
10659 : 0 : new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10660 : : }
10661 : 2427 : else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
10662 : : {
10663 : : /* Emit:
10664 : :
10665 : : SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
10666 : :
10667 : : where VEC_LHS is the vectorized live-out result and MASK is
10668 : : the loop mask for the final iteration. */
10669 : 0 : gcc_assert (!slp_node);
10670 : 0 : tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
10671 : 0 : gimple_seq tem = NULL;
10672 : 0 : gimple_stmt_iterator gsi = gsi_last (tem);
10673 : 0 : tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
10674 : : &LOOP_VINFO_MASKS (loop_vinfo),
10675 : : 1, vectype, 0);
10676 : 0 : tree scalar_res;
10677 : 0 : gimple_seq_add_seq (&stmts, tem);
10678 : :
10679 : 0 : scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
10680 : : mask, vec_lhs_phi);
10681 : :
10682 : : /* Convert the extracted vector element to the scalar type. */
10683 : 0 : new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10684 : : }
10685 : : else
10686 : : {
10687 : 2427 : tree bftype = TREE_TYPE (vectype);
10688 : 2427 : if (VECTOR_BOOLEAN_TYPE_P (vectype))
10689 : 112 : bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10690 : 2427 : new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
10691 : 2427 : new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10692 : : &stmts, true, NULL_TREE);
10693 : : }
10694 : :
10695 : 4659 : *exit_gsi = gsi_after_labels (exit_bb);
10696 : 4659 : if (stmts)
10697 : 4659 : gsi_insert_seq_before (exit_gsi, stmts, GSI_SAME_STMT);
10698 : :
10699 : 4659 : return new_tree;
10700 : : }
10701 : :
10702 : : /* Function vectorizable_live_operation.
10703 : :
10704 : : STMT_INFO computes a value that is used outside the loop. Check if
10705 : : it can be supported. */
10706 : :
10707 : : bool
10708 : 234705 : vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
10709 : : slp_tree slp_node, slp_instance slp_node_instance,
10710 : : int slp_index, bool vec_stmt_p,
10711 : : stmt_vector_for_cost *cost_vec)
10712 : : {
10713 : 234705 : loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
10714 : 234705 : imm_use_iterator imm_iter;
10715 : 234705 : tree lhs, lhs_type, bitsize;
10716 : 469410 : tree vectype = (slp_node
10717 : 234705 : ? SLP_TREE_VECTYPE (slp_node)
10718 : : : STMT_VINFO_VECTYPE (stmt_info));
10719 : 234705 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10720 : 234705 : int ncopies;
10721 : 234705 : gimple *use_stmt;
10722 : 234705 : use_operand_p use_p;
10723 : 234705 : auto_vec<tree> vec_oprnds;
10724 : 234705 : int vec_entry = 0;
10725 : 234705 : poly_uint64 vec_index = 0;
10726 : :
10727 : 234705 : gcc_assert (STMT_VINFO_LIVE_P (stmt_info)
10728 : : || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10729 : :
10730 : : /* If a stmt of a reduction is live, vectorize it via
10731 : : vect_create_epilog_for_reduction. vectorizable_reduction assessed
10732 : : validity so just trigger the transform here. */
10733 : 236282 : if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
10734 : : {
10735 : 56594 : if (!vec_stmt_p)
10736 : : return true;
10737 : : /* For SLP reductions we vectorize the epilogue for all involved stmts
10738 : : together. */
10739 : 17997 : if (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info) && slp_index != 0)
10740 : : return true;
10741 : 17296 : stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
10742 : 17296 : gcc_assert (reduc_info->is_reduc_info);
10743 : 17296 : if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
10744 : 17296 : || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
10745 : : return true;
10746 : :
10747 : 16655 : if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
10748 : 16655 : || !LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
10749 : 16653 : vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
10750 : : slp_node_instance,
10751 : : LOOP_VINFO_IV_EXIT (loop_vinfo));
10752 : :
10753 : : /* If early break we only have to materialize the reduction on the merge
10754 : : block, but we have to find an alternate exit first. */
10755 : 16655 : if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
10756 : : {
10757 : 14 : slp_tree phis_node = slp_node ? slp_node_instance->reduc_phis : NULL;
10758 : 42 : for (auto exit : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
10759 : 14 : if (exit != LOOP_VINFO_IV_EXIT (loop_vinfo))
10760 : : {
10761 : 14 : vect_create_epilog_for_reduction (loop_vinfo, reduc_info,
10762 : : phis_node, slp_node_instance,
10763 : : exit);
10764 : 14 : break;
10765 : 14 : }
10766 : 14 : if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
10767 : 2 : vect_create_epilog_for_reduction (loop_vinfo, reduc_info,
10768 : : phis_node, slp_node_instance,
10769 : : LOOP_VINFO_IV_EXIT (loop_vinfo));
10770 : : }
10771 : :
10772 : 16655 : return true;
10773 : : }
10774 : :
10775 : : /* If STMT is not relevant and it is a simple assignment and its inputs are
10776 : : invariant then it can remain in place, unvectorized. The original last
10777 : : scalar value that it computes will be used. */
10778 : 178111 : if (!STMT_VINFO_RELEVANT_P (stmt_info))
10779 : : {
10780 : 76 : gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
10781 : 76 : if (dump_enabled_p ())
10782 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
10783 : : "statement is simple and uses invariant. Leaving in "
10784 : : "place.\n");
10785 : 76 : return true;
10786 : : }
10787 : :
10788 : 178035 : if (slp_node)
10789 : : ncopies = 1;
10790 : : else
10791 : 54672 : ncopies = vect_get_num_copies (loop_vinfo, vectype);
10792 : :
10793 : 54672 : if (slp_node)
10794 : : {
10795 : 123363 : gcc_assert (slp_index >= 0);
10796 : :
10797 : : /* Get the last occurrence of the scalar index from the concatenation of
10798 : : all the slp vectors. Calculate which slp vector it is and the index
10799 : : within. */
10800 : 123363 : int num_scalar = SLP_TREE_LANES (slp_node);
10801 : 123363 : int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10802 : 123363 : poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
10803 : :
10804 : : /* Calculate which vector contains the result, and which lane of
10805 : : that vector we need. */
10806 : 123363 : if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
10807 : : {
10808 : : if (dump_enabled_p ())
10809 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10810 : : "Cannot determine which vector holds the"
10811 : : " final result.\n");
10812 : : return false;
10813 : : }
10814 : : }
10815 : :
10816 : 178035 : if (!vec_stmt_p)
10817 : : {
10818 : : /* No transformation required. */
10819 : 133949 : if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
10820 : : {
10821 : 3 : if (slp_node)
10822 : : {
10823 : 0 : if (dump_enabled_p ())
10824 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10825 : : "can't operate on partial vectors "
10826 : : "because an SLP statement is live after "
10827 : : "the loop.\n");
10828 : 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10829 : : }
10830 : 3 : else if (ncopies > 1)
10831 : : {
10832 : 0 : if (dump_enabled_p ())
10833 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10834 : : "can't operate on partial vectors "
10835 : : "because ncopies is greater than 1.\n");
10836 : 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10837 : : }
10838 : : else
10839 : : {
10840 : 3 : gcc_assert (ncopies == 1 && !slp_node);
10841 : 3 : if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
10842 : : OPTIMIZE_FOR_SPEED))
10843 : 0 : vect_record_loop_mask (loop_vinfo,
10844 : : &LOOP_VINFO_MASKS (loop_vinfo),
10845 : : 1, vectype, NULL);
10846 : 3 : else if (can_vec_extract_var_idx_p (
10847 : 3 : TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
10848 : 0 : vect_record_loop_len (loop_vinfo,
10849 : : &LOOP_VINFO_LENS (loop_vinfo),
10850 : : 1, vectype, 1);
10851 : : else
10852 : : {
10853 : 3 : if (dump_enabled_p ())
10854 : 3 : dump_printf_loc (
10855 : : MSG_MISSED_OPTIMIZATION, vect_location,
10856 : : "can't operate on partial vectors "
10857 : : "because the target doesn't support extract "
10858 : : "last reduction.\n");
10859 : 3 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10860 : : }
10861 : : }
10862 : : }
10863 : : /* ??? Enable for loop costing as well. */
10864 : 3 : if (!loop_vinfo)
10865 : 83361 : record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
10866 : : 0, vect_epilogue);
10867 : 133949 : return true;
10868 : : }
10869 : :
10870 : : /* Use the lhs of the original scalar statement. */
10871 : 44086 : gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
10872 : 44086 : if (dump_enabled_p ())
10873 : 1468 : dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
10874 : : "stmt %G", stmt);
10875 : :
10876 : 44086 : lhs = gimple_get_lhs (stmt);
10877 : 44086 : lhs_type = TREE_TYPE (lhs);
10878 : :
10879 : 44086 : bitsize = vector_element_bits_tree (vectype);
10880 : :
10881 : : /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
10882 : 44086 : tree vec_lhs, vec_lhs0, bitstart;
10883 : 44086 : gimple *vec_stmt, *vec_stmt0;
10884 : 44086 : if (slp_node)
10885 : : {
10886 : 39610 : gcc_assert (!loop_vinfo
10887 : : || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
10888 : : && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
10889 : :
10890 : : /* Get the correct slp vectorized stmt. */
10891 : 39610 : vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
10892 : 39610 : vec_stmt = SSA_NAME_DEF_STMT (vec_lhs);
10893 : :
10894 : : /* In case we need to early break vectorize also get the first stmt. */
10895 : 39610 : vec_lhs0 = SLP_TREE_VEC_DEFS (slp_node)[0];
10896 : 39610 : vec_stmt0 = SSA_NAME_DEF_STMT (vec_lhs0);
10897 : :
10898 : : /* Get entry to use. */
10899 : 39610 : bitstart = bitsize_int (vec_index);
10900 : 39610 : bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
10901 : : }
10902 : : else
10903 : : {
10904 : : /* For multiple copies, get the last copy. */
10905 : 4476 : vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
10906 : 4476 : vec_lhs = gimple_get_lhs (vec_stmt);
10907 : :
10908 : : /* In case we need to early break vectorize also get the first stmt. */
10909 : 4476 : vec_stmt0 = STMT_VINFO_VEC_STMTS (stmt_info)[0];
10910 : 4476 : vec_lhs0 = gimple_get_lhs (vec_stmt0);
10911 : :
10912 : : /* Get the last lane in the vector. */
10913 : 4476 : bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
10914 : : }
10915 : :
10916 : 44086 : if (loop_vinfo)
10917 : : {
10918 : : /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
10919 : : requirement, insert one phi node for it. It looks like:
10920 : : loop;
10921 : : BB:
10922 : : # lhs' = PHI <lhs>
10923 : : ==>
10924 : : loop;
10925 : : BB:
10926 : : # vec_lhs' = PHI <vec_lhs>
10927 : : new_tree = lane_extract <vec_lhs', ...>;
10928 : : lhs' = new_tree; */
10929 : :
10930 : 4718 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10931 : : /* Check if we have a loop where the chosen exit is not the main exit,
10932 : : in these cases for an early break we restart the iteration the vector code
10933 : : did. For the live values we want the value at the start of the iteration
10934 : : rather than at the end. */
10935 : 4718 : edge main_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
10936 : 4718 : bool all_exits_as_early_p = LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo);
10937 : 20474 : FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10938 : 15756 : if (!is_gimple_debug (use_stmt)
10939 : 15756 : && !flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
10940 : 4659 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10941 : : {
10942 : 4659 : edge e = gimple_phi_arg_edge (as_a <gphi *> (use_stmt),
10943 : 4659 : phi_arg_index_from_use (use_p));
10944 : 4659 : gcc_assert (loop_exit_edge_p (loop, e));
10945 : 4659 : bool main_exit_edge = e == main_e;
10946 : 4659 : tree tmp_vec_lhs = vec_lhs;
10947 : 4659 : tree tmp_bitstart = bitstart;
10948 : :
10949 : : /* For early exit where the exit is not in the BB that leads
10950 : : to the latch then we're restarting the iteration in the
10951 : : scalar loop. So get the first live value. */
10952 : 4659 : if ((all_exits_as_early_p || !main_exit_edge)
10953 : 2212 : && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
10954 : : {
10955 : 2208 : tmp_vec_lhs = vec_lhs0;
10956 : 2208 : tmp_bitstart = build_zero_cst (TREE_TYPE (bitstart));
10957 : : }
10958 : :
10959 : 4659 : gimple_stmt_iterator exit_gsi;
10960 : 4659 : tree new_tree
10961 : 4659 : = vectorizable_live_operation_1 (loop_vinfo, stmt_info,
10962 : : e->dest, vectype, ncopies,
10963 : : slp_node, bitsize,
10964 : : tmp_bitstart, tmp_vec_lhs,
10965 : : lhs_type, &exit_gsi);
10966 : :
10967 : 4659 : auto gsi = gsi_for_stmt (use_stmt);
10968 : 4659 : tree lhs_phi = gimple_phi_result (use_stmt);
10969 : 4659 : remove_phi_node (&gsi, false);
10970 : 4659 : gimple *copy = gimple_build_assign (lhs_phi, new_tree);
10971 : 4659 : gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
10972 : 4659 : break;
10973 : 4718 : }
10974 : :
10975 : : /* There a no further out-of-loop uses of lhs by LC-SSA construction. */
10976 : 15815 : FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10977 : 15815 : gcc_assert (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)));
10978 : : }
10979 : : else
10980 : : {
10981 : : /* For basic-block vectorization simply insert the lane-extraction. */
10982 : 39368 : tree bftype = TREE_TYPE (vectype);
10983 : 39368 : if (VECTOR_BOOLEAN_TYPE_P (vectype))
10984 : 0 : bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10985 : 39368 : tree new_tree = build3 (BIT_FIELD_REF, bftype,
10986 : : vec_lhs, bitsize, bitstart);
10987 : 39368 : gimple_seq stmts = NULL;
10988 : 39368 : new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10989 : : &stmts, true, NULL_TREE);
10990 : 39368 : if (TREE_CODE (new_tree) == SSA_NAME
10991 : 78736 : && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
10992 : 3 : SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
10993 : 39368 : if (is_a <gphi *> (vec_stmt))
10994 : : {
10995 : 3135 : gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
10996 : 3135 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10997 : : }
10998 : : else
10999 : : {
11000 : 36233 : gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
11001 : 36233 : gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
11002 : : }
11003 : :
11004 : : /* Replace use of lhs with newly computed result. If the use stmt is a
11005 : : single arg PHI, just replace all uses of PHI result. It's necessary
11006 : : because lcssa PHI defining lhs may be before newly inserted stmt. */
11007 : 39368 : use_operand_p use_p;
11008 : 39368 : stmt_vec_info use_stmt_info;
11009 : 202752 : FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
11010 : 163384 : if (!is_gimple_debug (use_stmt)
11011 : 163384 : && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
11012 : 114955 : || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
11013 : : {
11014 : : /* ??? This can happen when the live lane ends up being
11015 : : rooted in a vector construction code-generated by an
11016 : : external SLP node (and code-generation for that already
11017 : : happened). See gcc.dg/vect/bb-slp-47.c.
11018 : : Doing this is what would happen if that vector CTOR
11019 : : were not code-generated yet so it is not too bad.
11020 : : ??? In fact we'd likely want to avoid this situation
11021 : : in the first place. */
11022 : 70961 : if (TREE_CODE (new_tree) == SSA_NAME
11023 : 70877 : && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
11024 : 70877 : && gimple_code (use_stmt) != GIMPLE_PHI
11025 : 134805 : && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
11026 : : use_stmt))
11027 : : {
11028 : 84 : if (dump_enabled_p ())
11029 : 16 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11030 : : "Using original scalar computation for "
11031 : : "live lane because use preceeds vector "
11032 : : "def\n");
11033 : 84 : continue;
11034 : : }
11035 : : /* ??? It can also happen that we end up pulling a def into
11036 : : a loop where replacing out-of-loop uses would require
11037 : : a new LC SSA PHI node. Retain the original scalar in
11038 : : those cases as well. PR98064. */
11039 : 73276 : if (TREE_CODE (new_tree) == SSA_NAME
11040 : 70793 : && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
11041 : 70793 : && (gimple_bb (use_stmt)->loop_father
11042 : 70793 : != gimple_bb (vec_stmt)->loop_father)
11043 : 79535 : && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
11044 : 8742 : gimple_bb (use_stmt)->loop_father))
11045 : : {
11046 : 2483 : if (dump_enabled_p ())
11047 : 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11048 : : "Using original scalar computation for "
11049 : : "live lane because there is an out-of-loop "
11050 : : "definition for it\n");
11051 : 2483 : continue;
11052 : : }
11053 : 207104 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
11054 : 69397 : SET_USE (use_p, new_tree);
11055 : 68310 : update_stmt (use_stmt);
11056 : 39368 : }
11057 : : }
11058 : :
11059 : : return true;
11060 : 234705 : }
11061 : :
11062 : : /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
11063 : :
11064 : : static void
11065 : 182383 : vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
11066 : : {
11067 : 182383 : ssa_op_iter op_iter;
11068 : 182383 : imm_use_iterator imm_iter;
11069 : 182383 : def_operand_p def_p;
11070 : 182383 : gimple *ustmt;
11071 : :
11072 : 525560 : FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
11073 : : {
11074 : 400632 : FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
11075 : : {
11076 : 239838 : basic_block bb;
11077 : :
11078 : 239838 : if (!is_gimple_debug (ustmt))
11079 : 217966 : continue;
11080 : :
11081 : 21872 : bb = gimple_bb (ustmt);
11082 : :
11083 : 21872 : if (!flow_bb_inside_loop_p (loop, bb))
11084 : : {
11085 : 6 : if (gimple_debug_bind_p (ustmt))
11086 : : {
11087 : 6 : if (dump_enabled_p ())
11088 : 6 : dump_printf_loc (MSG_NOTE, vect_location,
11089 : : "killing debug use\n");
11090 : :
11091 : 6 : gimple_debug_bind_reset_value (ustmt);
11092 : 6 : update_stmt (ustmt);
11093 : : }
11094 : : else
11095 : 0 : gcc_unreachable ();
11096 : : }
11097 : 160794 : }
11098 : : }
11099 : 182383 : }
11100 : :
11101 : : /* Given loop represented by LOOP_VINFO, return true if computation of
11102 : : LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
11103 : : otherwise. */
11104 : :
11105 : : static bool
11106 : 45294 : loop_niters_no_overflow (loop_vec_info loop_vinfo)
11107 : : {
11108 : : /* Constant case. */
11109 : 45294 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
11110 : : {
11111 : 28874 : tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
11112 : 28874 : tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
11113 : :
11114 : 28874 : gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
11115 : 28874 : gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
11116 : 28874 : if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
11117 : : return true;
11118 : : }
11119 : :
11120 : 16420 : widest_int max;
11121 : 16420 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11122 : : /* Check the upper bound of loop niters. */
11123 : 16420 : if (get_max_loop_iterations (loop, &max))
11124 : : {
11125 : 16420 : tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
11126 : 16420 : signop sgn = TYPE_SIGN (type);
11127 : 16420 : widest_int type_max = widest_int::from (wi::max_value (type), sgn);
11128 : 16420 : if (max < type_max)
11129 : 16355 : return true;
11130 : 16420 : }
11131 : : return false;
11132 : 16420 : }
11133 : :
11134 : : /* Return a mask type with half the number of elements as OLD_TYPE,
11135 : : given that it should have mode NEW_MODE. */
11136 : :
11137 : : tree
11138 : 2101 : vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
11139 : : {
11140 : 2101 : poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
11141 : 2101 : return build_truth_vector_type_for_mode (nunits, new_mode);
11142 : : }
11143 : :
11144 : : /* Return a mask type with twice as many elements as OLD_TYPE,
11145 : : given that it should have mode NEW_MODE. */
11146 : :
11147 : : tree
11148 : 1029 : vect_double_mask_nunits (tree old_type, machine_mode new_mode)
11149 : : {
11150 : 1029 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
11151 : 1029 : return build_truth_vector_type_for_mode (nunits, new_mode);
11152 : : }
11153 : :
11154 : : /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
11155 : : contain a sequence of NVECTORS masks that each control a vector of type
11156 : : VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
11157 : : these vector masks with the vector version of SCALAR_MASK. */
11158 : :
11159 : : void
11160 : 53 : vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
11161 : : unsigned int nvectors, tree vectype, tree scalar_mask)
11162 : : {
11163 : 53 : gcc_assert (nvectors != 0);
11164 : :
11165 : 53 : if (scalar_mask)
11166 : : {
11167 : 18 : scalar_cond_masked_key cond (scalar_mask, nvectors);
11168 : 18 : loop_vinfo->scalar_cond_masked_set.add (cond);
11169 : : }
11170 : :
11171 : 53 : masks->mask_set.add (std::make_pair (vectype, nvectors));
11172 : 53 : }
11173 : :
11174 : : /* Given a complete set of masks MASKS, extract mask number INDEX
11175 : : for an rgroup that operates on NVECTORS vectors of type VECTYPE,
11176 : : where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
11177 : :
11178 : : See the comment above vec_loop_masks for more details about the mask
11179 : : arrangement. */
11180 : :
11181 : : tree
11182 : 58 : vect_get_loop_mask (loop_vec_info loop_vinfo,
11183 : : gimple_stmt_iterator *gsi, vec_loop_masks *masks,
11184 : : unsigned int nvectors, tree vectype, unsigned int index)
11185 : : {
11186 : 58 : if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
11187 : : == vect_partial_vectors_while_ult)
11188 : : {
11189 : 0 : rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
11190 : 0 : tree mask_type = rgm->type;
11191 : :
11192 : : /* Populate the rgroup's mask array, if this is the first time we've
11193 : : used it. */
11194 : 0 : if (rgm->controls.is_empty ())
11195 : : {
11196 : 0 : rgm->controls.safe_grow_cleared (nvectors, true);
11197 : 0 : for (unsigned int i = 0; i < nvectors; ++i)
11198 : : {
11199 : 0 : tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
11200 : : /* Provide a dummy definition until the real one is available. */
11201 : 0 : SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
11202 : 0 : rgm->controls[i] = mask;
11203 : : }
11204 : : }
11205 : :
11206 : 0 : tree mask = rgm->controls[index];
11207 : 0 : if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
11208 : 0 : TYPE_VECTOR_SUBPARTS (vectype)))
11209 : : {
11210 : : /* A loop mask for data type X can be reused for data type Y
11211 : : if X has N times more elements than Y and if Y's elements
11212 : : are N times bigger than X's. In this case each sequence
11213 : : of N elements in the loop mask will be all-zero or all-one.
11214 : : We can then view-convert the mask so that each sequence of
11215 : : N elements is replaced by a single element. */
11216 : 0 : gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
11217 : : TYPE_VECTOR_SUBPARTS (vectype)));
11218 : 0 : gimple_seq seq = NULL;
11219 : 0 : mask_type = truth_type_for (vectype);
11220 : 0 : mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
11221 : 0 : if (seq)
11222 : 0 : gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11223 : : }
11224 : 0 : return mask;
11225 : : }
11226 : 58 : else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
11227 : : == vect_partial_vectors_avx512)
11228 : : {
11229 : : /* The number of scalars per iteration and the number of vectors are
11230 : : both compile-time constants. */
11231 : 58 : unsigned int nscalars_per_iter
11232 : 58 : = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11233 : 58 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
11234 : :
11235 : 58 : rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];
11236 : :
11237 : : /* The stored nV is dependent on the mask type produced. */
11238 : 58 : gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11239 : : TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
11240 : : == rgm->factor);
11241 : 58 : nvectors = rgm->factor;
11242 : :
11243 : : /* Populate the rgroup's mask array, if this is the first time we've
11244 : : used it. */
11245 : 58 : if (rgm->controls.is_empty ())
11246 : : {
11247 : 5 : rgm->controls.safe_grow_cleared (nvectors, true);
11248 : 11 : for (unsigned int i = 0; i < nvectors; ++i)
11249 : : {
11250 : 6 : tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
11251 : : /* Provide a dummy definition until the real one is available. */
11252 : 6 : SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
11253 : 6 : rgm->controls[i] = mask;
11254 : : }
11255 : : }
11256 : 58 : if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
11257 : : TYPE_VECTOR_SUBPARTS (vectype)))
11258 : 58 : return rgm->controls[index];
11259 : :
11260 : : /* Split the vector if needed. Since we are dealing with integer mode
11261 : : masks with AVX512 we can operate on the integer representation
11262 : : performing the whole vector shifting. */
11263 : 0 : unsigned HOST_WIDE_INT factor;
11264 : 0 : bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
11265 : 0 : TYPE_VECTOR_SUBPARTS (vectype), &factor);
11266 : 0 : gcc_assert (ok);
11267 : 0 : gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
11268 : 0 : tree mask_type = truth_type_for (vectype);
11269 : 0 : gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
11270 : 0 : unsigned vi = index / factor;
11271 : 0 : unsigned vpart = index % factor;
11272 : 0 : tree vec = rgm->controls[vi];
11273 : 0 : gimple_seq seq = NULL;
11274 : 0 : vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
11275 : 0 : lang_hooks.types.type_for_mode
11276 : 0 : (TYPE_MODE (rgm->type), 1), vec);
11277 : : /* For integer mode masks simply shift the right bits into position. */
11278 : 0 : if (vpart != 0)
11279 : 0 : vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
11280 : : build_int_cst (integer_type_node,
11281 : 0 : (TYPE_VECTOR_SUBPARTS (vectype)
11282 : 0 : * vpart)));
11283 : 0 : vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
11284 : 0 : (TYPE_MODE (mask_type), 1), vec);
11285 : 0 : vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
11286 : 0 : if (seq)
11287 : 0 : gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11288 : 0 : return vec;
11289 : : }
11290 : : else
11291 : 0 : gcc_unreachable ();
11292 : : }
11293 : :
11294 : : /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
11295 : : lengths for controlling an operation on VECTYPE. The operation splits
11296 : : each element of VECTYPE into FACTOR separate subelements, measuring the
11297 : : length as a number of these subelements. */
11298 : :
11299 : : void
11300 : 0 : vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
11301 : : unsigned int nvectors, tree vectype, unsigned int factor)
11302 : : {
11303 : 0 : gcc_assert (nvectors != 0);
11304 : 0 : if (lens->length () < nvectors)
11305 : 0 : lens->safe_grow_cleared (nvectors, true);
11306 : 0 : rgroup_controls *rgl = &(*lens)[nvectors - 1];
11307 : :
11308 : : /* The number of scalars per iteration, scalar occupied bytes and
11309 : : the number of vectors are both compile-time constants. */
11310 : 0 : unsigned int nscalars_per_iter
11311 : 0 : = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11312 : 0 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
11313 : :
11314 : 0 : if (rgl->max_nscalars_per_iter < nscalars_per_iter)
11315 : : {
11316 : : /* For now, we only support cases in which all loads and stores fall back
11317 : : to VnQI or none do. */
11318 : 0 : gcc_assert (!rgl->max_nscalars_per_iter
11319 : : || (rgl->factor == 1 && factor == 1)
11320 : : || (rgl->max_nscalars_per_iter * rgl->factor
11321 : : == nscalars_per_iter * factor));
11322 : 0 : rgl->max_nscalars_per_iter = nscalars_per_iter;
11323 : 0 : rgl->type = vectype;
11324 : 0 : rgl->factor = factor;
11325 : : }
11326 : 0 : }
11327 : :
11328 : : /* Given a complete set of lengths LENS, extract length number INDEX
11329 : : for an rgroup that operates on NVECTORS vectors of type VECTYPE,
11330 : : where 0 <= INDEX < NVECTORS. Return a value that contains FACTOR
11331 : : multipled by the number of elements that should be processed.
11332 : : Insert any set-up statements before GSI. */
11333 : :
11334 : : tree
11335 : 0 : vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
11336 : : vec_loop_lens *lens, unsigned int nvectors, tree vectype,
11337 : : unsigned int index, unsigned int factor)
11338 : : {
11339 : 0 : rgroup_controls *rgl = &(*lens)[nvectors - 1];
11340 : 0 : bool use_bias_adjusted_len =
11341 : 0 : LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
11342 : :
11343 : : /* Populate the rgroup's len array, if this is the first time we've
11344 : : used it. */
11345 : 0 : if (rgl->controls.is_empty ())
11346 : : {
11347 : 0 : rgl->controls.safe_grow_cleared (nvectors, true);
11348 : 0 : for (unsigned int i = 0; i < nvectors; ++i)
11349 : : {
11350 : 0 : tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
11351 : 0 : gcc_assert (len_type != NULL_TREE);
11352 : :
11353 : 0 : tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
11354 : :
11355 : : /* Provide a dummy definition until the real one is available. */
11356 : 0 : SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
11357 : 0 : rgl->controls[i] = len;
11358 : :
11359 : 0 : if (use_bias_adjusted_len)
11360 : : {
11361 : 0 : gcc_assert (i == 0);
11362 : 0 : tree adjusted_len =
11363 : 0 : make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
11364 : 0 : SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
11365 : 0 : rgl->bias_adjusted_ctrl = adjusted_len;
11366 : : }
11367 : : }
11368 : : }
11369 : :
11370 : 0 : if (use_bias_adjusted_len)
11371 : 0 : return rgl->bias_adjusted_ctrl;
11372 : :
11373 : 0 : tree loop_len = rgl->controls[index];
11374 : 0 : if (rgl->factor == 1 && factor == 1)
11375 : : {
11376 : 0 : poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
11377 : 0 : poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
11378 : 0 : if (maybe_ne (nunits1, nunits2))
11379 : : {
11380 : : /* A loop len for data type X can be reused for data type Y
11381 : : if X has N times more elements than Y and if Y's elements
11382 : : are N times bigger than X's. */
11383 : 0 : gcc_assert (multiple_p (nunits1, nunits2));
11384 : 0 : factor = exact_div (nunits1, nunits2).to_constant ();
11385 : 0 : tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
11386 : 0 : gimple_seq seq = NULL;
11387 : 0 : loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
11388 : : build_int_cst (iv_type, factor));
11389 : 0 : if (seq)
11390 : 0 : gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11391 : : }
11392 : : }
11393 : : return loop_len;
11394 : : }
11395 : :
11396 : : /* Scale profiling counters by estimation for LOOP which is vectorized
11397 : : by factor VF.
11398 : : If FLAT is true, the loop we started with had unrealistically flat
11399 : : profile. */
11400 : :
11401 : : static void
11402 : 45294 : scale_profile_for_vect_loop (class loop *loop, edge exit_e, unsigned vf, bool flat)
11403 : : {
11404 : : /* For flat profiles do not scale down proportionally by VF and only
11405 : : cap by known iteration count bounds. */
11406 : 45294 : if (flat)
11407 : : {
11408 : 24400 : if (dump_file && (dump_flags & TDF_DETAILS))
11409 : 4571 : fprintf (dump_file,
11410 : : "Vectorized loop profile seems flat; not scaling iteration "
11411 : : "count down by the vectorization factor %i\n", vf);
11412 : 24400 : scale_loop_profile (loop, profile_probability::always (),
11413 : : get_likely_max_loop_iterations_int (loop));
11414 : 24400 : return;
11415 : : }
11416 : : /* Loop body executes VF fewer times and exit increases VF times. */
11417 : 20894 : profile_count entry_count = loop_preheader_edge (loop)->count ();
11418 : :
11419 : : /* If we have unreliable loop profile avoid dropping entry
11420 : : count bellow header count. This can happen since loops
11421 : : has unrealistically low trip counts. */
11422 : 20894 : while (vf > 1
11423 : 21864 : && loop->header->count > entry_count
11424 : 44626 : && loop->header->count < entry_count * vf)
11425 : : {
11426 : 1868 : if (dump_file && (dump_flags & TDF_DETAILS))
11427 : 145 : fprintf (dump_file,
11428 : : "Vectorization factor %i seems too large for profile "
11429 : : "prevoiusly believed to be consistent; reducing.\n", vf);
11430 : 1868 : vf /= 2;
11431 : : }
11432 : :
11433 : 20894 : if (entry_count.nonzero_p ())
11434 : 20894 : set_edge_probability_and_rescale_others
11435 : 20894 : (exit_e,
11436 : 20894 : entry_count.probability_in (loop->header->count / vf));
11437 : : /* Avoid producing very large exit probability when we do not have
11438 : : sensible profile. */
11439 : 0 : else if (exit_e->probability < profile_probability::always () / (vf * 2))
11440 : 0 : set_edge_probability_and_rescale_others (exit_e, exit_e->probability * vf);
11441 : 20894 : loop->latch->count = single_pred_edge (loop->latch)->count ();
11442 : :
11443 : 20894 : scale_loop_profile (loop, profile_probability::always () / vf,
11444 : : get_likely_max_loop_iterations_int (loop));
11445 : : }
11446 : :
11447 : : /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
11448 : : latch edge values originally defined by it. */
11449 : :
11450 : : static void
11451 : 255205 : maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
11452 : : stmt_vec_info def_stmt_info)
11453 : : {
11454 : 262323 : tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
11455 : 255205 : if (!def || TREE_CODE (def) != SSA_NAME)
11456 : 47613 : return;
11457 : 207592 : stmt_vec_info phi_info;
11458 : 207592 : imm_use_iterator iter;
11459 : 207592 : use_operand_p use_p;
11460 : 529055 : FOR_EACH_IMM_USE_FAST (use_p, iter, def)
11461 : : {
11462 : 321463 : gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p));
11463 : 321463 : if (!phi)
11464 : 281635 : continue;
11465 : 39828 : if (!(gimple_bb (phi)->loop_father->header == gimple_bb (phi)
11466 : 20657 : && (phi_info = loop_vinfo->lookup_stmt (phi))
11467 : 20657 : && STMT_VINFO_RELEVANT_P (phi_info)))
11468 : 19171 : continue;
11469 : 20657 : loop_p loop = gimple_bb (phi)->loop_father;
11470 : 20657 : edge e = loop_latch_edge (loop);
11471 : 20657 : if (PHI_ARG_DEF_FROM_EDGE (phi, e) != def)
11472 : 216 : continue;
11473 : :
11474 : 20441 : if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
11475 : 16978 : && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
11476 : 16386 : && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
11477 : : {
11478 : 16386 : vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11479 : 16386 : vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11480 : 49158 : gcc_assert (phi_defs.length () == latch_defs.length ());
11481 : 65850 : for (unsigned i = 0; i < phi_defs.length (); ++i)
11482 : 33078 : add_phi_arg (as_a <gphi *> (phi_defs[i]),
11483 : 16539 : gimple_get_lhs (latch_defs[i]), e,
11484 : 16539 : gimple_phi_arg_location (phi, e->dest_idx));
11485 : : }
11486 : 4055 : else if (STMT_VINFO_DEF_TYPE (phi_info) == vect_first_order_recurrence)
11487 : : {
11488 : : /* For first order recurrences we have to update both uses of
11489 : : the latch definition, the one in the PHI node and the one
11490 : : in the generated VEC_PERM_EXPR. */
11491 : 22 : vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11492 : 22 : vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11493 : 66 : gcc_assert (phi_defs.length () == latch_defs.length ());
11494 : 22 : tree phidef = gimple_assign_rhs1 (phi_defs[0]);
11495 : 22 : gphi *vphi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
11496 : 120 : for (unsigned i = 0; i < phi_defs.length (); ++i)
11497 : : {
11498 : 38 : gassign *perm = as_a <gassign *> (phi_defs[i]);
11499 : 38 : if (i > 0)
11500 : 16 : gimple_assign_set_rhs1 (perm, gimple_get_lhs (latch_defs[i-1]));
11501 : 38 : gimple_assign_set_rhs2 (perm, gimple_get_lhs (latch_defs[i]));
11502 : 38 : update_stmt (perm);
11503 : : }
11504 : 22 : add_phi_arg (vphi, gimple_get_lhs (latch_defs.last ()), e,
11505 : 22 : gimple_phi_arg_location (phi, e->dest_idx));
11506 : : }
11507 : : }
11508 : : }
11509 : :
11510 : : /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
11511 : : When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
11512 : : stmt_vec_info. */
11513 : :
11514 : : static bool
11515 : 552546 : vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
11516 : : gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
11517 : : {
11518 : 552546 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11519 : 552546 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11520 : :
11521 : 552546 : if (dump_enabled_p ())
11522 : 122131 : dump_printf_loc (MSG_NOTE, vect_location,
11523 : : "------>vectorizing statement: %G", stmt_info->stmt);
11524 : :
11525 : 552546 : if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11526 : 148997 : vect_loop_kill_debug_uses (loop, stmt_info);
11527 : :
11528 : 552546 : if (!STMT_VINFO_RELEVANT_P (stmt_info)
11529 : 282943 : && !STMT_VINFO_LIVE_P (stmt_info))
11530 : : {
11531 : 282935 : if (is_gimple_call (stmt_info->stmt)
11532 : 282935 : && gimple_call_internal_p (stmt_info->stmt, IFN_MASK_CALL))
11533 : : {
11534 : 3 : gcc_assert (!gimple_call_lhs (stmt_info->stmt));
11535 : 3 : *seen_store = stmt_info;
11536 : 3 : return false;
11537 : : }
11538 : : return false;
11539 : : }
11540 : :
11541 : 269611 : if (STMT_VINFO_VECTYPE (stmt_info))
11542 : : {
11543 : 269602 : poly_uint64 nunits
11544 : 269602 : = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
11545 : 269602 : if (!STMT_SLP_TYPE (stmt_info)
11546 : 234540 : && maybe_ne (nunits, vf)
11547 : 295717 : && dump_enabled_p ())
11548 : : /* For SLP VF is set according to unrolling factor, and not
11549 : : to vector size, hence for SLP this print is not valid. */
11550 : 8709 : dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11551 : : }
11552 : :
11553 : : /* Pure SLP statements have already been vectorized. We still need
11554 : : to apply loop vectorization to hybrid SLP statements. */
11555 : 269611 : if (PURE_SLP_STMT (stmt_info))
11556 : : return false;
11557 : :
11558 : 235004 : if (dump_enabled_p ())
11559 : 48488 : dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
11560 : :
11561 : 235004 : if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
11562 : 44441 : *seen_store = stmt_info;
11563 : :
11564 : : return true;
11565 : : }
11566 : :
11567 : : /* Helper function to pass to simplify_replace_tree to enable replacing tree's
11568 : : in the hash_map with its corresponding values. */
11569 : :
11570 : : static tree
11571 : 3598 : find_in_mapping (tree t, void *context)
11572 : : {
11573 : 3598 : hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
11574 : :
11575 : 3598 : tree *value = mapping->get (t);
11576 : 3598 : return value ? *value : t;
11577 : : }
11578 : :
11579 : : /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
11580 : : original loop that has now been vectorized.
11581 : :
11582 : : The inits of the data_references need to be advanced with the number of
11583 : : iterations of the main loop. This has been computed in vect_do_peeling and
11584 : : is stored in parameter ADVANCE. We first restore the data_references
11585 : : initial offset with the values recored in ORIG_DRS_INIT.
11586 : :
11587 : : Since the loop_vec_info of this EPILOGUE was constructed for the original
11588 : : loop, its stmt_vec_infos all point to the original statements. These need
11589 : : to be updated to point to their corresponding copies as well as the SSA_NAMES
11590 : : in their PATTERN_DEF_SEQs and RELATED_STMTs.
11591 : :
11592 : : The data_reference's connections also need to be updated. Their
11593 : : corresponding dr_vec_info need to be reconnected to the EPILOGUE's
11594 : : stmt_vec_infos, their statements need to point to their corresponding copy,
11595 : : if they are gather loads or scatter stores then their reference needs to be
11596 : : updated to point to its corresponding copy. */
11597 : :
11598 : : static void
11599 : 6279 : update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
11600 : : {
11601 : 6279 : loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
11602 : 6279 : auto_vec<gimple *> stmt_worklist;
11603 : 6279 : hash_map<tree,tree> mapping;
11604 : 6279 : gimple *orig_stmt, *new_stmt;
11605 : 6279 : gimple_stmt_iterator epilogue_gsi;
11606 : 6279 : gphi_iterator epilogue_phi_gsi;
11607 : 6279 : stmt_vec_info stmt_vinfo = NULL, related_vinfo;
11608 : 6279 : basic_block *epilogue_bbs = get_loop_body (epilogue);
11609 : 6279 : unsigned i;
11610 : :
11611 : 6279 : free (LOOP_VINFO_BBS (epilogue_vinfo));
11612 : 6279 : LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
11613 : :
11614 : : /* Advance data_reference's with the number of iterations of the previous
11615 : : loop and its prologue. */
11616 : 6279 : vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
11617 : :
11618 : :
11619 : : /* The EPILOGUE loop is a copy of the original loop so they share the same
11620 : : gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
11621 : : point to the copied statements. We also create a mapping of all LHS' in
11622 : : the original loop and all the LHS' in the EPILOGUE and create worklists to
11623 : : update teh STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
11624 : 18837 : for (unsigned i = 0; i < epilogue->num_nodes; ++i)
11625 : : {
11626 : 12558 : for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
11627 : 32605 : !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
11628 : : {
11629 : 20047 : new_stmt = epilogue_phi_gsi.phi ();
11630 : :
11631 : 20047 : gcc_assert (gimple_uid (new_stmt) > 0);
11632 : 20047 : stmt_vinfo
11633 : 20047 : = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11634 : :
11635 : 20047 : orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11636 : 20047 : STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11637 : :
11638 : 40094 : mapping.put (gimple_phi_result (orig_stmt),
11639 : 20047 : gimple_phi_result (new_stmt));
11640 : : /* PHI nodes can not have patterns or related statements. */
11641 : 20047 : gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
11642 : : && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
11643 : : }
11644 : :
11645 : 25116 : for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
11646 : 126866 : !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
11647 : : {
11648 : 114308 : new_stmt = gsi_stmt (epilogue_gsi);
11649 : 114308 : if (is_gimple_debug (new_stmt))
11650 : 20784 : continue;
11651 : :
11652 : 93524 : gcc_assert (gimple_uid (new_stmt) > 0);
11653 : 93524 : stmt_vinfo
11654 : 93524 : = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11655 : :
11656 : 93524 : orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11657 : 93524 : STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11658 : :
11659 : 93524 : if (tree old_lhs = gimple_get_lhs (orig_stmt))
11660 : 87176 : mapping.put (old_lhs, gimple_get_lhs (new_stmt));
11661 : :
11662 : 93524 : if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
11663 : : {
11664 : : gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
11665 : : for (gimple_stmt_iterator gsi = gsi_start (seq);
11666 : 3835 : !gsi_end_p (gsi); gsi_next (&gsi))
11667 : 2434 : stmt_worklist.safe_push (gsi_stmt (gsi));
11668 : : }
11669 : :
11670 : 93524 : related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
11671 : 93524 : if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
11672 : : {
11673 : 1752 : gimple *stmt = STMT_VINFO_STMT (related_vinfo);
11674 : 1752 : stmt_worklist.safe_push (stmt);
11675 : : /* Set BB such that the assert in
11676 : : 'get_initial_def_for_reduction' is able to determine that
11677 : : the BB of the related stmt is inside this loop. */
11678 : 1752 : gimple_set_bb (stmt,
11679 : : gimple_bb (new_stmt));
11680 : 1752 : related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
11681 : 1752 : gcc_assert (related_vinfo == NULL
11682 : : || related_vinfo == stmt_vinfo);
11683 : : }
11684 : : }
11685 : : }
11686 : :
11687 : : /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
11688 : : using the original main loop and thus need to be updated to refer to the
11689 : : cloned variables used in the epilogue. */
11690 : 15584 : for (unsigned i = 0; i < stmt_worklist.length (); ++i)
11691 : : {
11692 : 4186 : gimple *stmt = stmt_worklist[i];
11693 : 4186 : tree *new_op;
11694 : :
11695 : 10759 : for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
11696 : : {
11697 : 6573 : tree op = gimple_op (stmt, j);
11698 : 6573 : if ((new_op = mapping.get(op)))
11699 : 1495 : gimple_set_op (stmt, j, *new_op);
11700 : : else
11701 : : {
11702 : : /* PR92429: The last argument of simplify_replace_tree disables
11703 : : folding when replacing arguments. This is required as
11704 : : otherwise you might end up with different statements than the
11705 : : ones analyzed in vect_loop_analyze, leading to different
11706 : : vectorization. */
11707 : 5078 : op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
11708 : : &find_in_mapping, &mapping, false);
11709 : 5078 : gimple_set_op (stmt, j, op);
11710 : : }
11711 : : }
11712 : : }
11713 : :
11714 : 6279 : struct data_reference *dr;
11715 : 6279 : vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
11716 : 25985 : FOR_EACH_VEC_ELT (datarefs, i, dr)
11717 : : {
11718 : 19706 : orig_stmt = DR_STMT (dr);
11719 : 19706 : gcc_assert (gimple_uid (orig_stmt) > 0);
11720 : 19706 : stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
11721 : : /* Data references for gather loads and scatter stores do not use the
11722 : : updated offset we set using ADVANCE. Instead we have to make sure the
11723 : : reference in the data references point to the corresponding copy of
11724 : : the original in the epilogue. Make sure to update both
11725 : : gather/scatters recognized by dataref analysis and also other
11726 : : refs that get_load_store_type classified as VMAT_GATHER_SCATTER. */
11727 : 19706 : auto vstmt_vinfo = vect_stmt_to_vectorize (stmt_vinfo);
11728 : 19706 : if (STMT_VINFO_MEMORY_ACCESS_TYPE (vstmt_vinfo) == VMAT_GATHER_SCATTER
11729 : 19297 : || STMT_VINFO_GATHER_SCATTER_P (vstmt_vinfo))
11730 : : {
11731 : 421 : DR_REF (dr)
11732 : 421 : = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
11733 : : &find_in_mapping, &mapping);
11734 : 421 : DR_BASE_ADDRESS (dr)
11735 : 421 : = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
11736 : : &find_in_mapping, &mapping);
11737 : : }
11738 : 19706 : DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
11739 : 19706 : stmt_vinfo->dr_aux.stmt = stmt_vinfo;
11740 : : }
11741 : :
11742 : 6279 : epilogue_vinfo->shared->datarefs_copy.release ();
11743 : 6279 : epilogue_vinfo->shared->save_datarefs ();
11744 : 6279 : }
11745 : :
11746 : : /* When vectorizing early break statements instructions that happen before
11747 : : the early break in the current BB need to be moved to after the early
11748 : : break. This function deals with that and assumes that any validity
11749 : : checks has already been performed.
11750 : :
11751 : : While moving the instructions if it encounters a VUSE or VDEF it then
11752 : : corrects the VUSES as it moves the statements along. GDEST is the location
11753 : : in which to insert the new statements. */
11754 : :
11755 : : static void
11756 : 1116 : move_early_exit_stmts (loop_vec_info loop_vinfo)
11757 : : {
11758 : 1116 : DUMP_VECT_SCOPE ("move_early_exit_stmts");
11759 : :
11760 : 1116 : if (LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).is_empty ())
11761 : 890 : return;
11762 : :
11763 : : /* Move all stmts that need moving. */
11764 : 226 : basic_block dest_bb = LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo);
11765 : 226 : gimple_stmt_iterator dest_gsi = gsi_after_labels (dest_bb);
11766 : :
11767 : 226 : tree last_seen_vuse = NULL_TREE;
11768 : 564 : for (gimple *stmt : LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo))
11769 : : {
11770 : : /* We have to update crossed degenerate virtual PHIs. Simply
11771 : : elide them. */
11772 : 338 : if (gphi *vphi = dyn_cast <gphi *> (stmt))
11773 : : {
11774 : 4 : tree vdef = gimple_phi_result (vphi);
11775 : 4 : tree vuse = gimple_phi_arg_def (vphi, 0);
11776 : 4 : imm_use_iterator iter;
11777 : 4 : use_operand_p use_p;
11778 : 4 : gimple *use_stmt;
11779 : 12 : FOR_EACH_IMM_USE_STMT (use_stmt, iter, vdef)
11780 : : {
11781 : 24 : FOR_EACH_IMM_USE_ON_STMT (use_p, iter)
11782 : 8 : SET_USE (use_p, vuse);
11783 : 4 : }
11784 : 4 : auto gsi = gsi_for_stmt (stmt);
11785 : 4 : remove_phi_node (&gsi, true);
11786 : 4 : last_seen_vuse = vuse;
11787 : 4 : continue;
11788 : 4 : }
11789 : :
11790 : : /* Check to see if statement is still required for vect or has been
11791 : : elided. */
11792 : 334 : auto stmt_info = loop_vinfo->lookup_stmt (stmt);
11793 : 334 : if (!stmt_info)
11794 : 0 : continue;
11795 : :
11796 : 334 : if (dump_enabled_p ())
11797 : 159 : dump_printf_loc (MSG_NOTE, vect_location, "moving stmt %G", stmt);
11798 : :
11799 : 334 : gimple_stmt_iterator stmt_gsi = gsi_for_stmt (stmt);
11800 : 334 : gsi_move_before (&stmt_gsi, &dest_gsi, GSI_NEW_STMT);
11801 : 668 : last_seen_vuse = gimple_vuse (stmt);
11802 : : }
11803 : :
11804 : : /* Update all the stmts with their new reaching VUSES. */
11805 : 659 : for (auto p : LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo))
11806 : : {
11807 : 191 : if (dump_enabled_p ())
11808 : 155 : dump_printf_loc (MSG_NOTE, vect_location,
11809 : : "updating vuse to %T for load %G",
11810 : : last_seen_vuse, p);
11811 : 191 : gimple_set_vuse (p, last_seen_vuse);
11812 : 191 : update_stmt (p);
11813 : : }
11814 : :
11815 : : /* And update the LC PHIs on exits. */
11816 : 1142 : for (edge e : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
11817 : 464 : if (!dominated_by_p (CDI_DOMINATORS, e->src, dest_bb))
11818 : 240 : if (gphi *phi = get_virtual_phi (e->dest))
11819 : 466 : SET_PHI_ARG_DEF_ON_EDGE (phi, e, last_seen_vuse);
11820 : : }
11821 : :
11822 : : /* Function vect_transform_loop.
11823 : :
11824 : : The analysis phase has determined that the loop is vectorizable.
11825 : : Vectorize the loop - created vectorized stmts to replace the scalar
11826 : : stmts in the loop, and update the loop exit condition.
11827 : : Returns scalar epilogue loop if any. */
11828 : :
11829 : : class loop *
11830 : 45294 : vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
11831 : : {
11832 : 45294 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11833 : 45294 : class loop *epilogue = NULL;
11834 : 45294 : basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
11835 : 45294 : int nbbs = loop->num_nodes;
11836 : 45294 : int i;
11837 : 45294 : tree niters_vector = NULL_TREE;
11838 : 45294 : tree step_vector = NULL_TREE;
11839 : 45294 : tree niters_vector_mult_vf = NULL_TREE;
11840 : 45294 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11841 : 45294 : unsigned int lowest_vf = constant_lower_bound (vf);
11842 : 45294 : gimple *stmt;
11843 : 45294 : bool check_profitability = false;
11844 : 45294 : unsigned int th;
11845 : 45294 : bool flat = maybe_flat_loop_profile (loop);
11846 : :
11847 : 45294 : DUMP_VECT_SCOPE ("vec_transform_loop");
11848 : :
11849 : 45294 : loop_vinfo->shared->check_datarefs ();
11850 : :
11851 : : /* Use the more conservative vectorization threshold. If the number
11852 : : of iterations is constant assume the cost check has been performed
11853 : : by our caller. If the threshold makes all loops profitable that
11854 : : run at least the (estimated) vectorization factor number of times
11855 : : checking is pointless, too. */
11856 : 45294 : th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
11857 : 45294 : if (vect_apply_runtime_profitability_check_p (loop_vinfo))
11858 : : {
11859 : 9835 : if (dump_enabled_p ())
11860 : 101 : dump_printf_loc (MSG_NOTE, vect_location,
11861 : : "Profitability threshold is %d loop iterations.\n",
11862 : : th);
11863 : : check_profitability = true;
11864 : : }
11865 : :
11866 : : /* Make sure there exists a single-predecessor exit bb. Do this before
11867 : : versioning. */
11868 : 45294 : edge e = LOOP_VINFO_IV_EXIT (loop_vinfo);
11869 : 90588 : if (! single_pred_p (e->dest) && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11870 : : {
11871 : 9554 : split_loop_exit_edge (e, true);
11872 : 9554 : if (dump_enabled_p ())
11873 : 1674 : dump_printf (MSG_NOTE, "split exit edge\n");
11874 : : }
11875 : :
11876 : : /* Version the loop first, if required, so the profitability check
11877 : : comes first. */
11878 : :
11879 : 45294 : if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
11880 : : {
11881 : 3103 : class loop *sloop
11882 : 3103 : = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
11883 : 3103 : sloop->force_vectorize = false;
11884 : 3103 : check_profitability = false;
11885 : : }
11886 : :
11887 : : /* Make sure there exists a single-predecessor exit bb also on the
11888 : : scalar loop copy. Do this after versioning but before peeling
11889 : : so CFG structure is fine for both scalar and if-converted loop
11890 : : to make slpeel_duplicate_current_defs_from_edges face matched
11891 : : loop closed PHI nodes on the exit. */
11892 : 45294 : if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
11893 : : {
11894 : 4290 : e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
11895 : 8580 : if (! single_pred_p (e->dest))
11896 : : {
11897 : 4077 : split_loop_exit_edge (e, true);
11898 : 4077 : if (dump_enabled_p ())
11899 : 889 : dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
11900 : : }
11901 : : }
11902 : :
11903 : 45294 : tree niters = vect_build_loop_niters (loop_vinfo);
11904 : 45294 : LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
11905 : 45294 : tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
11906 : 45294 : bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
11907 : 45294 : tree advance;
11908 : 45294 : drs_init_vec orig_drs_init;
11909 : :
11910 : 45294 : epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
11911 : : &step_vector, &niters_vector_mult_vf, th,
11912 : : check_profitability, niters_no_overflow,
11913 : : &advance);
11914 : 45294 : if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
11915 : 45294 : && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
11916 : : {
11917 : : /* Ifcvt duplicates loop preheader, loop body and produces an basic
11918 : : block after loop exit. We need to scale all that. */
11919 : 62 : basic_block preheader
11920 : 62 : = loop_preheader_edge (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->src;
11921 : 62 : preheader->count
11922 : : = preheader->count.apply_probability
11923 : 62 : (LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11924 : 62 : scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
11925 : : LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11926 : 62 : LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo)->dest->count = preheader->count;
11927 : : }
11928 : :
11929 : 45294 : if (niters_vector == NULL_TREE)
11930 : : {
11931 : 24590 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
11932 : 24590 : && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11933 : 49954 : && known_eq (lowest_vf, vf))
11934 : : {
11935 : 24587 : niters_vector
11936 : 24587 : = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
11937 : 24587 : LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
11938 : 24587 : step_vector = build_one_cst (TREE_TYPE (niters));
11939 : : }
11940 : 780 : else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11941 : 1 : vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
11942 : : &step_vector, niters_no_overflow);
11943 : : else
11944 : : /* vect_do_peeling subtracted the number of peeled prologue
11945 : : iterations from LOOP_VINFO_NITERS. */
11946 : 779 : vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
11947 : : &niters_vector, &step_vector,
11948 : : niters_no_overflow);
11949 : : }
11950 : :
11951 : : /* 1) Make sure the loop header has exactly two entries
11952 : : 2) Make sure we have a preheader basic block. */
11953 : :
11954 : 45294 : gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
11955 : :
11956 : 45294 : split_edge (loop_preheader_edge (loop));
11957 : :
11958 : 45294 : if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11959 : : /* This will deal with any possible peeling. */
11960 : 1 : vect_prepare_for_masked_peels (loop_vinfo);
11961 : :
11962 : : /* Handle any code motion that we need to for early-break vectorization after
11963 : : we've done peeling but just before we start vectorizing. */
11964 : 45294 : if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11965 : 1116 : move_early_exit_stmts (loop_vinfo);
11966 : :
11967 : : /* Schedule the SLP instances first, then handle loop vectorization
11968 : : below. */
11969 : 45294 : if (!loop_vinfo->slp_instances.is_empty ())
11970 : : {
11971 : 5066 : DUMP_VECT_SCOPE ("scheduling SLP instances");
11972 : 5066 : vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
11973 : : }
11974 : :
11975 : : /* FORNOW: the vectorizer supports only loops which body consist
11976 : : of one basic block (header + empty latch). When the vectorizer will
11977 : : support more involved loop forms, the order by which the BBs are
11978 : : traversed need to be reconsidered. */
11979 : :
11980 : 138435 : for (i = 0; i < nbbs; i++)
11981 : : {
11982 : 93141 : basic_block bb = bbs[i];
11983 : 93141 : stmt_vec_info stmt_info;
11984 : :
11985 : 278586 : for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
11986 : 185445 : gsi_next (&si))
11987 : : {
11988 : 185445 : gphi *phi = si.phi ();
11989 : 185445 : if (dump_enabled_p ())
11990 : 38324 : dump_printf_loc (MSG_NOTE, vect_location,
11991 : : "------>vectorizing phi: %G", (gimple *) phi);
11992 : 185445 : stmt_info = loop_vinfo->lookup_stmt (phi);
11993 : 185445 : if (!stmt_info)
11994 : 43791 : continue;
11995 : :
11996 : 141654 : if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11997 : 33386 : vect_loop_kill_debug_uses (loop, stmt_info);
11998 : :
11999 : 141654 : if (!STMT_VINFO_RELEVANT_P (stmt_info)
12000 : 109427 : && !STMT_VINFO_LIVE_P (stmt_info))
12001 : 109427 : continue;
12002 : :
12003 : 32227 : if (STMT_VINFO_VECTYPE (stmt_info)
12004 : 32227 : && (maybe_ne
12005 : 36341 : (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
12006 : 37769 : && dump_enabled_p ())
12007 : 1428 : dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
12008 : :
12009 : 32227 : if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
12010 : : || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
12011 : : || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
12012 : : || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
12013 : : || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence
12014 : 32227 : || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
12015 : 32227 : && ! PURE_SLP_STMT (stmt_info))
12016 : : {
12017 : 30575 : if (dump_enabled_p ())
12018 : 4619 : dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
12019 : 30575 : vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
12020 : : }
12021 : : }
12022 : :
12023 : 278586 : for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
12024 : 185445 : gsi_next (&si))
12025 : : {
12026 : 185445 : gphi *phi = si.phi ();
12027 : 185445 : stmt_info = loop_vinfo->lookup_stmt (phi);
12028 : 185445 : if (!stmt_info)
12029 : 43791 : continue;
12030 : :
12031 : 141654 : if (!STMT_VINFO_RELEVANT_P (stmt_info)
12032 : 109427 : && !STMT_VINFO_LIVE_P (stmt_info))
12033 : 109427 : continue;
12034 : :
12035 : 32227 : if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
12036 : : || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
12037 : : || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
12038 : : || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
12039 : : || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
12040 : 32227 : || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
12041 : 32227 : && ! PURE_SLP_STMT (stmt_info))
12042 : 30575 : maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
12043 : : }
12044 : :
12045 : 186282 : for (gimple_stmt_iterator si = gsi_start_bb (bb);
12046 : 887609 : !gsi_end_p (si);)
12047 : : {
12048 : 794468 : stmt = gsi_stmt (si);
12049 : : /* During vectorization remove existing clobber stmts. */
12050 : 794468 : if (gimple_clobber_p (stmt))
12051 : : {
12052 : 56 : unlink_stmt_vdef (stmt);
12053 : 56 : gsi_remove (&si, true);
12054 : 56 : release_defs (stmt);
12055 : : }
12056 : : else
12057 : : {
12058 : : /* Ignore vector stmts created in the outer loop. */
12059 : 794412 : stmt_info = loop_vinfo->lookup_stmt (stmt);
12060 : :
12061 : : /* vector stmts created in the outer-loop during vectorization of
12062 : : stmts in an inner-loop may not have a stmt_info, and do not
12063 : : need to be vectorized. */
12064 : 794412 : stmt_vec_info seen_store = NULL;
12065 : 794412 : if (stmt_info)
12066 : : {
12067 : 529900 : if (STMT_VINFO_IN_PATTERN_P (stmt_info))
12068 : : {
12069 : 20901 : gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
12070 : 20901 : for (gimple_stmt_iterator subsi = gsi_start (def_seq);
12071 : 43547 : !gsi_end_p (subsi); gsi_next (&subsi))
12072 : : {
12073 : 22646 : stmt_vec_info pat_stmt_info
12074 : 22646 : = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
12075 : 22646 : vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
12076 : : &si, &seen_store);
12077 : : }
12078 : 20901 : stmt_vec_info pat_stmt_info
12079 : : = STMT_VINFO_RELATED_STMT (stmt_info);
12080 : 20901 : if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
12081 : : &si, &seen_store))
12082 : 7118 : maybe_set_vectorized_backedge_value (loop_vinfo,
12083 : : pat_stmt_info);
12084 : : }
12085 : : else
12086 : : {
12087 : 508999 : if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
12088 : : &seen_store))
12089 : 217512 : maybe_set_vectorized_backedge_value (loop_vinfo,
12090 : : stmt_info);
12091 : : }
12092 : : }
12093 : 794412 : gsi_next (&si);
12094 : 794412 : if (seen_store)
12095 : : {
12096 : 44444 : if (STMT_VINFO_GROUPED_ACCESS (seen_store))
12097 : : /* Interleaving. If IS_STORE is TRUE, the
12098 : : vectorization of the interleaving chain was
12099 : : completed - free all the stores in the chain. */
12100 : 1224 : vect_remove_stores (loop_vinfo,
12101 : : DR_GROUP_FIRST_ELEMENT (seen_store));
12102 : : else
12103 : : /* Free the attached stmt_vec_info and remove the stmt. */
12104 : 43220 : loop_vinfo->remove_stmt (stmt_info);
12105 : : }
12106 : : }
12107 : : }
12108 : :
12109 : : /* Stub out scalar statements that must not survive vectorization.
12110 : : Doing this here helps with grouped statements, or statements that
12111 : : are involved in patterns. */
12112 : 186282 : for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
12113 : 1248335 : !gsi_end_p (gsi); gsi_next (&gsi))
12114 : : {
12115 : 1155194 : gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
12116 : 6710 : if (!call || !gimple_call_internal_p (call))
12117 : 1149590 : continue;
12118 : 5604 : internal_fn ifn = gimple_call_internal_fn (call);
12119 : 5604 : if (ifn == IFN_MASK_LOAD)
12120 : : {
12121 : 1148 : tree lhs = gimple_get_lhs (call);
12122 : 1148 : if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12123 : : {
12124 : 597 : tree zero = build_zero_cst (TREE_TYPE (lhs));
12125 : 597 : gimple *new_stmt = gimple_build_assign (lhs, zero);
12126 : 597 : gsi_replace (&gsi, new_stmt, true);
12127 : : }
12128 : : }
12129 : 4456 : else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
12130 : : {
12131 : 1674 : tree lhs = gimple_get_lhs (call);
12132 : 1674 : if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12133 : : {
12134 : 234 : tree else_arg
12135 : 234 : = gimple_call_arg (call, gimple_call_num_args (call) - 1);
12136 : 234 : gimple *new_stmt = gimple_build_assign (lhs, else_arg);
12137 : 234 : gsi_replace (&gsi, new_stmt, true);
12138 : : }
12139 : : }
12140 : : }
12141 : : } /* BBs in loop */
12142 : :
12143 : : /* The vectorization factor is always > 1, so if we use an IV increment of 1.
12144 : : a zero NITERS becomes a nonzero NITERS_VECTOR. */
12145 : 45294 : if (integer_onep (step_vector))
12146 : 45289 : niters_no_overflow = true;
12147 : 45294 : vect_set_loop_condition (loop, LOOP_VINFO_IV_EXIT (loop_vinfo), loop_vinfo,
12148 : : niters_vector, step_vector, niters_vector_mult_vf,
12149 : 45294 : !niters_no_overflow);
12150 : :
12151 : 45294 : unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
12152 : :
12153 : : /* True if the final iteration might not handle a full vector's
12154 : : worth of scalar iterations. */
12155 : 90588 : bool final_iter_may_be_partial
12156 : 45294 : = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
12157 : 45294 : || LOOP_VINFO_EARLY_BREAKS (loop_vinfo);
12158 : :
12159 : : /* +1 to convert latch counts to loop iteration counts. */
12160 : 45294 : int bias_for_lowest = 1;
12161 : :
12162 : : /* When we are peeling for gaps then we take away one scalar iteration
12163 : : from the vector loop. Thus we can adjust the upper bound by one
12164 : : scalar iteration. But only when we know the bound applies to the
12165 : : IV exit test which might not be true when we have multiple exits. */
12166 : 45294 : if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
12167 : 87880 : bias_for_lowest -= LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
12168 : :
12169 : 45294 : int bias_for_assumed = bias_for_lowest;
12170 : 45294 : int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
12171 : 45294 : if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
12172 : : {
12173 : : /* When the amount of peeling is known at compile time, the first
12174 : : iteration will have exactly alignment_npeels active elements.
12175 : : In the worst case it will have at least one. */
12176 : 1 : int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
12177 : 1 : bias_for_lowest += lowest_vf - min_first_active;
12178 : 1 : bias_for_assumed += assumed_vf - min_first_active;
12179 : : }
12180 : : /* In these calculations the "- 1" converts loop iteration counts
12181 : : back to latch counts. */
12182 : 45294 : if (loop->any_upper_bound)
12183 : : {
12184 : 45294 : loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
12185 : 45294 : loop->nb_iterations_upper_bound
12186 : 45294 : = (final_iter_may_be_partial
12187 : 46415 : ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
12188 : 2242 : lowest_vf) - 1
12189 : 44173 : : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
12190 : 88346 : lowest_vf) - 1);
12191 : 45294 : if (main_vinfo
12192 : : /* Both peeling for alignment and peeling for gaps can end up
12193 : : with the scalar epilogue running for more than VF-1 iterations. */
12194 : 6279 : && !main_vinfo->peeling_for_alignment
12195 : 6224 : && !main_vinfo->peeling_for_gaps)
12196 : : {
12197 : 6078 : unsigned int bound;
12198 : 6078 : poly_uint64 main_iters
12199 : 6078 : = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
12200 : 6078 : LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
12201 : 6078 : main_iters
12202 : 6078 : = upper_bound (main_iters,
12203 : 6078 : LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
12204 : 12156 : if (can_div_away_from_zero_p (main_iters,
12205 : 6078 : LOOP_VINFO_VECT_FACTOR (loop_vinfo),
12206 : : &bound))
12207 : 6078 : loop->nb_iterations_upper_bound
12208 : 6078 : = wi::umin ((bound_wide_int) (bound - 1),
12209 : 6078 : loop->nb_iterations_upper_bound);
12210 : : }
12211 : : }
12212 : 45294 : if (loop->any_likely_upper_bound)
12213 : 45294 : loop->nb_iterations_likely_upper_bound
12214 : 45294 : = (final_iter_may_be_partial
12215 : 46415 : ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
12216 : 1121 : + bias_for_lowest, lowest_vf) - 1
12217 : 44173 : : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
12218 : 45294 : + bias_for_lowest, lowest_vf) - 1);
12219 : 45294 : if (loop->any_estimate)
12220 : 28592 : loop->nb_iterations_estimate
12221 : 28592 : = (final_iter_may_be_partial
12222 : 29289 : ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
12223 : 1394 : assumed_vf) - 1
12224 : 27895 : : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
12225 : 56487 : assumed_vf) - 1);
12226 : 45294 : scale_profile_for_vect_loop (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
12227 : : assumed_vf, flat);
12228 : :
12229 : 45294 : if (dump_enabled_p ())
12230 : : {
12231 : 9645 : if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
12232 : : {
12233 : 8384 : dump_printf_loc (MSG_NOTE, vect_location,
12234 : : "LOOP VECTORIZED\n");
12235 : 8384 : if (loop->inner)
12236 : 264 : dump_printf_loc (MSG_NOTE, vect_location,
12237 : : "OUTER LOOP VECTORIZED\n");
12238 : 8384 : dump_printf (MSG_NOTE, "\n");
12239 : : }
12240 : : else
12241 : 1261 : dump_printf_loc (MSG_NOTE, vect_location,
12242 : : "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
12243 : 1261 : GET_MODE_NAME (loop_vinfo->vector_mode));
12244 : : }
12245 : :
12246 : : /* Loops vectorized with a variable factor won't benefit from
12247 : : unrolling/peeling. */
12248 : : if (!vf.is_constant ())
12249 : : {
12250 : : loop->unroll = 1;
12251 : : if (dump_enabled_p ())
12252 : : dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
12253 : : " variable-length vectorization factor\n");
12254 : : }
12255 : : /* Free SLP instances here because otherwise stmt reference counting
12256 : : won't work. */
12257 : : slp_instance instance;
12258 : 51020 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
12259 : 5726 : vect_free_slp_instance (instance);
12260 : 45294 : LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
12261 : : /* Clear-up safelen field since its value is invalid after vectorization
12262 : : since vectorized loop can have loop-carried dependencies. */
12263 : 45294 : loop->safelen = 0;
12264 : :
12265 : 45294 : if (epilogue)
12266 : : {
12267 : 6279 : update_epilogue_loop_vinfo (epilogue, advance);
12268 : :
12269 : 6279 : epilogue->simduid = loop->simduid;
12270 : 6279 : epilogue->force_vectorize = loop->force_vectorize;
12271 : 6279 : epilogue->dont_vectorize = false;
12272 : : }
12273 : :
12274 : 45294 : return epilogue;
12275 : 45294 : }
12276 : :
12277 : : /* The code below is trying to perform simple optimization - revert
12278 : : if-conversion for masked stores, i.e. if the mask of a store is zero
12279 : : do not perform it and all stored value producers also if possible.
12280 : : For example,
12281 : : for (i=0; i<n; i++)
12282 : : if (c[i])
12283 : : {
12284 : : p1[i] += 1;
12285 : : p2[i] = p3[i] +2;
12286 : : }
12287 : : this transformation will produce the following semi-hammock:
12288 : :
12289 : : if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
12290 : : {
12291 : : vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
12292 : : vect__12.22_172 = vect__11.19_170 + vect_cst__171;
12293 : : MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
12294 : : vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
12295 : : vect__19.28_184 = vect__18.25_182 + vect_cst__183;
12296 : : MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
12297 : : }
12298 : : */
12299 : :
12300 : : void
12301 : 743 : optimize_mask_stores (class loop *loop)
12302 : : {
12303 : 743 : basic_block *bbs = get_loop_body (loop);
12304 : 743 : unsigned nbbs = loop->num_nodes;
12305 : 743 : unsigned i;
12306 : 743 : basic_block bb;
12307 : 743 : class loop *bb_loop;
12308 : 743 : gimple_stmt_iterator gsi;
12309 : 743 : gimple *stmt;
12310 : 743 : auto_vec<gimple *> worklist;
12311 : 743 : auto_purge_vect_location sentinel;
12312 : :
12313 : 743 : vect_location = find_loop_location (loop);
12314 : : /* Pick up all masked stores in loop if any. */
12315 : 2229 : for (i = 0; i < nbbs; i++)
12316 : : {
12317 : 1486 : bb = bbs[i];
12318 : 27918 : for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
12319 : 24946 : gsi_next (&gsi))
12320 : : {
12321 : 24946 : stmt = gsi_stmt (gsi);
12322 : 24946 : if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
12323 : 1134 : worklist.safe_push (stmt);
12324 : : }
12325 : : }
12326 : :
12327 : 743 : free (bbs);
12328 : 743 : if (worklist.is_empty ())
12329 : 72 : return;
12330 : :
12331 : : /* Loop has masked stores. */
12332 : 1787 : while (!worklist.is_empty ())
12333 : : {
12334 : 1116 : gimple *last, *last_store;
12335 : 1116 : edge e, efalse;
12336 : 1116 : tree mask;
12337 : 1116 : basic_block store_bb, join_bb;
12338 : 1116 : gimple_stmt_iterator gsi_to;
12339 : 1116 : tree vdef, new_vdef;
12340 : 1116 : gphi *phi;
12341 : 1116 : tree vectype;
12342 : 1116 : tree zero;
12343 : :
12344 : 1116 : last = worklist.pop ();
12345 : 1116 : mask = gimple_call_arg (last, 2);
12346 : 1116 : bb = gimple_bb (last);
12347 : : /* Create then_bb and if-then structure in CFG, then_bb belongs to
12348 : : the same loop as if_bb. It could be different to LOOP when two
12349 : : level loop-nest is vectorized and mask_store belongs to the inner
12350 : : one. */
12351 : 1116 : e = split_block (bb, last);
12352 : 1116 : bb_loop = bb->loop_father;
12353 : 1116 : gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
12354 : 1116 : join_bb = e->dest;
12355 : 1116 : store_bb = create_empty_bb (bb);
12356 : 1116 : add_bb_to_loop (store_bb, bb_loop);
12357 : 1116 : e->flags = EDGE_TRUE_VALUE;
12358 : 1116 : efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
12359 : : /* Put STORE_BB to likely part. */
12360 : 1116 : efalse->probability = profile_probability::likely ();
12361 : 1116 : e->probability = efalse->probability.invert ();
12362 : 1116 : store_bb->count = efalse->count ();
12363 : 1116 : make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
12364 : 1116 : if (dom_info_available_p (CDI_DOMINATORS))
12365 : 1116 : set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
12366 : 1116 : if (dump_enabled_p ())
12367 : 339 : dump_printf_loc (MSG_NOTE, vect_location,
12368 : : "Create new block %d to sink mask stores.",
12369 : : store_bb->index);
12370 : : /* Create vector comparison with boolean result. */
12371 : 1116 : vectype = TREE_TYPE (mask);
12372 : 1116 : zero = build_zero_cst (vectype);
12373 : 1116 : stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
12374 : 1116 : gsi = gsi_last_bb (bb);
12375 : 1116 : gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
12376 : : /* Create new PHI node for vdef of the last masked store:
12377 : : .MEM_2 = VDEF <.MEM_1>
12378 : : will be converted to
12379 : : .MEM.3 = VDEF <.MEM_1>
12380 : : and new PHI node will be created in join bb
12381 : : .MEM_2 = PHI <.MEM_1, .MEM_3>
12382 : : */
12383 : 1116 : vdef = gimple_vdef (last);
12384 : 1116 : new_vdef = make_ssa_name (gimple_vop (cfun), last);
12385 : 1116 : gimple_set_vdef (last, new_vdef);
12386 : 1116 : phi = create_phi_node (vdef, join_bb);
12387 : 1116 : add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
12388 : :
12389 : : /* Put all masked stores with the same mask to STORE_BB if possible. */
12390 : 1152 : while (true)
12391 : : {
12392 : 1134 : gimple_stmt_iterator gsi_from;
12393 : 1134 : gimple *stmt1 = NULL;
12394 : :
12395 : : /* Move masked store to STORE_BB. */
12396 : 1134 : last_store = last;
12397 : 1134 : gsi = gsi_for_stmt (last);
12398 : 1134 : gsi_from = gsi;
12399 : : /* Shift GSI to the previous stmt for further traversal. */
12400 : 1134 : gsi_prev (&gsi);
12401 : 1134 : gsi_to = gsi_start_bb (store_bb);
12402 : 1134 : gsi_move_before (&gsi_from, &gsi_to);
12403 : : /* Setup GSI_TO to the non-empty block start. */
12404 : 1134 : gsi_to = gsi_start_bb (store_bb);
12405 : 1134 : if (dump_enabled_p ())
12406 : 355 : dump_printf_loc (MSG_NOTE, vect_location,
12407 : : "Move stmt to created bb\n%G", last);
12408 : : /* Move all stored value producers if possible. */
12409 : 6228 : while (!gsi_end_p (gsi))
12410 : : {
12411 : 6226 : tree lhs;
12412 : 6226 : imm_use_iterator imm_iter;
12413 : 6226 : use_operand_p use_p;
12414 : 6226 : bool res;
12415 : :
12416 : : /* Skip debug statements. */
12417 : 6226 : if (is_gimple_debug (gsi_stmt (gsi)))
12418 : : {
12419 : 1 : gsi_prev (&gsi);
12420 : 3633 : continue;
12421 : : }
12422 : 6225 : stmt1 = gsi_stmt (gsi);
12423 : : /* Do not consider statements writing to memory or having
12424 : : volatile operand. */
12425 : 12420 : if (gimple_vdef (stmt1)
12426 : 12420 : || gimple_has_volatile_ops (stmt1))
12427 : : break;
12428 : 6195 : gsi_from = gsi;
12429 : 6195 : gsi_prev (&gsi);
12430 : 6195 : lhs = gimple_get_lhs (stmt1);
12431 : 6195 : if (!lhs)
12432 : : break;
12433 : :
12434 : : /* LHS of vectorized stmt must be SSA_NAME. */
12435 : 6195 : if (TREE_CODE (lhs) != SSA_NAME)
12436 : : break;
12437 : :
12438 : 6195 : if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12439 : : {
12440 : : /* Remove dead scalar statement. */
12441 : 4010 : if (has_zero_uses (lhs))
12442 : : {
12443 : 3632 : gsi_remove (&gsi_from, true);
12444 : 3632 : continue;
12445 : : }
12446 : : }
12447 : :
12448 : : /* Check that LHS does not have uses outside of STORE_BB. */
12449 : 2563 : res = true;
12450 : 4108 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
12451 : : {
12452 : 2647 : gimple *use_stmt;
12453 : 2647 : use_stmt = USE_STMT (use_p);
12454 : 2647 : if (is_gimple_debug (use_stmt))
12455 : 0 : continue;
12456 : 2647 : if (gimple_bb (use_stmt) != store_bb)
12457 : : {
12458 : : res = false;
12459 : : break;
12460 : : }
12461 : : }
12462 : 2563 : if (!res)
12463 : : break;
12464 : :
12465 : 1461 : if (gimple_vuse (stmt1)
12466 : 2221 : && gimple_vuse (stmt1) != gimple_vuse (last_store))
12467 : : break;
12468 : :
12469 : : /* Can move STMT1 to STORE_BB. */
12470 : 1461 : if (dump_enabled_p ())
12471 : 539 : dump_printf_loc (MSG_NOTE, vect_location,
12472 : : "Move stmt to created bb\n%G", stmt1);
12473 : 1461 : gsi_move_before (&gsi_from, &gsi_to);
12474 : : /* Shift GSI_TO for further insertion. */
12475 : 2922 : gsi_prev (&gsi_to);
12476 : : }
12477 : : /* Put other masked stores with the same mask to STORE_BB. */
12478 : 1134 : if (worklist.is_empty ()
12479 : 463 : || gimple_call_arg (worklist.last (), 2) != mask
12480 : 18 : || worklist.last () != stmt1)
12481 : : break;
12482 : 18 : last = worklist.pop ();
12483 : 18 : }
12484 : 2232 : add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
12485 : : }
12486 : 743 : }
12487 : :
12488 : : /* Decide whether it is possible to use a zero-based induction variable
12489 : : when vectorizing LOOP_VINFO with partial vectors. If it is, return
12490 : : the value that the induction variable must be able to hold in order
12491 : : to ensure that the rgroups eventually have no active vector elements.
12492 : : Return -1 otherwise. */
12493 : :
12494 : : widest_int
12495 : 24 : vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
12496 : : {
12497 : 24 : tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
12498 : 24 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
12499 : 24 : unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
12500 : :
12501 : : /* Calculate the value that the induction variable must be able
12502 : : to hit in order to ensure that we end the loop with an all-false mask.
12503 : : This involves adding the maximum number of inactive trailing scalar
12504 : : iterations. */
12505 : 24 : widest_int iv_limit = -1;
12506 : 24 : if (max_loop_iterations (loop, &iv_limit))
12507 : : {
12508 : 24 : if (niters_skip)
12509 : : {
12510 : : /* Add the maximum number of skipped iterations to the
12511 : : maximum iteration count. */
12512 : 0 : if (TREE_CODE (niters_skip) == INTEGER_CST)
12513 : 0 : iv_limit += wi::to_widest (niters_skip);
12514 : : else
12515 : 0 : iv_limit += max_vf - 1;
12516 : : }
12517 : 24 : else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
12518 : : /* Make a conservatively-correct assumption. */
12519 : 2 : iv_limit += max_vf - 1;
12520 : :
12521 : : /* IV_LIMIT is the maximum number of latch iterations, which is also
12522 : : the maximum in-range IV value. Round this value down to the previous
12523 : : vector alignment boundary and then add an extra full iteration. */
12524 : 24 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
12525 : 24 : iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
12526 : : }
12527 : 24 : return iv_limit;
12528 : : }
12529 : :
12530 : : /* For the given rgroup_controls RGC, check whether an induction variable
12531 : : would ever hit a value that produces a set of all-false masks or zero
12532 : : lengths before wrapping around. Return true if it's possible to wrap
12533 : : around before hitting the desirable value, otherwise return false. */
12534 : :
12535 : : bool
12536 : 0 : vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
12537 : : {
12538 : 0 : widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
12539 : :
12540 : 0 : if (iv_limit == -1)
12541 : : return true;
12542 : :
12543 : 0 : tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
12544 : 0 : unsigned int compare_precision = TYPE_PRECISION (compare_type);
12545 : 0 : unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
12546 : :
12547 : 0 : if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
12548 : : return true;
12549 : :
12550 : : return false;
12551 : 0 : }
|