Branch data Line data Source code
1 : : /* Loop Vectorization
2 : : Copyright (C) 2003-2025 Free Software Foundation, Inc.
3 : : Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 : : Ira Rosen <irar@il.ibm.com>
5 : :
6 : : This file is part of GCC.
7 : :
8 : : GCC is free software; you can redistribute it and/or modify it under
9 : : the terms of the GNU General Public License as published by the Free
10 : : Software Foundation; either version 3, or (at your option) any later
11 : : version.
12 : :
13 : : GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 : : WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 : : FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 : : for more details.
17 : :
18 : : You should have received a copy of the GNU General Public License
19 : : along with GCC; see the file COPYING3. If not see
20 : : <http://www.gnu.org/licenses/>. */
21 : :
22 : : #define INCLUDE_ALGORITHM
23 : : #include "config.h"
24 : : #include "system.h"
25 : : #include "coretypes.h"
26 : : #include "backend.h"
27 : : #include "target.h"
28 : : #include "rtl.h"
29 : : #include "tree.h"
30 : : #include "gimple.h"
31 : : #include "cfghooks.h"
32 : : #include "tree-pass.h"
33 : : #include "ssa.h"
34 : : #include "optabs-tree.h"
35 : : #include "memmodel.h"
36 : : #include "optabs.h"
37 : : #include "diagnostic-core.h"
38 : : #include "fold-const.h"
39 : : #include "stor-layout.h"
40 : : #include "cfganal.h"
41 : : #include "gimplify.h"
42 : : #include "gimple-iterator.h"
43 : : #include "gimplify-me.h"
44 : : #include "tree-ssa-loop-ivopts.h"
45 : : #include "tree-ssa-loop-manip.h"
46 : : #include "tree-ssa-loop-niter.h"
47 : : #include "tree-ssa-loop.h"
48 : : #include "cfgloop.h"
49 : : #include "tree-scalar-evolution.h"
50 : : #include "tree-vectorizer.h"
51 : : #include "gimple-fold.h"
52 : : #include "cgraph.h"
53 : : #include "tree-cfg.h"
54 : : #include "tree-if-conv.h"
55 : : #include "internal-fn.h"
56 : : #include "tree-vector-builder.h"
57 : : #include "vec-perm-indices.h"
58 : : #include "tree-eh.h"
59 : : #include "case-cfn-macros.h"
60 : : #include "langhooks.h"
61 : :
62 : : /* Loop Vectorization Pass.
63 : :
64 : : This pass tries to vectorize loops.
65 : :
66 : : For example, the vectorizer transforms the following simple loop:
67 : :
68 : : short a[N]; short b[N]; short c[N]; int i;
69 : :
70 : : for (i=0; i<N; i++){
71 : : a[i] = b[i] + c[i];
72 : : }
73 : :
74 : : as if it had been manually vectorized by rewriting the source code into:
75 : :
76 : : typedef int __attribute__((mode(V8HI))) v8hi;
77 : : short a[N]; short b[N]; short c[N]; int i;
78 : : v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
79 : : v8hi va, vb, vc;
80 : :
81 : : for (i=0; i<N/8; i++){
82 : : vb = pb[i];
83 : : vc = pc[i];
84 : : va = vb + vc;
85 : : pa[i] = va;
86 : : }
87 : :
88 : : The main entry to this pass is vectorize_loops(), in which
89 : : the vectorizer applies a set of analyses on a given set of loops,
90 : : followed by the actual vectorization transformation for the loops that
91 : : had successfully passed the analysis phase.
92 : : Throughout this pass we make a distinction between two types of
93 : : data: scalars (which are represented by SSA_NAMES), and memory references
94 : : ("data-refs"). These two types of data require different handling both
95 : : during analysis and transformation. The types of data-refs that the
96 : : vectorizer currently supports are ARRAY_REFS whose base is an array DECL
97 : : (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
98 : : accesses are required to have a simple (consecutive) access pattern.
99 : :
100 : : Analysis phase:
101 : : ===============
102 : : The driver for the analysis phase is vect_analyze_loop().
103 : : It applies a set of analyses, some of which rely on the scalar evolution
104 : : analyzer (scev) developed by Sebastian Pop.
105 : :
106 : : During the analysis phase the vectorizer records some information
107 : : per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
108 : : loop, as well as general information about the loop as a whole, which is
109 : : recorded in a "loop_vec_info" struct attached to each loop.
110 : :
111 : : Transformation phase:
112 : : =====================
113 : : The loop transformation phase scans all the stmts in the loop, and
114 : : creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
115 : : the loop that needs to be vectorized. It inserts the vector code sequence
116 : : just before the scalar stmt S, and records a pointer to the vector code
117 : : in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
118 : : attached to S). This pointer will be used for the vectorization of following
119 : : stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
120 : : otherwise, we rely on dead code elimination for removing it.
121 : :
122 : : For example, say stmt S1 was vectorized into stmt VS1:
123 : :
124 : : VS1: vb = px[i];
125 : : S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
126 : : S2: a = b;
127 : :
128 : : To vectorize stmt S2, the vectorizer first finds the stmt that defines
129 : : the operand 'b' (S1), and gets the relevant vector def 'vb' from the
130 : : vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
131 : : resulting sequence would be:
132 : :
133 : : VS1: vb = px[i];
134 : : S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
135 : : VS2: va = vb;
136 : : S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
137 : :
138 : : Operands that are not SSA_NAMEs are data-refs that appear in
139 : : load/store operations (like 'x[i]' in S1), and are handled differently.
140 : :
141 : : Target modeling:
142 : : =================
143 : : Currently the only target specific information that is used is the
144 : : size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
145 : : Targets that can support different sizes of vectors will, for now, need
146 : : to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
147 : : flexibility will be added in the future.
148 : :
149 : : Since we only vectorize operations whose vector form can be
150 : : expressed using existing tree codes, to verify that an operation is
151 : : supported, the vectorizer checks the relevant optab at the relevant
152 : : machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
153 : : the value found is CODE_FOR_nothing, then there's no target support, and
154 : : we can't vectorize the stmt.
155 : :
156 : : For additional information on this project see:
157 : : http://gcc.gnu.org/projects/tree-ssa/vectorization.html
158 : : */
159 : :
160 : : static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
161 : : unsigned *);
162 : : static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
163 : : bool *, bool *, bool);
164 : :
165 : : /* Subroutine of vect_determine_vf_for_stmt that handles only one
166 : : statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
167 : : may already be set for general statements (not just data refs). */
168 : :
169 : : static opt_result
170 : 3269379 : vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
171 : : bool vectype_maybe_set_p,
172 : : poly_uint64 *vf)
173 : : {
174 : 3269379 : gimple *stmt = stmt_info->stmt;
175 : :
176 : 3269379 : if ((!STMT_VINFO_RELEVANT_P (stmt_info)
177 : 1597529 : && !STMT_VINFO_LIVE_P (stmt_info))
178 : 3269474 : || gimple_clobber_p (stmt))
179 : : {
180 : 1597434 : if (dump_enabled_p ())
181 : 123805 : dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
182 : 1597434 : return opt_result::success ();
183 : : }
184 : :
185 : 1671945 : tree stmt_vectype, nunits_vectype;
186 : 1671945 : opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
187 : : &stmt_vectype,
188 : : &nunits_vectype);
189 : 1671945 : if (!res)
190 : 2361 : return res;
191 : :
192 : 1669584 : if (stmt_vectype)
193 : : {
194 : 1669558 : if (STMT_VINFO_VECTYPE (stmt_info))
195 : : /* The only case when a vectype has already been set is for stmts
196 : : that contain a data ref, or for "pattern-stmts" (stmts generated
197 : : by the vectorizer to represent/replace a certain idiom). */
198 : 990072 : gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
199 : : || vectype_maybe_set_p)
200 : : && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
201 : : else
202 : 679486 : STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
203 : : }
204 : :
205 : 1669584 : if (nunits_vectype)
206 : 1669558 : vect_update_max_nunits (vf, nunits_vectype);
207 : :
208 : 1669584 : return opt_result::success ();
209 : : }
210 : :
211 : : /* Subroutine of vect_determine_vectorization_factor. Set the vector
212 : : types of STMT_INFO and all attached pattern statements and update
213 : : the vectorization factor VF accordingly. Return true on success
214 : : or false if something prevented vectorization. */
215 : :
216 : : static opt_result
217 : 2656279 : vect_determine_vf_for_stmt (vec_info *vinfo,
218 : : stmt_vec_info stmt_info, poly_uint64 *vf)
219 : : {
220 : 2656279 : if (dump_enabled_p ())
221 : 216310 : dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
222 : : stmt_info->stmt);
223 : 2656279 : opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
224 : 2656279 : if (!res)
225 : 2361 : return res;
226 : :
227 : 2653918 : if (STMT_VINFO_IN_PATTERN_P (stmt_info)
228 : 297563 : && STMT_VINFO_RELATED_STMT (stmt_info))
229 : : {
230 : 297563 : gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
231 : 297563 : stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
232 : :
233 : : /* If a pattern statement has def stmts, analyze them too. */
234 : 297563 : for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
235 : 613100 : !gsi_end_p (si); gsi_next (&si))
236 : : {
237 : 315537 : stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
238 : 315537 : if (dump_enabled_p ())
239 : 22146 : dump_printf_loc (MSG_NOTE, vect_location,
240 : : "==> examining pattern def stmt: %G",
241 : : def_stmt_info->stmt);
242 : 315537 : res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
243 : 315537 : if (!res)
244 : 0 : return res;
245 : : }
246 : :
247 : 297563 : if (dump_enabled_p ())
248 : 17510 : dump_printf_loc (MSG_NOTE, vect_location,
249 : : "==> examining pattern statement: %G",
250 : : stmt_info->stmt);
251 : 297563 : res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
252 : 297563 : if (!res)
253 : 0 : return res;
254 : : }
255 : :
256 : 2653918 : return opt_result::success ();
257 : : }
258 : :
259 : : /* Function vect_determine_vectorization_factor
260 : :
261 : : Determine the vectorization factor (VF). VF is the number of data elements
262 : : that are operated upon in parallel in a single iteration of the vectorized
263 : : loop. For example, when vectorizing a loop that operates on 4-byte elements,
264 : : on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
265 : : elements can fit in a single vector register.
266 : :
267 : : We currently support vectorization of loops in which all types operated upon
268 : : are of the same size. Therefore this function currently sets VF according to
269 : : the size of the types operated upon, and fails if there are multiple sizes
270 : : in the loop.
271 : :
272 : : VF is also the factor by which the loop iterations are strip-mined, e.g.:
273 : : original loop:
274 : : for (i=0; i<N; i++){
275 : : a[i] = b[i] + c[i];
276 : : }
277 : :
278 : : vectorized loop:
279 : : for (i=0; i<N; i+=VF){
280 : : a[i:VF] = b[i:VF] + c[i:VF];
281 : : }
282 : : */
283 : :
284 : : static opt_result
285 : 286828 : vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
286 : : {
287 : 286828 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
288 : 286828 : basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
289 : 286828 : unsigned nbbs = loop->num_nodes;
290 : 286828 : poly_uint64 vectorization_factor = 1;
291 : 286828 : tree scalar_type = NULL_TREE;
292 : 286828 : gphi *phi;
293 : 286828 : tree vectype;
294 : 286828 : stmt_vec_info stmt_info;
295 : 286828 : unsigned i;
296 : :
297 : 286828 : DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
298 : :
299 : 944539 : for (i = 0; i < nbbs; i++)
300 : : {
301 : 679975 : basic_block bb = bbs[i];
302 : :
303 : 1371001 : for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
304 : 691026 : gsi_next (&si))
305 : : {
306 : 710929 : phi = si.phi ();
307 : 710929 : stmt_info = loop_vinfo->lookup_stmt (phi);
308 : 710929 : if (dump_enabled_p ())
309 : 53006 : dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
310 : : (gimple *) phi);
311 : :
312 : 710929 : gcc_assert (stmt_info);
313 : :
314 : 710929 : if (STMT_VINFO_RELEVANT_P (stmt_info)
315 : 401137 : || STMT_VINFO_LIVE_P (stmt_info))
316 : : {
317 : 309792 : gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
318 : 309792 : scalar_type = TREE_TYPE (PHI_RESULT (phi));
319 : :
320 : 309792 : if (dump_enabled_p ())
321 : 11104 : dump_printf_loc (MSG_NOTE, vect_location,
322 : : "get vectype for scalar type: %T\n",
323 : : scalar_type);
324 : :
325 : 309792 : vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
326 : 309792 : if (!vectype)
327 : 19903 : return opt_result::failure_at (phi,
328 : : "not vectorized: unsupported "
329 : : "data-type %T\n",
330 : : scalar_type);
331 : 289889 : STMT_VINFO_VECTYPE (stmt_info) = vectype;
332 : :
333 : 289889 : if (dump_enabled_p ())
334 : 11038 : dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
335 : : vectype);
336 : :
337 : 289889 : if (dump_enabled_p ())
338 : : {
339 : 11038 : dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
340 : 11038 : dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
341 : 11038 : dump_printf (MSG_NOTE, "\n");
342 : : }
343 : :
344 : 289889 : vect_update_max_nunits (&vectorization_factor, vectype);
345 : : }
346 : : }
347 : :
348 : 5194956 : for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
349 : 3874812 : gsi_next (&si))
350 : : {
351 : 3877173 : if (is_gimple_debug (gsi_stmt (si)))
352 : 1220894 : continue;
353 : 2656279 : stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
354 : 2656279 : opt_result res
355 : 2656279 : = vect_determine_vf_for_stmt (loop_vinfo,
356 : : stmt_info, &vectorization_factor);
357 : 2656279 : if (!res)
358 : 2361 : return res;
359 : : }
360 : : }
361 : :
362 : : /* TODO: Analyze cost. Decide if worth while to vectorize. */
363 : 264564 : if (dump_enabled_p ())
364 : : {
365 : 17314 : dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
366 : 17314 : dump_dec (MSG_NOTE, vectorization_factor);
367 : 17314 : dump_printf (MSG_NOTE, "\n");
368 : : }
369 : :
370 : 264564 : if (known_le (vectorization_factor, 1U))
371 : 33622 : return opt_result::failure_at (vect_location,
372 : : "not vectorized: unsupported data-type\n");
373 : 230942 : LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
374 : 230942 : return opt_result::success ();
375 : : }
376 : :
377 : :
378 : : /* Function vect_is_simple_iv_evolution.
379 : :
380 : : FORNOW: A simple evolution of an induction variables in the loop is
381 : : considered a polynomial evolution. */
382 : :
383 : : static bool
384 : 664949 : vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
385 : : tree * step)
386 : : {
387 : 664949 : tree init_expr;
388 : 664949 : tree step_expr;
389 : 664949 : tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
390 : 664949 : basic_block bb;
391 : :
392 : : /* When there is no evolution in this loop, the evolution function
393 : : is not "simple". */
394 : 664949 : if (evolution_part == NULL_TREE)
395 : : return false;
396 : :
397 : : /* When the evolution is a polynomial of degree >= 2
398 : : the evolution function is not "simple". */
399 : 713569 : if (tree_is_chrec (evolution_part))
400 : : return false;
401 : :
402 : 614764 : step_expr = evolution_part;
403 : 614764 : init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
404 : :
405 : 614764 : if (dump_enabled_p ())
406 : 39669 : dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
407 : : step_expr, init_expr);
408 : :
409 : 614764 : *init = init_expr;
410 : 614764 : *step = step_expr;
411 : :
412 : 614764 : if (TREE_CODE (step_expr) != INTEGER_CST
413 : 55330 : && (TREE_CODE (step_expr) != SSA_NAME
414 : 48494 : || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
415 : 48278 : && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
416 : 6737 : || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
417 : 117 : && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
418 : 117 : || !flag_associative_math)))
419 : 663444 : && (TREE_CODE (step_expr) != REAL_CST
420 : 510 : || !flag_associative_math))
421 : : {
422 : 48620 : if (dump_enabled_p ())
423 : 3133 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
424 : : "step unknown.\n");
425 : 48620 : return false;
426 : : }
427 : :
428 : : return true;
429 : : }
430 : :
431 : : /* Function vect_is_nonlinear_iv_evolution
432 : :
433 : : Nonlinear induction is supported only for an integer type:
434 : : 1. neg
435 : : 2. mul by constant
436 : : 3. lshift/rshift by constant.
437 : :
438 : : For neg induction, return a fake step as integer -1. */
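/* Illustrative scalar forms (not from the sources) of the supported
   nonlinear inductions, each executed once per iteration of LOOP with the
   variable also used inside the loop body:

     x = -x;        neg
     x = x * 3;     mul by constant
     x = x >> 1;    rshift by constant

   For the neg case the recorded step is the fake constant -1.  */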
439 : : static bool
440 : 96520 : vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
441 : : gphi* loop_phi_node, tree *init, tree *step)
442 : : {
443 : 96520 : tree init_expr, ev_expr, result, op1, op2;
444 : 96520 : gimple* def;
445 : :
446 : 96520 : if (gimple_phi_num_args (loop_phi_node) != 2)
447 : : return false;
448 : :
449 : 96520 : init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
450 : 96520 : ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
451 : :
452 : : /* Support nonlinear induction only for integer type. */
453 : 96520 : if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
454 : : return false;
455 : :
456 : 70237 : *init = init_expr;
457 : 70237 : result = PHI_RESULT (loop_phi_node);
458 : :
459 : 70237 : if (TREE_CODE (ev_expr) != SSA_NAME
460 : 67408 : || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
461 : 70237 : || !is_gimple_assign (def))
462 : : return false;
463 : :
464 : 62631 : enum tree_code t_code = gimple_assign_rhs_code (def);
465 : 62631 : switch (t_code)
466 : : {
467 : 1573 : case NEGATE_EXPR:
468 : 1573 : if (gimple_assign_rhs1 (def) != result)
469 : : return false;
470 : 1573 : *step = build_int_cst (TREE_TYPE (init_expr), -1);
471 : 1573 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
472 : 1573 : break;
473 : :
474 : 9555 : case RSHIFT_EXPR:
475 : 9555 : case LSHIFT_EXPR:
476 : 9555 : case MULT_EXPR:
477 : 9555 : op1 = gimple_assign_rhs1 (def);
478 : 9555 : op2 = gimple_assign_rhs2 (def);
479 : 9555 : if (TREE_CODE (op2) != INTEGER_CST
480 : 6101 : || op1 != result)
481 : : return false;
482 : 5984 : *step = op2;
483 : 5984 : if (t_code == LSHIFT_EXPR)
484 : 188 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
485 : 5796 : else if (t_code == RSHIFT_EXPR)
486 : 5121 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
487 : : /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
488 : : else
489 : 675 : STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
490 : : break;
491 : :
492 : : default:
493 : : return false;
494 : : }
495 : :
496 : 7557 : STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
497 : 7557 : STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
498 : :
499 : 7557 : return true;
500 : : }
501 : :
502 : : /* Return true if PHI, described by STMT_INFO, is the inner PHI in
503 : : what we are assuming is a double reduction. For example, given
504 : : a structure like this:
505 : :
506 : : outer1:
507 : : x_1 = PHI <x_4(outer2), ...>;
508 : : ...
509 : :
510 : : inner:
511 : : x_2 = PHI <x_1(outer1), ...>;
512 : : ...
513 : : x_3 = ...;
514 : : ...
515 : :
516 : : outer2:
517 : : x_4 = PHI <x_3(inner)>;
518 : : ...
519 : :
520 : : outer loop analysis would treat x_1 as a double reduction phi and
521 : : this function would then return true for x_2. */
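/* In source terms such a nest typically comes from (illustrative only):

     s = ...;
     for (i = 0; i < n; i++)       <-- outer loop, x_1/x_4 carry 's'
       for (j = 0; j < m; j++)     <-- inner loop, x_2/x_3 carry 's'
         s += a[i][j];
*/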
522 : :
523 : : static bool
524 : 665915 : vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
525 : : {
526 : 665915 : use_operand_p use_p;
527 : 665915 : ssa_op_iter op_iter;
528 : 1996753 : FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
529 : 1331804 : if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
530 : 664739 : if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
531 : : return true;
532 : : return false;
533 : : }
534 : :
535 : : /* Returns true if Phi is a first-order recurrence. A first-order
536 : : recurrence is a non-reduction recurrence relation in which the value of
537 : : the recurrence in the current loop iteration equals a value defined in
538 : : the previous iteration. */
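/* A typical scalar form of such a recurrence (illustrative only):

     t = 0;
     for (i = 0; i < n; i++)
       {
         b[i] = a[i] - t;
         t = a[i];
       }

   't' is neither an induction nor a reduction; each iteration just
   consumes the value that the previous iteration produced.  */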
539 : :
540 : : static bool
541 : 21974 : vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
542 : : gphi *phi)
543 : : {
544 : : /* A nested cycle isn't vectorizable as first order recurrence. */
545 : 21974 : if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
546 : : return false;
547 : :
548 : : /* Ensure the loop latch definition is from within the loop. */
549 : 21737 : edge latch = loop_latch_edge (loop);
550 : 21737 : tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
551 : 21737 : if (TREE_CODE (ldef) != SSA_NAME
552 : 18633 : || SSA_NAME_IS_DEFAULT_DEF (ldef)
553 : 18605 : || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
554 : 39159 : || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
555 : 4625 : return false;
556 : :
557 : 17112 : tree def = gimple_phi_result (phi);
558 : :
559 : : /* Ensure every use_stmt of the phi node is dominated by the latch
560 : : definition. */
561 : 17112 : imm_use_iterator imm_iter;
562 : 17112 : use_operand_p use_p;
563 : 19052 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
564 : 18740 : if (!is_gimple_debug (USE_STMT (use_p))
565 : 36598 : && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
566 : 10721 : || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
567 : : USE_STMT (use_p))))
568 : 16800 : return false;
569 : :
570 : : /* First-order recurrence autovectorization needs shuffle vector. */
571 : 312 : tree scalar_type = TREE_TYPE (def);
572 : 312 : tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
573 : 312 : if (!vectype)
574 : : return false;
575 : :
576 : : return true;
577 : : }
578 : :
579 : : /* Function vect_analyze_scalar_cycles_1.
580 : :
581 : : Examine the cross iteration def-use cycles of scalar variables
582 : : in LOOP. LOOP_VINFO represents the loop that is now being
583 : : considered for vectorization (can be LOOP, or an outer-loop
584 : : enclosing LOOP). SLP indicates whether there will be any
585 : : subsequent SLP analyses. */
586 : :
587 : : static void
588 : 324795 : vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
589 : : bool slp)
590 : : {
591 : 324795 : basic_block bb = loop->header;
592 : 324795 : tree init, step;
593 : 324795 : auto_vec<stmt_vec_info, 64> worklist;
594 : 324795 : gphi_iterator gsi;
595 : 324795 : bool double_reduc, reduc_chain;
596 : :
597 : 324795 : DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
598 : :
599 : : /* First - identify all inductions. Reduction detection assumes that all the
600 : : inductions have been identified; therefore, this order must not be
601 : : changed. */
602 : 1172336 : for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
603 : : {
604 : 847541 : gphi *phi = gsi.phi ();
605 : 847541 : tree access_fn = NULL;
606 : 847541 : tree def = PHI_RESULT (phi);
607 : 847541 : stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
608 : :
609 : 847541 : if (dump_enabled_p ())
610 : 56614 : dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
611 : : (gimple *) phi);
612 : :
613 : : /* Skip virtual PHIs. The data dependences that are associated with
614 : : virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
615 : 1695082 : if (virtual_operand_p (def))
616 : 273846 : continue;
617 : :
618 : 665915 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
619 : :
620 : : /* Analyze the evolution function. */
621 : 665915 : access_fn = analyze_scalar_evolution (loop, def);
622 : 665915 : if (access_fn)
623 : : {
624 : 665915 : STRIP_NOPS (access_fn);
625 : 665915 : if (dump_enabled_p ())
626 : 41585 : dump_printf_loc (MSG_NOTE, vect_location,
627 : : "Access function of PHI: %T\n", access_fn);
628 : 665915 : STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
629 : 665915 : = initial_condition_in_loop_num (access_fn, loop->num);
630 : 665915 : STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
631 : 665915 : = evolution_part_in_loop_num (access_fn, loop->num);
632 : : }
633 : :
634 : 758135 : if ((!access_fn
635 : 665915 : || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
636 : 664949 : || !vect_is_simple_iv_evolution (loop->num, access_fn,
637 : : &init, &step)
638 : 566144 : || (LOOP_VINFO_LOOP (loop_vinfo) != loop
639 : 9831 : && TREE_CODE (step) != INTEGER_CST))
640 : : /* Only handle nonlinear iv for same loop. */
641 : 765692 : && (LOOP_VINFO_LOOP (loop_vinfo) != loop
642 : 96520 : || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
643 : : phi, &init, &step)))
644 : : {
645 : 92220 : worklist.safe_push (stmt_vinfo);
646 : 92220 : continue;
647 : : }
648 : :
649 : 573695 : gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
650 : : != NULL_TREE);
651 : 573695 : gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
652 : :
653 : 573695 : if (dump_enabled_p ())
654 : 36652 : dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
655 : 573695 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
656 : :
657 : : /* Mark if we have a non-linear IV. */
658 : 573695 : LOOP_VINFO_NON_LINEAR_IV (loop_vinfo)
659 : 573695 : = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_vinfo) != vect_step_op_add;
660 : : }
661 : :
662 : :
663 : : /* Second - identify all reductions and nested cycles. */
664 : 417015 : while (worklist.length () > 0)
665 : : {
666 : 92220 : stmt_vec_info stmt_vinfo = worklist.pop ();
667 : 92220 : gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
668 : 92220 : tree def = PHI_RESULT (phi);
669 : :
670 : 92220 : if (dump_enabled_p ())
671 : 4933 : dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
672 : : (gimple *) phi);
673 : :
674 : 184440 : gcc_assert (!virtual_operand_p (def)
675 : : && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
676 : :
677 : 92220 : stmt_vec_info reduc_stmt_info
678 : 92220 : = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
679 : 92220 : &reduc_chain, slp);
680 : 92220 : if (reduc_stmt_info)
681 : : {
682 : 70246 : STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
683 : 70246 : STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
684 : 70246 : if (double_reduc)
685 : : {
686 : 966 : if (dump_enabled_p ())
687 : 101 : dump_printf_loc (MSG_NOTE, vect_location,
688 : : "Detected double reduction.\n");
689 : :
690 : 966 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
691 : 966 : STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
692 : : /* Make it accessible for SLP vectorization. */
693 : 966 : LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt_info);
694 : : }
695 : : else
696 : : {
697 : 69280 : if (loop != LOOP_VINFO_LOOP (loop_vinfo))
698 : : {
699 : 3020 : if (dump_enabled_p ())
700 : 465 : dump_printf_loc (MSG_NOTE, vect_location,
701 : : "Detected vectorizable nested cycle.\n");
702 : :
703 : 3020 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
704 : : }
705 : : else
706 : : {
707 : 66260 : if (dump_enabled_p ())
708 : 3902 : dump_printf_loc (MSG_NOTE, vect_location,
709 : : "Detected reduction.\n");
710 : :
711 : 66260 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
712 : 66260 : STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
713 : : /* Store the reduction cycles for possible vectorization in
714 : : loop-aware SLP if it was not detected as a reduction
715 : : chain. */
716 : 66260 : if (! reduc_chain)
717 : 65433 : LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
718 : 65433 : (reduc_stmt_info);
719 : : }
720 : : }
721 : : }
722 : 21974 : else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
723 : 306 : STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
724 : : else
725 : 21668 : if (dump_enabled_p ())
726 : 414 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
727 : : "Unknown def-use cycle pattern.\n");
728 : : }
729 : 324795 : }
730 : :
731 : :
732 : : /* Function vect_analyze_scalar_cycles.
733 : :
734 : : Examine the cross iteration def-use cycles of scalar variables, by
735 : : analyzing the loop-header PHIs of scalar variables. Classify each
736 : : cycle as one of the following: invariant, induction, reduction, unknown.
737 : : We do that for the loop represented by LOOP_VINFO, and also for its
738 : : inner-loop, if it exists.
739 : : Examples for scalar cycles:
740 : :
741 : : Example1: reduction:
742 : :
743 : : loop1:
744 : : for (i=0; i<N; i++)
745 : : sum += a[i];
746 : :
747 : : Example2: induction:
748 : :
749 : : loop2:
750 : : for (i=0; i<N; i++)
751 : : a[i] = i; */
752 : :
753 : : static void
754 : 319808 : vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp)
755 : : {
756 : 319808 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
757 : :
758 : 319808 : vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp);
759 : :
760 : : /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
761 : : Reductions in such inner-loop therefore have different properties than
762 : : the reductions in the nest that gets vectorized:
763 : : 1. When vectorized, they are executed in the same order as in the original
764 : : scalar loop, so we can't change the order of computation when
765 : : vectorizing them.
766 : : 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
767 : : current checks are too strict. */
768 : :
769 : 319808 : if (loop->inner)
770 : 4987 : vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner, slp);
771 : 319808 : }
772 : :
773 : : /* Transfer group and reduction information from STMT_INFO to its
774 : : pattern stmt. */
775 : :
776 : : static void
777 : 29 : vect_fixup_reduc_chain (stmt_vec_info stmt_info)
778 : : {
779 : 29 : stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
780 : 29 : stmt_vec_info stmtp;
781 : 29 : gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
782 : : && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
783 : 29 : REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
784 : 246 : do
785 : : {
786 : 246 : stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
787 : 246 : gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
788 : : == STMT_VINFO_DEF_TYPE (stmt_info));
789 : 246 : REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
790 : 246 : stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
791 : 246 : if (stmt_info)
792 : 217 : REDUC_GROUP_NEXT_ELEMENT (stmtp)
793 : 217 : = STMT_VINFO_RELATED_STMT (stmt_info);
794 : : }
795 : 246 : while (stmt_info);
796 : 29 : }
797 : :
798 : : /* Fixup scalar cycles that now have their stmts detected as patterns. */
799 : :
800 : : static void
801 : 319808 : vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
802 : : {
803 : 319808 : stmt_vec_info first;
804 : 319808 : unsigned i;
805 : :
806 : 320635 : FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
807 : : {
808 : 827 : stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
809 : 3634 : while (next)
810 : : {
811 : 2835 : if ((STMT_VINFO_IN_PATTERN_P (next)
812 : 2835 : != STMT_VINFO_IN_PATTERN_P (first))
813 : 5642 : || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
814 : : break;
815 : 2807 : next = REDUC_GROUP_NEXT_ELEMENT (next);
816 : : }
817 : : /* If all reduction chain members are well-formed patterns adjust
818 : : the group to group the pattern stmts instead. */
819 : 827 : if (! next
820 : 856 : && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
821 : : {
822 : 799 : if (STMT_VINFO_IN_PATTERN_P (first))
823 : : {
824 : 29 : vect_fixup_reduc_chain (first);
825 : 58 : LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
826 : 29 : = STMT_VINFO_RELATED_STMT (first);
827 : : }
828 : : }
829 : : /* If not all stmts in the chain are patterns, or if we failed
830 : : to update STMT_VINFO_REDUC_IDX, dissolve the chain and handle
831 : : it as a regular reduction instead. */
832 : : else
833 : : {
834 : : stmt_vec_info vinfo = first;
835 : : stmt_vec_info last = NULL;
836 : 117 : while (vinfo)
837 : : {
838 : 89 : next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
839 : 89 : REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
840 : 89 : REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
841 : 89 : last = vinfo;
842 : 89 : vinfo = next;
843 : : }
844 : 28 : STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
845 : 28 : = vect_internal_def;
846 : 31 : loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
847 : 28 : LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
848 : 28 : --i;
849 : : }
850 : : }
851 : 319808 : }
852 : :
853 : : /* Function vect_get_loop_niters.
854 : :
855 : : Determine how many iterations the loop executes and place the count
856 : : in NUMBER_OF_ITERATIONS. Place the number of latch iterations
857 : : in NUMBER_OF_ITERATIONSM1. Place the condition under which the
858 : : niter information holds in ASSUMPTIONS.
859 : :
860 : : Return the loop exit conditions. */
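/* For instance (illustrative only), for a loop

     for (i = 0; i < n; i++)
       ...

   that is known to be entered, the latch runs n - 1 times, so
   NUMBER_OF_ITERATIONSM1 is n - 1 while NUMBER_OF_ITERATIONS, the number
   of header executions, is n.  */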
861 : :
862 : :
863 : : static vec<gcond *>
864 : 259326 : vect_get_loop_niters (class loop *loop, const_edge main_exit, tree *assumptions,
865 : : tree *number_of_iterations, tree *number_of_iterationsm1)
866 : : {
867 : 259326 : auto_vec<edge> exits = get_loop_exit_edges (loop);
868 : 259326 : vec<gcond *> conds;
869 : 518652 : conds.create (exits.length ());
870 : 259326 : class tree_niter_desc niter_desc;
871 : 259326 : tree niter_assumptions, niter, may_be_zero;
872 : :
873 : 259326 : *assumptions = boolean_true_node;
874 : 259326 : *number_of_iterationsm1 = chrec_dont_know;
875 : 259326 : *number_of_iterations = chrec_dont_know;
876 : :
877 : 259326 : DUMP_VECT_SCOPE ("get_loop_niters");
878 : :
879 : 259326 : if (exits.is_empty ())
880 : 0 : return conds;
881 : :
882 : 259326 : if (dump_enabled_p ())
883 : 13534 : dump_printf_loc (MSG_NOTE, vect_location, "Loop has %d exits.\n",
884 : : exits.length ());
885 : :
886 : : edge exit;
887 : : unsigned int i;
888 : 626914 : FOR_EACH_VEC_ELT (exits, i, exit)
889 : : {
890 : 367588 : gcond *cond = get_loop_exit_condition (exit);
891 : 367588 : if (cond)
892 : 357413 : conds.safe_push (cond);
893 : :
894 : 367588 : if (dump_enabled_p ())
895 : 14495 : dump_printf_loc (MSG_NOTE, vect_location, "Analyzing exit %d...\n", i);
896 : :
897 : 367588 : if (exit != main_exit)
898 : 148474 : continue;
899 : :
900 : 259326 : may_be_zero = NULL_TREE;
901 : 259326 : if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
902 : 259326 : || chrec_contains_undetermined (niter_desc.niter))
903 : 40212 : continue;
904 : :
905 : 219114 : niter_assumptions = niter_desc.assumptions;
906 : 219114 : may_be_zero = niter_desc.may_be_zero;
907 : 219114 : niter = niter_desc.niter;
908 : :
909 : 219114 : if (may_be_zero && integer_zerop (may_be_zero))
910 : : may_be_zero = NULL_TREE;
911 : :
912 : 12423 : if (may_be_zero)
913 : : {
914 : 12423 : if (COMPARISON_CLASS_P (may_be_zero))
915 : : {
916 : : /* Try to combine may_be_zero with assumptions, this can simplify
917 : : computation of niter expression. */
918 : 12423 : if (niter_assumptions && !integer_nonzerop (niter_assumptions))
919 : 1066 : niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
920 : : niter_assumptions,
921 : : fold_build1 (TRUTH_NOT_EXPR,
922 : : boolean_type_node,
923 : : may_be_zero));
924 : : else
925 : 11357 : niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
926 : : build_int_cst (TREE_TYPE (niter), 0),
927 : : rewrite_to_non_trapping_overflow (niter));
928 : :
929 : 219114 : may_be_zero = NULL_TREE;
930 : : }
931 : 0 : else if (integer_nonzerop (may_be_zero))
932 : : {
933 : 0 : *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
934 : 0 : *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
935 : 0 : continue;
936 : : }
937 : : else
938 : 0 : continue;
939 : : }
940 : :
941 : : /* Loop assumptions are based off the normal exit. */
942 : 219114 : *assumptions = niter_assumptions;
943 : 219114 : *number_of_iterationsm1 = niter;
944 : :
945 : : /* We want the number of loop header executions which is the number
946 : : of latch executions plus one.
947 : : ??? For UINT_MAX latch executions this number overflows to zero
948 : : for loops like do { n++; } while (n != 0); */
949 : 219114 : if (niter && !chrec_contains_undetermined (niter))
950 : : {
951 : 219114 : niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter),
952 : : unshare_expr (niter),
953 : : build_int_cst (TREE_TYPE (niter), 1));
954 : 219114 : if (TREE_CODE (niter) == INTEGER_CST
955 : 118599 : && TREE_CODE (*number_of_iterationsm1) != INTEGER_CST)
956 : : {
957 : : /* If we manage to fold niter + 1 into INTEGER_CST even when
958 : : niter is some complex expression, ensure back
959 : : *number_of_iterationsm1 is an INTEGER_CST as well. See
960 : : PR113210. */
961 : 4 : *number_of_iterationsm1
962 : 4 : = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), niter,
963 : : build_minus_one_cst (TREE_TYPE (niter)));
964 : : }
965 : : }
966 : 219114 : *number_of_iterations = niter;
967 : : }
968 : :
969 : 259326 : if (dump_enabled_p ())
970 : 13534 : dump_printf_loc (MSG_NOTE, vect_location, "All loop exits successfully analyzed.\n");
971 : :
972 : 259326 : return conds;
973 : 259326 : }
974 : :
975 : : /* Determine the main loop exit for the vectorizer. */
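/* For a loop with an early break (illustrative only):

     for (i = 0; i < n; i++)
       if (a[i] == key)
         break;

   there are two exits; only the counting-IV exit (i < n) has a computable
   niter, so it is the one that would be picked as the main exit below.  */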
976 : :
977 : : edge
978 : 488267 : vec_init_loop_exit_info (class loop *loop)
979 : : {
980 : : /* Before we begin we must first determine which exit is the main one and
981 : : which are auxilary exits. */
982 : 488267 : auto_vec<edge> exits = get_loop_exit_edges (loop);
983 : 488267 : if (exits.length () == 1)
984 : 311721 : return exits[0];
985 : :
986 : : /* If we have multiple exits we only support counting IV at the moment.
987 : : Analyze all exits and return the last one we can analyze. */
988 : 176546 : class tree_niter_desc niter_desc;
989 : 176546 : edge candidate = NULL;
990 : 1166822 : for (edge exit : exits)
991 : : {
992 : 647042 : if (!get_loop_exit_condition (exit))
993 : 157877 : continue;
994 : :
995 : 489165 : if (number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
996 : 489165 : && !chrec_contains_undetermined (niter_desc.niter))
997 : : {
998 : 134740 : tree may_be_zero = niter_desc.may_be_zero;
999 : 134740 : if ((integer_zerop (may_be_zero)
1000 : : /* As we are handling may_be_zero that's not false by
1001 : : rewriting niter to may_be_zero ? 0 : niter we require
1002 : : an empty latch. */
1003 : 659828 : || (single_pred_p (loop->latch)
1004 : 12425 : && exit->src == single_pred (loop->latch)
1005 : 4102 : && (integer_nonzerop (may_be_zero)
1006 : 4102 : || COMPARISON_CLASS_P (may_be_zero))))
1007 : 138842 : && (!candidate
1008 : 6492 : || dominated_by_p (CDI_DOMINATORS, exit->src,
1009 : 6492 : candidate->src)))
1010 : : candidate = exit;
1011 : : }
1012 : : }
1013 : :
1014 : 176546 : return candidate;
1015 : 176546 : }
1016 : :
1017 : : /* Function bb_in_loop_p
1018 : :
1019 : : Used as predicate for dfs order traversal of the loop bbs. */
1020 : :
1021 : : static bool
1022 : 1314345 : bb_in_loop_p (const_basic_block bb, const void *data)
1023 : : {
1024 : 1314345 : const class loop *const loop = (const class loop *)data;
1025 : 1314345 : if (flow_bb_inside_loop_p (loop, bb))
1026 : : return true;
1027 : : return false;
1028 : : }
1029 : :
1030 : :
1031 : : /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1032 : : stmt_vec_info structs for all the stmts in LOOP_IN. */
1033 : :
1034 : 419214 : _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
1035 : : : vec_info (vec_info::loop, shared),
1036 : 419214 : loop (loop_in),
1037 : 419214 : num_itersm1 (NULL_TREE),
1038 : 419214 : num_iters (NULL_TREE),
1039 : 419214 : num_iters_unchanged (NULL_TREE),
1040 : 419214 : num_iters_assumptions (NULL_TREE),
1041 : 419214 : vector_costs (nullptr),
1042 : 419214 : scalar_costs (nullptr),
1043 : 419214 : th (0),
1044 : 419214 : versioning_threshold (0),
1045 : 419214 : vectorization_factor (0),
1046 : 419214 : main_loop_edge (nullptr),
1047 : 419214 : skip_main_loop_edge (nullptr),
1048 : 419214 : skip_this_loop_edge (nullptr),
1049 : 419214 : reusable_accumulators (),
1050 : 419214 : suggested_unroll_factor (1),
1051 : 419214 : max_vectorization_factor (0),
1052 : 419214 : mask_skip_niters (NULL_TREE),
1053 : 419214 : mask_skip_niters_pfa_offset (NULL_TREE),
1054 : 419214 : rgroup_compare_type (NULL_TREE),
1055 : 419214 : simd_if_cond (NULL_TREE),
1056 : 419214 : partial_vector_style (vect_partial_vectors_none),
1057 : 419214 : unaligned_dr (NULL),
1058 : 419214 : peeling_for_alignment (0),
1059 : 419214 : ptr_mask (0),
1060 : 419214 : nonlinear_iv (false),
1061 : 419214 : ivexpr_map (NULL),
1062 : 419214 : scan_map (NULL),
1063 : 419214 : slp_unrolling_factor (1),
1064 : 419214 : inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
1065 : 419214 : vectorizable (false),
1066 : 419214 : can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
1067 : 419214 : must_use_partial_vectors_p (false),
1068 : 419214 : using_partial_vectors_p (false),
1069 : 419214 : using_decrementing_iv_p (false),
1070 : 419214 : using_select_vl_p (false),
1071 : 419214 : epil_using_partial_vectors_p (false),
1072 : 419214 : partial_load_store_bias (0),
1073 : 419214 : peeling_for_gaps (false),
1074 : 419214 : peeling_for_niter (false),
1075 : 419214 : early_breaks (false),
1076 : 419214 : no_data_dependencies (false),
1077 : 419214 : has_mask_store (false),
1078 : 419214 : scalar_loop_scaling (profile_probability::uninitialized ()),
1079 : 419214 : scalar_loop (NULL),
1080 : 419214 : main_loop_info (NULL),
1081 : 419214 : orig_loop_info (NULL),
1082 : 419214 : epilogue_vinfo (NULL),
1083 : 419214 : drs_advanced_by (NULL_TREE),
1084 : 419214 : vec_loop_iv_exit (NULL),
1085 : 419214 : vec_epilogue_loop_iv_exit (NULL),
1086 : 838428 : scalar_loop_iv_exit (NULL)
1087 : : {
1088 : : /* CHECKME: We want to visit all BBs before their successors (except for
1089 : : latch blocks, for which this assertion wouldn't hold). In the simple
1090 : : case of the loop forms we allow, a dfs order of the BBs would be the same
1091 : : as reversed postorder traversal, so we are safe. */
1092 : :
1093 : 419214 : bbs = XCNEWVEC (basic_block, loop->num_nodes);
1094 : 838428 : nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p, bbs,
1095 : 419214 : loop->num_nodes, loop);
1096 : 419214 : gcc_assert (nbbs == loop->num_nodes);
1097 : :
1098 : 1508284 : for (unsigned int i = 0; i < nbbs; i++)
1099 : : {
1100 : 1089070 : basic_block bb = bbs[i];
1101 : 1089070 : gimple_stmt_iterator si;
1102 : :
1103 : 2229900 : for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1104 : : {
1105 : 1140830 : gimple *phi = gsi_stmt (si);
1106 : 1140830 : gimple_set_uid (phi, 0);
1107 : 1140830 : add_stmt (phi);
1108 : : }
1109 : :
1110 : 9360649 : for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1111 : : {
1112 : 7182509 : gimple *stmt = gsi_stmt (si);
1113 : 7182509 : gimple_set_uid (stmt, 0);
1114 : 7182509 : if (is_gimple_debug (stmt))
1115 : 2664345 : continue;
1116 : 4518164 : add_stmt (stmt);
1117 : : /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
1118 : : third argument is the #pragma omp simd if (x) condition, when 0,
1119 : : loop shouldn't be vectorized, when non-zero constant, it should
1120 : : be vectorized normally, otherwise versioned with vectorized loop
1121 : : done if the condition is non-zero at runtime. */
1122 : 4518164 : if (loop_in->simduid
1123 : 43826 : && is_gimple_call (stmt)
1124 : 4292 : && gimple_call_internal_p (stmt)
1125 : 4153 : && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
1126 : 4152 : && gimple_call_num_args (stmt) >= 3
1127 : 103 : && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
1128 : 4518267 : && (loop_in->simduid
1129 : 103 : == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
1130 : : {
1131 : 103 : tree arg = gimple_call_arg (stmt, 2);
1132 : 103 : if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
1133 : 103 : simd_if_cond = arg;
1134 : : else
1135 : 0 : gcc_assert (integer_nonzerop (arg));
1136 : : }
1137 : : }
1138 : : }
1139 : 419214 : }
1140 : :
1141 : : /* Free all levels of rgroup CONTROLS. */
1142 : :
1143 : : void
1144 : 1096714 : release_vec_loop_controls (vec<rgroup_controls> *controls)
1145 : : {
1146 : 1096714 : rgroup_controls *rgc;
1147 : 1096714 : unsigned int i;
1148 : 1096739 : FOR_EACH_VEC_ELT (*controls, i, rgc)
1149 : 25 : rgc->controls.release ();
1150 : 1096714 : controls->release ();
1151 : 1096714 : }
1152 : :
1153 : : /* Free all memory used by the _loop_vec_info, as well as all the
1154 : : stmt_vec_info structs of all the stmts in the loop. */
1155 : :
1156 : 419214 : _loop_vec_info::~_loop_vec_info ()
1157 : : {
1158 : 419214 : free (bbs);
1159 : :
1160 : 419214 : release_vec_loop_controls (&masks.rgc_vec);
1161 : 419214 : release_vec_loop_controls (&lens);
1162 : 422978 : delete ivexpr_map;
1163 : 419536 : delete scan_map;
1164 : 419214 : delete scalar_costs;
1165 : 419214 : delete vector_costs;
1166 : :
1167 : : /* When we release an epilogue vinfo that we do not intend to use
1168 : : avoid clearing AUX of the main loop which should continue to
1169 : : point to the main loop vinfo since otherwise we'll leak that. */
1170 : 419214 : if (loop->aux == this)
1171 : 56764 : loop->aux = NULL;
1172 : 838428 : }
1173 : :
1174 : : /* Return an invariant or register for EXPR and emit necessary
1175 : : computations in the LOOP_VINFO loop preheader. */
1176 : :
1177 : : tree
1178 : 19209 : cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1179 : : {
1180 : 19209 : if (is_gimple_reg (expr)
1181 : 19209 : || is_gimple_min_invariant (expr))
1182 : 6324 : return expr;
1183 : :
1184 : 12885 : if (! loop_vinfo->ivexpr_map)
1185 : 3764 : loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1186 : 12885 : tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1187 : 12885 : if (! cached)
1188 : : {
1189 : 8388 : gimple_seq stmts = NULL;
1190 : 8388 : cached = force_gimple_operand (unshare_expr (expr),
1191 : : &stmts, true, NULL_TREE);
1192 : 8388 : if (stmts)
1193 : : {
1194 : 8246 : edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1195 : 8246 : gsi_insert_seq_on_edge_immediate (e, stmts);
1196 : : }
1197 : : }
1198 : 12885 : return cached;
1199 : : }
1200 : :
1201 : : /* Return true if we can use CMP_TYPE as the comparison type to produce
1202 : : all masks required to mask LOOP_VINFO. */
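/* Roughly speaking, .WHILE_ULT (START, END, ...) produces a mask whose
   lane J is active iff START + J < END, so it is enough to ask whether the
   target supports that operation for CMP_TYPE and each required mask type,
   which is what the loop below does.  */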
1203 : :
1204 : : static bool
1205 : 97 : can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1206 : : {
1207 : 97 : rgroup_controls *rgm;
1208 : 97 : unsigned int i;
1209 : 110 : FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1210 : 110 : if (rgm->type != NULL_TREE
1211 : 110 : && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1212 : : cmp_type, rgm->type,
1213 : : OPTIMIZE_FOR_SPEED))
1214 : : return false;
1215 : : return true;
1216 : : }
1217 : :
1218 : : /* Calculate the maximum number of scalars per iteration for every
1219 : : rgroup in LOOP_VINFO. */
1220 : :
1221 : : static unsigned int
1222 : 23 : vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1223 : : {
1224 : 23 : unsigned int res = 1;
1225 : 23 : unsigned int i;
1226 : 23 : rgroup_controls *rgm;
1227 : 67 : FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1228 : 44 : res = MAX (res, rgm->max_nscalars_per_iter);
1229 : 23 : return res;
1230 : : }
1231 : :
1232 : : /* Calculate the minimum precision necessary to represent:
1233 : :
1234 : : MAX_NITERS * FACTOR
1235 : :
1236 : : as an unsigned integer, where MAX_NITERS is the maximum number of
1237 : : loop header iterations for the original scalar form of LOOP_VINFO. */
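/* For example (numbers purely illustrative): if the scalar loop runs at
   most 1000 iterations and FACTOR is 4, the product 4000 needs 12 bits
   when represented as an unsigned integer, so 12 would be returned.  */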
1238 : :
1239 : : static unsigned
1240 : 23 : vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1241 : : {
1242 : 23 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1243 : :
1244 : : /* Get the maximum number of iterations that is representable
1245 : : in the counter type. */
1246 : 23 : tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1247 : 23 : widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1248 : :
1249 : : /* Get a more refined estimate for the number of iterations. */
1250 : 23 : widest_int max_back_edges;
1251 : 23 : if (max_loop_iterations (loop, &max_back_edges))
1252 : 23 : max_ni = wi::smin (max_ni, max_back_edges + 1);
1253 : :
1254 : : /* Work out how many bits we need to represent the limit. */
1255 : 23 : return wi::min_precision (max_ni * factor, UNSIGNED);
1256 : 23 : }
1257 : :
1258 : : /* True if the loop needs peeling or partial vectors when vectorized. */
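/* For example (illustrative numbers): with 100 known iterations, no
   peeling for alignment or gaps, and a vectorization factor of 8, the
   4 leftover iterations (100 mod 8) require an epilogue or partial
   vectors, whereas 96 iterations would need neither.  */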
1259 : :
1260 : : static bool
1261 : 122847 : vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1262 : : {
1263 : 122847 : unsigned HOST_WIDE_INT const_vf;
1264 : 122847 : HOST_WIDE_INT max_niter
1265 : 122847 : = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1266 : :
1267 : 122847 : unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1268 : 122847 : if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1269 : 25992 : th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1270 : : (loop_vinfo));
1271 : :
1272 : 122847 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1273 : 53941 : && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1274 : : {
1275 : : /* Work out the (constant) number of iterations that need to be
1276 : : peeled for reasons other than niters. */
1277 : 53926 : unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1278 : 53926 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1279 : 498 : peel_niter += 1;
1280 : 121817 : if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1281 : : LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1282 : : return true;
1283 : : }
1284 : 68921 : else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1285 : : /* ??? When peeling for gaps but not alignment, we could
1286 : : try to check whether the (variable) niters is known to be
1287 : : VF * N + 1. That's something of a niche case though. */
1288 : 68750 : || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1289 : 67760 : || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1290 : 136681 : || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1291 : 135520 : < (unsigned) exact_log2 (const_vf))
1292 : : /* In case of versioning, check if the maximum number of
1293 : : iterations is greater than th. If they are identical,
1294 : : the epilogue is unnecessary. */
1295 : 66750 : && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1296 : 4286 : || ((unsigned HOST_WIDE_INT) max_niter
1297 : : /* We'd like to use LOOP_VINFO_VERSIONING_THRESHOLD
1298 : : but that's only computed later based on our result.
1299 : : The following is the most conservative approximation. */
1300 : 4286 : > (std::max ((unsigned HOST_WIDE_INT) th,
1301 : 4286 : const_vf) / const_vf) * const_vf))))
1302 : 67891 : return true;
1303 : :
1304 : : return false;
1305 : : }
1306 : :
1307 : : /* Each statement in LOOP_VINFO can be masked where necessary. Check
1308 : : whether we can actually generate the masks required. Return true if so,
1309 : : storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
1310 : :
1311 : : static bool
1312 : 23 : vect_verify_full_masking (loop_vec_info loop_vinfo)
1313 : : {
1314 : 23 : unsigned int min_ni_width;
1315 : :
1316 : : /* Use a normal loop if there are no statements that need masking.
1317 : : This only happens in rare degenerate cases: it means that the loop
1318 : : has no loads, no stores, and no live-out values. */
1319 : 23 : if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1320 : : return false;
1321 : :
1322 : : /* Produce the rgroup controls. */
1323 : 81 : for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1324 : : {
1325 : 29 : vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1326 : 29 : tree vectype = mask.first;
1327 : 29 : unsigned nvectors = mask.second;
1328 : :
1329 : 35 : if (masks->rgc_vec.length () < nvectors)
1330 : 26 : masks->rgc_vec.safe_grow_cleared (nvectors, true);
1331 : 29 : rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
1332 : : /* The number of scalars per iteration and the number of vectors are
1333 : : both compile-time constants. */
1334 : 29 : unsigned int nscalars_per_iter
1335 : 29 : = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1336 : 29 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1337 : :
1338 : 29 : if (rgm->max_nscalars_per_iter < nscalars_per_iter)
1339 : : {
1340 : 29 : rgm->max_nscalars_per_iter = nscalars_per_iter;
1341 : 29 : rgm->type = truth_type_for (vectype);
1342 : 29 : rgm->factor = 1;
1343 : : }
1344 : : }
1345 : :
1346 : 23 : unsigned int max_nscalars_per_iter
1347 : 23 : = vect_get_max_nscalars_per_iter (loop_vinfo);
1348 : :
1349 : : /* Work out how many bits we need to represent the limit. */
1350 : 23 : min_ni_width
1351 : 23 : = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1352 : :
1353 : : /* Find a scalar mode for which WHILE_ULT is supported. */
1354 : 23 : opt_scalar_int_mode cmp_mode_iter;
1355 : 23 : tree cmp_type = NULL_TREE;
1356 : 23 : tree iv_type = NULL_TREE;
1357 : 23 : widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1358 : 23 : unsigned int iv_precision = UINT_MAX;
1359 : :
1360 : 23 : if (iv_limit != -1)
1361 : 23 : iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1362 : : UNSIGNED);
1363 : :
1364 : 184 : FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1365 : : {
1366 : 161 : unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1367 : 161 : if (cmp_bits >= min_ni_width
1368 : 161 : && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1369 : : {
1370 : 97 : tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1371 : 97 : if (this_type
1372 : 97 : && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1373 : : {
1374 : : /* Although we could stop as soon as we find a valid mode,
1375 : : there are at least two reasons why that's not always the
1376 : : best choice:
1377 : :
1378 : : - An IV that's Pmode or wider is more likely to be reusable
1379 : : in address calculations than an IV that's narrower than
1380 : : Pmode.
1381 : :
1382 : : - Doing the comparison in IV_PRECISION or wider allows
1383 : : a natural 0-based IV, whereas using a narrower comparison
1384 : : type requires mitigations against wrap-around.
1385 : :
1386 : : Conversely, if the IV limit is variable, doing the comparison
1387 : : in a wider type than the original type can introduce
1388 : : unnecessary extensions, so picking the widest valid mode
1389 : : is not always a good choice either.
1390 : :
1391 : : Here we prefer the first IV type that's Pmode or wider,
1392 : : and the first comparison type that's IV_PRECISION or wider.
1393 : : (The comparison type must be no wider than the IV type,
1394 : : to avoid extensions in the vector loop.)
1395 : :
1396 : : ??? We might want to try continuing beyond Pmode for ILP32
1397 : : targets if CMP_BITS < IV_PRECISION. */
1398 : 0 : iv_type = this_type;
1399 : 0 : if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1400 : : cmp_type = this_type;
1401 : 0 : if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1402 : : break;
1403 : : }
1404 : : }
1405 : : }
1406 : :
1407 : 23 : if (!cmp_type)
1408 : : {
1409 : 23 : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
1410 : 23 : return false;
1411 : : }
1412 : :
1413 : 0 : LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1414 : 0 : LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1415 : 0 : LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
1416 : 0 : return true;
1417 : 23 : }
1418 : :
1419 : : /* Each statement in LOOP_VINFO can be masked where necessary. Check
1420 : : whether we can actually generate AVX512 style masks. Return true if so,
1421 : : storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE. */
1422 : :
1423 : : static bool
1424 : 23 : vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
1425 : : {
1426 : : /* Produce a differently organized rgc_vec and check differently
1427 : : whether we can produce the masks. */
1428 : :
1429 : : /* Use a normal loop if there are no statements that need masking.
1430 : : This only happens in rare degenerate cases: it means that the loop
1431 : : has no loads, no stores, and no live-out values. */
1432 : 23 : if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1433 : : return false;
1434 : :
1435 : : /* For the decrementing IV we need to represent all values in
1436 : : [0, niter + niter_skip] where niter_skip is the elements we
1437 : : skip in the first iteration for prologue peeling. */
1438 : 23 : tree iv_type = NULL_TREE;
1439 : 23 : widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1440 : 23 : unsigned int iv_precision = UINT_MAX;
1441 : 23 : if (iv_limit != -1)
1442 : 23 : iv_precision = wi::min_precision (iv_limit, UNSIGNED);
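 : : /* For example, an iv_limit of 100 needs wi::min_precision (100,
 : : UNSIGNED) == 7 bits, so the first supported integer mode of at least
 : : 7 bits (typically QImode) provides the IV type below. */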
1443 : :
1444 : : /* First compute the type for the IV we use to track the remaining
1445 : : scalar iterations. */
1446 : 23 : opt_scalar_int_mode cmp_mode_iter;
1447 : 41 : FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1448 : : {
1449 : 41 : unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1450 : 41 : if (cmp_bits >= iv_precision
1451 : 41 : && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1452 : : {
1453 : 23 : iv_type = build_nonstandard_integer_type (cmp_bits, true);
1454 : 23 : if (iv_type)
1455 : : break;
1456 : : }
1457 : : }
1458 : 23 : if (!iv_type)
1459 : : return false;
1460 : :
1461 : : /* Produce the rgroup controls. */
1462 : 81 : for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1463 : : {
1464 : 29 : vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1465 : 29 : tree vectype = mask.first;
1466 : 29 : unsigned nvectors = mask.second;
1467 : :
1468 : : /* The number of scalars per iteration and the number of vectors are
1469 : : both compile-time constants. */
1470 : 29 : unsigned int nscalars_per_iter
1471 : 29 : = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1472 : 29 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1473 : :
1474 : : /* We index the rgroup_controls vector with nscalars_per_iter,
1475 : : which we keep constant, and instead have a varying nvectors,
1476 : : remembering the vector mask with the fewest nV. */
1477 : 35 : if (masks->rgc_vec.length () < nscalars_per_iter)
1478 : 23 : masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true);
1479 : 29 : rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];
1480 : :
1481 : 29 : if (!rgm->type || rgm->factor > nvectors)
1482 : : {
1483 : 26 : rgm->type = truth_type_for (vectype);
1484 : 26 : rgm->compare_type = NULL_TREE;
1485 : 26 : rgm->max_nscalars_per_iter = nscalars_per_iter;
1486 : 26 : rgm->factor = nvectors;
1487 : 26 : rgm->bias_adjusted_ctrl = NULL_TREE;
1488 : : }
1489 : : }
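 : : /* Worked example (hypothetical): with VF 16, a group of two V8HI masks
 : : and a group of one V16QI mask both have nscalars_per_iter == 1 and so
 : : share rgc_vec[0]; the entry keeps the group with the fewest vectors,
 : : here factor == 1. */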
1490 : :
1491 : : /* There is no fixed compare type we are going to use but we have to
1492 : : be able to get at one for each mask group. */
1493 : 23 : unsigned int min_ni_width
1494 : 23 : = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);
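 : : /* For example, a maximum VF of 16 needs 5 bits, so an 8-bit iv_type
 : : already satisfies min_ni_width and is wide enough to be tried as the
 : : compare element type below. */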
1495 : :
1496 : 23 : bool ok = true;
1497 : 94 : for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1498 : : {
1499 : 25 : tree mask_type = rgc.type;
1500 : 25 : if (!mask_type)
1501 : 2 : continue;
1502 : :
1503 : : /* For now vect_get_loop_mask only supports integer mode masks
1504 : : when we need to split it. */
1505 : 23 : if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_INT
1506 : 23 : || TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
1507 : : {
1508 : : ok = false;
1509 : : break;
1510 : : }
1511 : :
1512 : : /* If iv_type is usable as compare type use that - we can elide the
1513 : : saturation in that case. */
1514 : 23 : if (TYPE_PRECISION (iv_type) >= min_ni_width)
1515 : : {
1516 : 23 : tree cmp_vectype
1517 : 23 : = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
1518 : 23 : if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1519 : 1 : rgc.compare_type = cmp_vectype;
1520 : : }
1521 : 23 : if (!rgc.compare_type)
1522 : 55 : FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1523 : : {
1524 : 55 : unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1525 : 55 : if (cmp_bits >= min_ni_width
1526 : 55 : && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1527 : : {
1528 : 55 : tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
1529 : 55 : if (!cmp_type)
1530 : 0 : continue;
1531 : :
1532 : : /* Check whether we can produce the mask with cmp_type. */
1533 : 55 : tree cmp_vectype
1534 : 55 : = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
1535 : 55 : if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1536 : : {
1537 : 22 : rgc.compare_type = cmp_vectype;
1538 : 22 : break;
1539 : : }
1540 : : }
1541 : : }
1542 : 23 : if (!rgc.compare_type)
1543 : : {
1544 : : ok = false;
1545 : : break;
1546 : : }
1547 : : }
1548 : 23 : if (!ok)
1549 : : {
1550 : 0 : release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
1551 : 0 : return false;
1552 : : }
1553 : :
1554 : 23 : LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
1555 : 23 : LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1556 : 23 : LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
1557 : 23 : return true;
1558 : 23 : }
1559 : :
1560 : : /* Check whether we can use vector access with length based on precision
1561 : : comparison. So far, to keep it simple, we only allow the case that the
1562 : : precision of the target supported length is larger than the precision
1563 : : required by loop niters. */
1564 : :
1565 : : static bool
1566 : 0 : vect_verify_loop_lens (loop_vec_info loop_vinfo)
1567 : : {
1568 : 0 : if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1569 : : return false;
1570 : :
1571 : 0 : if (!VECTOR_MODE_P (loop_vinfo->vector_mode))
1572 : : return false;
1573 : :
1574 : 0 : machine_mode len_load_mode, len_store_mode;
1575 : 0 : if (!get_len_load_store_mode (loop_vinfo->vector_mode, true)
1576 : 0 : .exists (&len_load_mode))
1577 : 0 : return false;
1578 : 0 : if (!get_len_load_store_mode (loop_vinfo->vector_mode, false)
1579 : 0 : .exists (&len_store_mode))
1580 : 0 : return false;
1581 : :
1582 : 0 : signed char partial_load_bias = internal_len_load_store_bias
1583 : 0 : (IFN_LEN_LOAD, len_load_mode);
1584 : :
1585 : 0 : signed char partial_store_bias = internal_len_load_store_bias
1586 : 0 : (IFN_LEN_STORE, len_store_mode);
1587 : :
1588 : 0 : gcc_assert (partial_load_bias == partial_store_bias);
1589 : :
1590 : 0 : if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1591 : : return false;
1592 : :
1593 : : /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1594 : : len_loads with a length of zero. In order to avoid that we prohibit
1595 : : more than one loop length here. */
1596 : 0 : if (partial_load_bias == -1
1597 : 0 : && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1598 : : return false;
1599 : :
1600 : 0 : LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1601 : :
1602 : 0 : unsigned int max_nitems_per_iter = 1;
1603 : 0 : unsigned int i;
1604 : 0 : rgroup_controls *rgl;
1605 : : /* Find the maximum number of items per iteration for every rgroup. */
1606 : 0 : FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1607 : : {
1608 : 0 : unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1609 : 0 : max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1610 : : }
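 : : /* For example, an rgroup with max_nscalars_per_iter == 2 and
 : : factor == 4 contributes 8 items per scalar iteration. */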
1611 : :
1612 : : /* Work out how many bits we need to represent the length limit. */
1613 : 0 : unsigned int min_ni_prec
1614 : 0 : = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1615 : :
1616 : : /* Now use the maximum of the precisions below for one suitable IV type:
1617 : : - the IV's natural precision
1618 : : - the precision needed to hold the maximum number of scalar
1619 : : iterations multiplied by the scale factor (min_ni_prec above)
1620 : : - the Pmode precision
1621 : :
1622 : : If min_ni_prec is less than the precision of the current niters,
1623 : : we prefer to still use the niters type. Prefer to use a Pmode or
1624 : : wider IV to avoid narrow conversions. */
1625 : :
1626 : 0 : unsigned int ni_prec
1627 : 0 : = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1628 : 0 : min_ni_prec = MAX (min_ni_prec, ni_prec);
1629 : 0 : min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
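 : : /* For example, min_ni_prec == 20, a 32-bit niters type and a 64-bit
 : : Pmode give min_ni_prec == 64, so on a 64-bit target the loop below
 : : picks the first supported integer mode of at least 64 bits. */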
1630 : :
1631 : 0 : tree iv_type = NULL_TREE;
1632 : 0 : opt_scalar_int_mode tmode_iter;
1633 : 0 : FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1634 : : {
1635 : 0 : scalar_mode tmode = tmode_iter.require ();
1636 : 0 : unsigned int tbits = GET_MODE_BITSIZE (tmode);
1637 : :
1638 : : /* ??? Do we really want to construct one IV whose precision exceeds
1639 : : BITS_PER_WORD? */
1640 : 0 : if (tbits > BITS_PER_WORD)
1641 : : break;
1642 : :
1643 : : /* Find the first available standard integral type. */
1644 : 0 : if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1645 : : {
1646 : 0 : iv_type = build_nonstandard_integer_type (tbits, true);
1647 : 0 : break;
1648 : : }
1649 : : }
1650 : :
1651 : 0 : if (!iv_type)
1652 : : {
1653 : 0 : if (dump_enabled_p ())
1654 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1655 : : "can't vectorize with length-based partial vectors"
1656 : : " because there is no suitable iv type.\n");
1657 : 0 : return false;
1658 : : }
1659 : :
1660 : 0 : LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1661 : 0 : LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1662 : 0 : LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;
1663 : :
1664 : 0 : return true;
1665 : : }
1666 : :
1667 : : /* Calculate the cost of one scalar iteration of the loop. */
1668 : : static void
1669 : 230942 : vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1670 : : {
1671 : 230942 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1672 : 230942 : basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1673 : 230942 : int nbbs = loop->num_nodes, factor;
1674 : 230942 : int innerloop_iters, i;
1675 : :
1676 : 230942 : DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1677 : :
1678 : : /* Gather costs for statements in the scalar loop. */
1679 : :
1680 : : /* FORNOW. */
1681 : 230942 : innerloop_iters = 1;
1682 : 230942 : if (loop->inner)
1683 : 1184 : innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1684 : :
1685 : 804484 : for (i = 0; i < nbbs; i++)
1686 : : {
1687 : 573542 : gimple_stmt_iterator si;
1688 : 573542 : basic_block bb = bbs[i];
1689 : :
1690 : 573542 : if (bb->loop_father == loop->inner)
1691 : : factor = innerloop_iters;
1692 : : else
1693 : 571174 : factor = 1;
1694 : :
1695 : 4553405 : for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1696 : : {
1697 : 3406321 : gimple *stmt = gsi_stmt (si);
1698 : 3406321 : stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1699 : :
1700 : 3406321 : if (!is_gimple_assign (stmt)
1701 : : && !is_gimple_call (stmt)
1702 : : && !is_a<gcond *> (stmt))
1703 : 1039814 : continue;
1704 : :
1705 : : /* Skip stmts that are not vectorized inside the loop. */
1706 : 2366507 : stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1707 : 2366507 : if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1708 : 1053980 : && (!STMT_VINFO_LIVE_P (vstmt_info)
1709 : 77 : || !VECTORIZABLE_CYCLE_DEF
1710 : : (STMT_VINFO_DEF_TYPE (vstmt_info))))
1711 : 1053980 : continue;
1712 : :
1713 : 1312527 : vect_cost_for_stmt kind;
1714 : 1312527 : if (STMT_VINFO_DATA_REF (stmt_info))
1715 : : {
1716 : 560643 : if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1717 : : kind = scalar_load;
1718 : : else
1719 : 206470 : kind = scalar_store;
1720 : : }
1721 : 751884 : else if (vect_nop_conversion_p (stmt_info))
1722 : 35436 : continue;
1723 : : else
1724 : : kind = scalar_stmt;
1725 : :
1726 : : /* We are using vect_prologue here to avoid scaling twice
1727 : : by the inner loop factor. */
1728 : 1277091 : record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1729 : : factor, kind, stmt_info, 0, vect_prologue);
1730 : : }
1731 : : }
1732 : :
1733 : : /* Now accumulate cost. */
1734 : 230942 : loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1735 : 230942 : add_stmt_costs (loop_vinfo->scalar_costs,
1736 : : &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1737 : 230942 : loop_vinfo->scalar_costs->finish_cost (nullptr);
1738 : 230942 : }
1739 : :
1740 : : /* Function vect_analyze_loop_form.
1741 : :
1742 : : Verify that certain CFG restrictions hold, including:
1743 : : - the loop has a pre-header
1744 : : - the loop has a single entry
1745 : : - nested loops can have only a single exit.
1746 : : - the loop exit condition is simple enough
1747 : : - the number of iterations can be analyzed, i.e., a countable loop. The
1748 : : niter could be analyzed under some assumptions. */
1749 : :
1750 : : opt_result
1751 : 456723 : vect_analyze_loop_form (class loop *loop, gimple *loop_vectorized_call,
1752 : : vect_loop_form_info *info)
1753 : : {
1754 : 456723 : DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1755 : :
1756 : 456723 : edge exit_e = vec_init_loop_exit_info (loop);
1757 : 456723 : if (!exit_e)
1758 : 56982 : return opt_result::failure_at (vect_location,
1759 : : "not vectorized:"
1760 : : " could not determine main exit from"
1761 : : " loop with multiple exits.\n");
1762 : 399741 : if (loop_vectorized_call)
1763 : : {
1764 : 25863 : tree arg = gimple_call_arg (loop_vectorized_call, 1);
1765 : 25863 : class loop *scalar_loop = get_loop (cfun, tree_to_shwi (arg));
1766 : 25863 : edge scalar_exit_e = vec_init_loop_exit_info (scalar_loop);
1767 : 25863 : if (!scalar_exit_e)
1768 : 0 : return opt_result::failure_at (vect_location,
1769 : : "not vectorized:"
1770 : : " could not determine main exit from"
1771 : : " loop with multiple exits.\n");
1772 : : }
1773 : :
1774 : 399741 : info->loop_exit = exit_e;
1775 : 399741 : if (dump_enabled_p ())
1776 : 14828 : dump_printf_loc (MSG_NOTE, vect_location,
1777 : : "using as main loop exit: %d -> %d [AUX: %p]\n",
1778 : 14828 : exit_e->src->index, exit_e->dest->index, exit_e->aux);
1779 : :
1780 : : /* Check if we have any control flow that doesn't leave the loop. */
1781 : 399741 : basic_block *bbs = get_loop_body (loop);
1782 : 1341868 : for (unsigned i = 0; i < loop->num_nodes; i++)
1783 : 1049761 : if (EDGE_COUNT (bbs[i]->succs) != 1
1784 : 1049761 : && (EDGE_COUNT (bbs[i]->succs) != 2
1785 : 620830 : || !loop_exits_from_bb_p (bbs[i]->loop_father, bbs[i])))
1786 : : {
1787 : 107634 : free (bbs);
1788 : 107634 : return opt_result::failure_at (vect_location,
1789 : : "not vectorized:"
1790 : : " unsupported control flow in loop.\n");
1791 : : }
1792 : 292107 : free (bbs);
1793 : :
1794 : : /* Different restrictions apply when we are considering an inner-most loop,
1795 : : vs. an outer (nested) loop.
1796 : : (FORNOW. May want to relax some of these restrictions in the future). */
1797 : :
1798 : 292107 : info->inner_loop_cond = NULL;
1799 : 292107 : if (!loop->inner)
1800 : : {
1801 : : /* Inner-most loop. */
1802 : :
1803 : 271105 : if (empty_block_p (loop->header))
1804 : 3 : return opt_result::failure_at (vect_location,
1805 : : "not vectorized: empty loop.\n");
1806 : : }
1807 : : else
1808 : : {
1809 : 21002 : class loop *innerloop = loop->inner;
1810 : 21002 : edge entryedge;
1811 : :
1812 : : /* Nested loop. We currently require that the loop is doubly-nested,
1813 : : contains a single inner loop with a single exit to the block
1814 : : with the single exit condition in the outer loop.
1815 : : Vectorizable outer-loops look like this:
1816 : :
1817 : : (pre-header)
1818 : : |
1819 : : header <---+
1820 : : | |
1821 : : inner-loop |
1822 : : | |
1823 : : tail ------+
1824 : : |
1825 : : (exit-bb)
1826 : :
1827 : : The inner-loop also has the properties expected of inner-most loops
1828 : : as described above. */
1829 : :
1830 : 21002 : if ((loop->inner)->inner || (loop->inner)->next)
1831 : 2969 : return opt_result::failure_at (vect_location,
1832 : : "not vectorized:"
1833 : : " multiple nested loops.\n");
1834 : :
1835 : 18033 : entryedge = loop_preheader_edge (innerloop);
1836 : 18033 : if (entryedge->src != loop->header
1837 : 17688 : || !single_exit (innerloop)
1838 : 29129 : || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1839 : 7219 : return opt_result::failure_at (vect_location,
1840 : : "not vectorized:"
1841 : : " unsupported outerloop form.\n");
1842 : :
1843 : : /* Analyze the inner-loop. */
1844 : 10814 : vect_loop_form_info inner;
1845 : 10814 : opt_result res = vect_analyze_loop_form (loop->inner, NULL, &inner);
1846 : 10814 : if (!res)
1847 : : {
1848 : 1188 : if (dump_enabled_p ())
1849 : 5 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1850 : : "not vectorized: Bad inner loop.\n");
1851 : 1188 : return res;
1852 : : }
1853 : :
1854 : : /* Don't support analyzing niter under assumptions for inner
1855 : : loop. */
1856 : 9626 : if (!integer_onep (inner.assumptions))
1857 : 287 : return opt_result::failure_at (vect_location,
1858 : : "not vectorized: Bad inner loop.\n");
1859 : :
1860 : 9339 : if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1861 : 1083 : return opt_result::failure_at (vect_location,
1862 : : "not vectorized: inner-loop count not"
1863 : : " invariant.\n");
1864 : :
1865 : 8256 : if (dump_enabled_p ())
1866 : 924 : dump_printf_loc (MSG_NOTE, vect_location,
1867 : : "Considering outer-loop vectorization.\n");
1868 : 8256 : info->inner_loop_cond = inner.conds[0];
1869 : 10814 : }
1870 : :
1871 : 279358 : if (EDGE_COUNT (loop->header->preds) != 2)
1872 : 0 : return opt_result::failure_at (vect_location,
1873 : : "not vectorized:"
1874 : : " too many incoming edges.\n");
1875 : :
1876 : : /* We assume that the latch is empty. */
1877 : 279358 : basic_block latch = loop->latch;
1878 : 279358 : do
1879 : : {
1880 : 279358 : if (!empty_block_p (latch)
1881 : 279358 : || !gimple_seq_empty_p (phi_nodes (latch)))
1882 : 19988 : return opt_result::failure_at (vect_location,
1883 : : "not vectorized: latch block not "
1884 : : "empty.\n");
1885 : 259370 : latch = single_pred (latch);
1886 : : }
1887 : 518740 : while (single_succ_p (latch));
1888 : :
1889 : : /* Make sure there is no abnormal exit. */
1890 : 259370 : auto_vec<edge> exits = get_loop_exit_edges (loop);
1891 : 1145709 : for (edge e : exits)
1892 : : {
1893 : 367643 : if (e->flags & EDGE_ABNORMAL)
1894 : 44 : return opt_result::failure_at (vect_location,
1895 : : "not vectorized:"
1896 : : " abnormal loop exit edge.\n");
1897 : : }
1898 : :
1899 : 259326 : info->conds
1900 : 259326 : = vect_get_loop_niters (loop, exit_e, &info->assumptions,
1901 : : &info->number_of_iterations,
1902 : 259326 : &info->number_of_iterationsm1);
1903 : 259326 : if (info->conds.is_empty ())
1904 : 32 : return opt_result::failure_at
1905 : 32 : (vect_location,
1906 : : "not vectorized: complicated exit condition.\n");
1907 : :
1908 : : /* Determine what the primary and alternate exit conds are. */
1909 : 616707 : for (unsigned i = 0; i < info->conds.length (); i++)
1910 : : {
1911 : 357413 : gcond *cond = info->conds[i];
1912 : 357413 : if (exit_e->src == gimple_bb (cond))
1913 : 259294 : std::swap (info->conds[0], info->conds[i]);
1914 : : }
1915 : :
1916 : 259294 : if (integer_zerop (info->assumptions)
1917 : 259294 : || !info->number_of_iterations
1918 : 518588 : || chrec_contains_undetermined (info->number_of_iterations))
1919 : 40180 : return opt_result::failure_at
1920 : 40180 : (info->conds[0],
1921 : : "not vectorized: number of iterations cannot be computed.\n");
1922 : :
1923 : 219114 : if (integer_zerop (info->number_of_iterations))
1924 : 14 : return opt_result::failure_at
1925 : 14 : (info->conds[0],
1926 : : "not vectorized: number of iterations = 0.\n");
1927 : :
1928 : 219100 : if (!(tree_fits_shwi_p (info->number_of_iterations)
1929 : 118578 : && tree_to_shwi (info->number_of_iterations) > 0))
1930 : : {
1931 : 100522 : if (dump_enabled_p ())
1932 : : {
1933 : 2259 : dump_printf_loc (MSG_NOTE, vect_location,
1934 : : "Symbolic number of iterations is ");
1935 : 2259 : dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1936 : 2259 : dump_printf (MSG_NOTE, "\n");
1937 : : }
1938 : : }
1939 : :
1940 : 219100 : return opt_result::success ();
1941 : 259370 : }
1942 : :
1943 : : /* Create a loop_vec_info for LOOP with SHARED and the
1944 : : vect_analyze_loop_form result. */
1945 : :
1946 : : loop_vec_info
1947 : 419214 : vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1948 : : const vect_loop_form_info *info,
1949 : : loop_vec_info orig_loop_info)
1950 : : {
1951 : 419214 : loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1952 : 419214 : LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1953 : 419214 : LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1954 : 419214 : LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1955 : 419214 : LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_info;
1956 : 419214 : if (orig_loop_info && LOOP_VINFO_EPILOGUE_P (orig_loop_info))
1957 : 141 : LOOP_VINFO_MAIN_LOOP_INFO (loop_vinfo)
1958 : 141 : = LOOP_VINFO_MAIN_LOOP_INFO (orig_loop_info);
1959 : : else
1960 : 419073 : LOOP_VINFO_MAIN_LOOP_INFO (loop_vinfo) = orig_loop_info;
1961 : : /* Also record the assumptions for versioning. */
1962 : 419214 : if (!integer_onep (info->assumptions) && !orig_loop_info)
1963 : 21336 : LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1964 : :
1965 : 1891860 : for (gcond *cond : info->conds)
1966 : : {
1967 : 634218 : stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (cond);
1968 : 634218 : STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1969 : : /* Mark the statement as a condition. */
1970 : 634218 : STMT_VINFO_DEF_TYPE (loop_cond_info) = vect_condition_def;
1971 : : }
1972 : :
1973 : 634218 : for (unsigned i = 1; i < info->conds.length (); i ++)
1974 : 215004 : LOOP_VINFO_LOOP_CONDS (loop_vinfo).safe_push (info->conds[i]);
1975 : 419214 : LOOP_VINFO_LOOP_IV_COND (loop_vinfo) = info->conds[0];
1976 : :
1977 : 419214 : LOOP_VINFO_IV_EXIT (loop_vinfo) = info->loop_exit;
1978 : :
1979 : : /* Check to see if we're vectorizing multiple exits. */
1980 : 419214 : LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
1981 : 419214 : = !LOOP_VINFO_LOOP_CONDS (loop_vinfo).is_empty ();
1982 : :
1983 : 419214 : if (info->inner_loop_cond)
1984 : : {
1985 : 8445 : stmt_vec_info inner_loop_cond_info
1986 : 8445 : = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1987 : 8445 : STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1988 : : /* If we have an estimate on the number of iterations of the inner
1989 : : loop use that to limit the scale for costing, otherwise use
1990 : : --param vect-inner-loop-cost-factor literally. */
1991 : 8445 : widest_int nit;
1992 : 8445 : if (estimated_stmt_executions (loop->inner, &nit))
1993 : 7218 : LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1994 : 7218 : = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1995 : 8445 : }
1996 : :
1997 : 419214 : return loop_vinfo;
1998 : : }
1999 : :
2000 : :
2001 : :
2002 : : /* Scan the loop stmts and dependent on whether there are any (non-)SLP
2003 : : statements update the vectorization factor. */
2004 : :
2005 : : static void
2006 : 354890 : vect_update_vf_for_slp (loop_vec_info loop_vinfo)
2007 : : {
2008 : 354890 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2009 : 354890 : basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2010 : 354890 : int nbbs = loop->num_nodes;
2011 : 354890 : poly_uint64 vectorization_factor;
2012 : 354890 : int i;
2013 : :
2014 : 354890 : DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
2015 : :
2016 : 354890 : vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2017 : 354890 : gcc_assert (known_ne (vectorization_factor, 0U));
2018 : :
2019 : : /* If all the stmts in the loop can be SLPed, we perform only SLP, and
2020 : : the vectorization factor of the loop is the unrolling factor required
2021 : : by the SLP instances. If that unrolling factor is 1, we say that we
2022 : : perform pure SLP on the loop - cross-iteration parallelism is not
2023 : : exploited. */
2024 : : bool only_slp_in_loop = true;
2025 : 1281054 : for (i = 0; i < nbbs; i++)
2026 : : {
2027 : 926164 : basic_block bb = bbs[i];
2028 : 1820052 : for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
2029 : 893888 : gsi_next (&si))
2030 : : {
2031 : 893888 : stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
2032 : 893888 : if (!stmt_info)
2033 : 0 : continue;
2034 : 893888 : if ((STMT_VINFO_RELEVANT_P (stmt_info)
2035 : 452497 : || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2036 : 441403 : && !PURE_SLP_STMT (stmt_info))
2037 : : /* STMT needs both SLP and loop-based vectorization. */
2038 : 893888 : only_slp_in_loop = false;
2039 : : }
2040 : 7041085 : for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2041 : 5188757 : gsi_next (&si))
2042 : : {
2043 : 5188757 : if (is_gimple_debug (gsi_stmt (si)))
2044 : 1683688 : continue;
2045 : 3505069 : stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2046 : 3505069 : stmt_info = vect_stmt_to_vectorize (stmt_info);
2047 : 3505069 : if ((STMT_VINFO_RELEVANT_P (stmt_info)
2048 : 1481600 : || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2049 : 2023473 : && !PURE_SLP_STMT (stmt_info))
2050 : : /* STMT needs both SLP and loop-based vectorization. */
2051 : 5188757 : only_slp_in_loop = false;
2052 : : }
2053 : : }
2054 : :
2055 : 354890 : if (only_slp_in_loop)
2056 : : {
2057 : 349674 : if (dump_enabled_p ())
2058 : 19966 : dump_printf_loc (MSG_NOTE, vect_location,
2059 : : "Loop contains only SLP stmts\n");
2060 : 349674 : vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
2061 : : }
2062 : : else
2063 : : {
2064 : 5216 : if (dump_enabled_p ())
2065 : 252 : dump_printf_loc (MSG_NOTE, vect_location,
2066 : : "Loop contains SLP and non-SLP stmts\n");
2067 : : /* Both the vectorization factor and unroll factor have the form
2068 : : GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
2069 : : so they must have a common multiple. */
2070 : 5216 : vectorization_factor
2071 : 5216 : = force_common_multiple (vectorization_factor,
2072 : 5216 : LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
2073 : : }
2074 : :
2075 : 354890 : LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
2076 : 354890 : if (dump_enabled_p ())
2077 : : {
2078 : 20218 : dump_printf_loc (MSG_NOTE, vect_location,
2079 : : "Updating vectorization factor to ");
2080 : 20218 : dump_dec (MSG_NOTE, vectorization_factor);
2081 : 20218 : dump_printf (MSG_NOTE, ".\n");
2082 : : }
2083 : 354890 : }
2084 : :
2085 : : /* Return true if STMT_INFO describes a double reduction phi and if
2086 : : the other phi in the reduction is also relevant for vectorization.
2087 : : This rejects cases such as:
2088 : :
2089 : : outer1:
2090 : : x_1 = PHI <x_3(outer2), ...>;
2091 : : ...
2092 : :
2093 : : inner:
2094 : : x_2 = ...;
2095 : : ...
2096 : :
2097 : : outer2:
2098 : : x_3 = PHI <x_2(inner)>;
2099 : :
2100 : : if nothing in x_2 or elsewhere makes x_1 relevant. */
2101 : :
2102 : : static bool
2103 : 157 : vect_active_double_reduction_p (stmt_vec_info stmt_info)
2104 : : {
2105 : 157 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2106 : : return false;
2107 : :
2108 : 0 : return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
2109 : : }
2110 : :
2111 : : /* Function vect_analyze_loop_operations.
2112 : :
2113 : : Scan the loop stmts and make sure they are all vectorizable. */
2114 : :
2115 : : static opt_result
2116 : 117734 : vect_analyze_loop_operations (loop_vec_info loop_vinfo)
2117 : : {
2118 : 117734 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2119 : 117734 : basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2120 : 117734 : int nbbs = loop->num_nodes;
2121 : 117734 : int i;
2122 : 117734 : stmt_vec_info stmt_info;
2123 : :
2124 : 117734 : DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
2125 : :
2126 : 354751 : for (i = 0; i < nbbs; i++)
2127 : : {
2128 : 238413 : basic_block bb = bbs[i];
2129 : :
2130 : 575163 : for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
2131 : 336750 : gsi_next (&si))
2132 : : {
2133 : 338060 : gphi *phi = si.phi ();
2134 : :
2135 : 338060 : stmt_info = loop_vinfo->lookup_stmt (phi);
2136 : 338060 : if (dump_enabled_p ())
2137 : 40030 : dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
2138 : : (gimple *) phi);
2139 : 676120 : if (virtual_operand_p (gimple_phi_result (phi)))
2140 : 92813 : continue;
2141 : :
2142 : : /* ??? All of the below unconditional FAILs should be
2143 : : done earlier after analyzing cycles, possibly when
2144 : : determining stmt relevancy? */
2145 : :
2146 : : /* Inner-loop loop-closed exit phi in outer-loop vectorization
2147 : : (i.e., a phi in the tail of the outer-loop). */
2148 : 245247 : if (! is_loop_header_bb_p (bb))
2149 : : {
2150 : : /* FORNOW: we currently don't support the case that these phis
2151 : : are not used in the outerloop (unless it is double reduction,
2152 : : i.e., this phi is vect_reduction_def), because this case
2153 : : requires us to actually do something here. */
2154 : 769 : if (STMT_VINFO_LIVE_P (stmt_info)
2155 : 872 : && !vect_active_double_reduction_p (stmt_info))
2156 : 54 : return opt_result::failure_at (phi,
2157 : : "Unsupported loop-closed phi"
2158 : : " in outer-loop.\n");
2159 : :
2160 : : /* If PHI is used in the outer loop, we check that its operand
2161 : : is defined in the inner loop. */
2162 : 715 : if (STMT_VINFO_RELEVANT_P (stmt_info))
2163 : : {
2164 : 711 : tree phi_op;
2165 : :
2166 : 711 : if (gimple_phi_num_args (phi) != 1)
2167 : 0 : return opt_result::failure_at (phi, "unsupported phi");
2168 : :
2169 : 711 : phi_op = PHI_ARG_DEF (phi, 0);
2170 : 711 : stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
2171 : 711 : if (!op_def_info)
2172 : 0 : return opt_result::failure_at (phi, "unsupported phi\n");
2173 : :
2174 : 711 : if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
2175 : 711 : && (STMT_VINFO_RELEVANT (op_def_info)
2176 : : != vect_used_in_outer_by_reduction))
2177 : 240 : return opt_result::failure_at (phi, "unsupported phi\n");
2178 : :
2179 : 471 : if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2180 : 103 : || (STMT_VINFO_DEF_TYPE (stmt_info)
2181 : : == vect_double_reduction_def))
2182 : 471 : && ! PURE_SLP_STMT (stmt_info))
2183 : 0 : return opt_result::failure_at (phi, "unsupported phi\n");
2184 : : }
2185 : :
2186 : 475 : continue;
2187 : 475 : }
2188 : :
2189 : 244478 : gcc_assert (stmt_info);
2190 : :
2191 : 244478 : if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
2192 : 222638 : || STMT_VINFO_LIVE_P (stmt_info))
2193 : 24956 : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
2194 : 184 : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
2195 : : /* A scalar-dependence cycle that we don't support. */
2196 : 0 : return opt_result::failure_at (phi,
2197 : : "not vectorized:"
2198 : : " scalar dependence cycle.\n");
2199 : :
2200 : 244478 : if (STMT_VINFO_RELEVANT_P (stmt_info)
2201 : 75514 : && ! PURE_SLP_STMT (stmt_info))
2202 : 1016 : return opt_result::failure_at (phi,
2203 : : "not vectorized: relevant phi not "
2204 : : "supported: %G",
2205 : : static_cast <gimple *> (phi));
2206 : : }
2207 : :
2208 : 2075246 : for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2209 : 1601040 : gsi_next (&si))
2210 : : {
2211 : 1601126 : gimple *stmt = gsi_stmt (si);
2212 : 1601126 : if (!gimple_clobber_p (stmt)
2213 : 1601126 : && !is_gimple_debug (stmt))
2214 : : {
2215 : 1269250 : bool need_to_vectorize = false;
2216 : 1269250 : opt_result res
2217 : 1269250 : = vect_analyze_stmt (loop_vinfo,
2218 : : loop_vinfo->lookup_stmt (stmt),
2219 : : &need_to_vectorize,
2220 : : NULL, NULL, NULL);
2221 : 1269250 : if (!res)
2222 : 86 : return res;
2223 : : }
2224 : : }
2225 : : } /* bbs */
2226 : :
2227 : 116338 : return opt_result::success ();
2228 : : }
2229 : :
2230 : : /* Return true if we know that the iteration count is smaller than the
2231 : : vectorization factor. Return false if it isn't, or if we can't be sure
2232 : : either way. */
2233 : :
2234 : : static bool
2235 : 104208 : vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
2236 : : {
2237 : 104208 : unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2238 : :
2239 : 104208 : HOST_WIDE_INT max_niter;
2240 : 104208 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2241 : 49866 : max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
2242 : : else
2243 : 54342 : max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2244 : :
2245 : 104208 : if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
2246 : 8029 : return true;
2247 : :
2248 : : return false;
2249 : : }
2250 : :
2251 : : /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
2252 : : is worthwhile to vectorize. Return 1 if definitely yes, 0 if
2253 : : definitely no, or -1 if it's worth retrying. */
2254 : :
2255 : : static int
2256 : 104214 : vect_analyze_loop_costing (loop_vec_info loop_vinfo,
2257 : : unsigned *suggested_unroll_factor)
2258 : : {
2259 : 104214 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2260 : 104214 : unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2261 : :
2262 : : /* Only loops that can handle partially-populated vectors can have iteration
2263 : : counts less than the vectorization factor. */
2264 : 104214 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2265 : 104214 : && vect_known_niters_smaller_than_vf (loop_vinfo))
2266 : : {
2267 : 8021 : if (dump_enabled_p ())
2268 : 222 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2269 : : "not vectorized: iteration count smaller than "
2270 : : "vectorization factor.\n");
2271 : 8021 : return 0;
2272 : : }
2273 : :
2274 : : /* If we know the number of iterations we can do better: for the
2275 : : epilogue we can also decide whether the main loop leaves us
2276 : : with enough iterations, preferring a smaller vector epilogue that is
2277 : : then also possibly used for the case where we skip the vector loop. */
2278 : 96193 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2279 : : {
2280 : 42140 : widest_int scalar_niters
2281 : 42140 : = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
2282 : 42140 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2283 : : {
2284 : 2553 : loop_vec_info orig_loop_vinfo
2285 : : = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2286 : 2553 : loop_vec_info main_loop_vinfo
2287 : : = LOOP_VINFO_MAIN_LOOP_INFO (loop_vinfo);
2288 : 2553 : unsigned lowest_vf
2289 : 2553 : = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
2290 : 2553 : int prolog_peeling = 0;
2291 : 2553 : if (!vect_use_loop_mask_for_alignment_p (main_loop_vinfo))
2292 : 2553 : prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (main_loop_vinfo);
2293 : 2553 : if (prolog_peeling >= 0
2294 : 2553 : && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
2295 : : lowest_vf))
2296 : : {
2297 : 5092 : unsigned gap
2298 : 2546 : = LOOP_VINFO_PEELING_FOR_GAPS (main_loop_vinfo) ? 1 : 0;
2299 : 5092 : scalar_niters = ((scalar_niters - gap - prolog_peeling)
2300 : 5092 : % lowest_vf + gap);
2301 : : }
2302 : : }
2303 : : /* Reject vectorizing for a single scalar iteration, even if
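 : : /* For example (hypothetical numbers), a main-loop VF of 8,
 : : scalar_niters == 103, prolog_peeling == 3 and no peeling for gaps
 : : leave (103 - 3) % 8 == 4 scalar iterations for this epilogue. */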
2304 : : we could in principle implement that using partial vectors. */
2305 : 42140 : unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2306 : 42140 : if (scalar_niters <= peeling_gap + 1)
2307 : : {
2308 : 724 : if (dump_enabled_p ())
2309 : 162 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2310 : : "not vectorized: loop only has a single "
2311 : : "scalar iteration.\n");
2312 : 724 : return 0;
2313 : : }
2314 : :
2315 : 41416 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2316 : : {
2317 : : /* Check that the loop processes at least one full vector. */
2318 : 41407 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2319 : 41407 : if (known_lt (scalar_niters, vf))
2320 : : {
2321 : 354 : if (dump_enabled_p ())
2322 : 289 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2323 : : "loop does not have enough iterations "
2324 : : "to support vectorization.\n");
2325 : 394 : return 0;
2326 : : }
2327 : :
2328 : : /* If we need to peel an extra epilogue iteration to handle data
2329 : : accesses with gaps, check that there are enough scalar iterations
2330 : : available.
2331 : :
2332 : : The check above is redundant with this one when peeling for gaps,
2333 : : but the distinction is useful for diagnostics. */
2334 : 41053 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2335 : 41331 : && known_le (scalar_niters, vf))
2336 : : {
2337 : 40 : if (dump_enabled_p ())
2338 : 9 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2339 : : "loop does not have enough iterations "
2340 : : "to support peeling for gaps.\n");
2341 : 40 : return 0;
2342 : : }
2343 : : }
2344 : 42140 : }
2345 : :
2346 : : /* If using the "very cheap" model, reject cases in which we'd keep
2347 : : a copy of the scalar code (even if we might be able to vectorize it). */
2348 : 95075 : if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2349 : 95075 : && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2350 : 47127 : || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)))
2351 : : {
2352 : 708 : if (dump_enabled_p ())
2353 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2354 : : "some scalar iterations would need to be peeled\n");
2355 : 708 : return 0;
2356 : : }
2357 : :
2358 : 94367 : int min_profitable_iters, min_profitable_estimate;
2359 : 94367 : vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2360 : : &min_profitable_estimate,
2361 : : suggested_unroll_factor);
2362 : :
2363 : 94367 : if (min_profitable_iters < 0)
2364 : : {
2365 : 25019 : if (dump_enabled_p ())
2366 : 18 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2367 : : "not vectorized: vectorization not profitable.\n");
2368 : 25019 : if (dump_enabled_p ())
2369 : 18 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2370 : : "not vectorized: vector version will never be "
2371 : : "profitable.\n");
2372 : 25019 : return -1;
2373 : : }
2374 : :
2375 : 69348 : int min_scalar_loop_bound = (param_min_vect_loop_bound
2376 : 69348 : * assumed_vf);
2377 : :
2378 : : /* Use the cost model only if it is more conservative than user specified
2379 : : threshold. */
2380 : 69348 : unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2381 : : min_profitable_iters);
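 : : /* For example, with --param min-vect-loop-bound=0, assumed_vf == 4 and
 : : min_profitable_iters == 12 the threshold is MAX (0 * 4, 12) == 12
 : : iterations. */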
2382 : :
2383 : 69348 : LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2384 : :
2385 : 35118 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2386 : 104466 : && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2387 : : {
2388 : 381 : if (dump_enabled_p ())
2389 : 1 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2390 : : "not vectorized: vectorization not profitable.\n");
2391 : 381 : if (dump_enabled_p ())
2392 : 1 : dump_printf_loc (MSG_NOTE, vect_location,
2393 : : "not vectorized: iteration count smaller than user "
2394 : : "specified loop bound parameter or minimum profitable "
2395 : : "iterations (whichever is more conservative).\n");
2396 : 381 : return 0;
2397 : : }
2398 : :
2399 : : /* The static profitability threshold min_profitable_estimate includes
2400 : : the cost of having to check at runtime whether the scalar loop
2401 : : should be used instead. If it turns out that we don't need or want
2402 : : such a check, the threshold we should use for the static estimate
2403 : : is simply the point at which the vector loop becomes more profitable
2404 : : than the scalar loop. */
2405 : 68967 : if (min_profitable_estimate > min_profitable_iters
2406 : 14814 : && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
2407 : 14357 : && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2408 : 258 : && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2409 : 69225 : && !vect_apply_runtime_profitability_check_p (loop_vinfo))
2410 : : {
2411 : 6 : if (dump_enabled_p ())
2412 : 2 : dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
2413 : : " choice between the scalar and vector loops\n");
2414 : 6 : min_profitable_estimate = min_profitable_iters;
2415 : : }
2416 : :
2417 : : /* If the vector loop needs multiple iterations to be beneficial then
2418 : : things are probably too close to call, and the conservative thing
2419 : : would be to stick with the scalar code. */
2420 : 68967 : if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2421 : 68967 : && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
2422 : : {
2423 : 8051 : if (dump_enabled_p ())
2424 : 177 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2425 : : "one iteration of the vector loop would be"
2426 : : " more expensive than the equivalent number of"
2427 : : " iterations of the scalar loop\n");
2428 : 8051 : return 0;
2429 : : }
2430 : :
2431 : 60916 : HOST_WIDE_INT estimated_niter;
2432 : :
2433 : : /* If we are vectorizing an epilogue then we know the maximum number of
2434 : : scalar iterations it will cover is at least one lower than the
2435 : : vectorization factor of the main loop. */
2436 : 60916 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2437 : 10196 : estimated_niter
2438 : 10196 : = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
2439 : : else
2440 : : {
2441 : 50720 : estimated_niter = estimated_stmt_executions_int (loop);
2442 : 50720 : if (estimated_niter == -1)
2443 : 19284 : estimated_niter = likely_max_stmt_executions_int (loop);
2444 : : }
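 : : /* For example, for an epilogue of a main loop with VF 16 the
 : : computation above caps estimated_niter at 15 scalar iterations,
 : : regardless of the loop's own profile estimate. */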
2445 : 29480 : if (estimated_niter != -1
2446 : 59343 : && ((unsigned HOST_WIDE_INT) estimated_niter
2447 : 59343 : < MAX (th, (unsigned) min_profitable_estimate)))
2448 : : {
2449 : 4145 : if (dump_enabled_p ())
2450 : 28 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2451 : : "not vectorized: estimated iteration count too "
2452 : : "small.\n");
2453 : 4145 : if (dump_enabled_p ())
2454 : 28 : dump_printf_loc (MSG_NOTE, vect_location,
2455 : : "not vectorized: estimated iteration count smaller "
2456 : : "than specified loop bound parameter or minimum "
2457 : : "profitable iterations (whichever is more "
2458 : : "conservative).\n");
2459 : 4145 : return -1;
2460 : : }
2461 : :
2462 : : return 1;
2463 : : }
2464 : :
2465 : : static opt_result
2466 : 217494 : vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
2467 : : vec<data_reference_p> *datarefs)
2468 : : {
2469 : 658857 : for (unsigned i = 0; i < loop->num_nodes; i++)
2470 : 974406 : for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2471 : 3644620 : !gsi_end_p (gsi); gsi_next (&gsi))
2472 : : {
2473 : 3203257 : gimple *stmt = gsi_stmt (gsi);
2474 : 3203257 : if (is_gimple_debug (stmt))
2475 : 1176937 : continue;
2476 : 2026454 : opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
2477 : : NULL, 0);
2478 : 2026454 : if (!res)
2479 : : {
2480 : 45974 : if (is_gimple_call (stmt) && loop->safelen)
2481 : : {
2482 : 400 : tree fndecl = gimple_call_fndecl (stmt), op;
2483 : 400 : if (fndecl == NULL_TREE
2484 : 400 : && gimple_call_internal_p (stmt, IFN_MASK_CALL))
2485 : : {
2486 : 0 : fndecl = gimple_call_arg (stmt, 0);
2487 : 0 : gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
2488 : 0 : fndecl = TREE_OPERAND (fndecl, 0);
2489 : 0 : gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
2490 : : }
2491 : 400 : if (fndecl != NULL_TREE)
2492 : : {
2493 : 366 : cgraph_node *node = cgraph_node::get (fndecl);
2494 : 366 : if (node != NULL && node->simd_clones != NULL)
2495 : : {
2496 : 135 : unsigned int j, n = gimple_call_num_args (stmt);
2497 : 557 : for (j = 0; j < n; j++)
2498 : : {
2499 : 288 : op = gimple_call_arg (stmt, j);
2500 : 288 : if (DECL_P (op)
2501 : 288 : || (REFERENCE_CLASS_P (op)
2502 : 0 : && get_base_address (op)))
2503 : : break;
2504 : : }
2505 : 135 : op = gimple_call_lhs (stmt);
2506 : : /* Ignore #pragma omp declare simd functions
2507 : : if they don't have data references in the
2508 : : call stmt itself. */
2509 : 269 : if (j == n
2510 : 135 : && !(op
2511 : 124 : && (DECL_P (op)
2512 : 124 : || (REFERENCE_CLASS_P (op)
2513 : 0 : && get_base_address (op)))))
2514 : 134 : continue;
2515 : : }
2516 : : }
2517 : : }
2518 : 45840 : return res;
2519 : : }
2520 : : /* If dependence analysis will give up due to the limit on the
2521 : : number of datarefs, stop here and fail fatally. */
2522 : 3464958 : if (datarefs->length ()
2523 : 1484478 : > (unsigned)param_loop_max_datarefs_for_datadeps)
2524 : 0 : return opt_result::failure_at (stmt, "exceeded param "
2525 : : "loop-max-datarefs-for-datadeps\n");
2526 : : }
2527 : 171654 : return opt_result::success ();
2528 : : }
2529 : :
2530 : : /* Look for SLP-only access groups and turn each individual access into its own
2531 : : group. */
2532 : : static void
2533 : 117734 : vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2534 : : {
2535 : 117734 : unsigned int i;
2536 : 117734 : struct data_reference *dr;
2537 : :
2538 : 117734 : DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2539 : :
2540 : 117734 : vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2541 : 536307 : FOR_EACH_VEC_ELT (datarefs, i, dr)
2542 : : {
2543 : 305650 : gcc_assert (DR_REF (dr));
2544 : 305650 : stmt_vec_info stmt_info
2545 : 305650 : = vect_stmt_to_vectorize (loop_vinfo->lookup_stmt (DR_STMT (dr)));
2546 : :
2547 : : /* Check if the load is a part of an interleaving chain. */
2548 : 305650 : if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2549 : : {
2550 : 96412 : stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2551 : 96412 : dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2552 : 96412 : unsigned int group_size = DR_GROUP_SIZE (first_element);
2553 : :
2554 : : /* Check if SLP-only groups. */
2555 : 96412 : if (!STMT_SLP_TYPE (stmt_info)
2556 : 162 : && STMT_VINFO_SLP_VECT_ONLY (first_element))
2557 : : {
2558 : : /* Dissolve the group. */
2559 : 12 : STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2560 : :
2561 : 12 : stmt_vec_info vinfo = first_element;
2562 : 30 : while (vinfo)
2563 : : {
2564 : 18 : stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2565 : 18 : DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2566 : 18 : DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2567 : 18 : DR_GROUP_SIZE (vinfo) = 1;
2568 : 18 : if (STMT_VINFO_STRIDED_P (first_element)
2569 : : /* We cannot handle stores with gaps. */
2570 : 12 : || DR_IS_WRITE (dr_info->dr))
2571 : : {
2572 : 6 : STMT_VINFO_STRIDED_P (vinfo) = true;
2573 : 6 : DR_GROUP_GAP (vinfo) = 0;
2574 : : }
2575 : : else
2576 : 12 : DR_GROUP_GAP (vinfo) = group_size - 1;
2577 : : /* Duplicate and adjust alignment info, it needs to
2578 : : be present on each group leader, see dr_misalignment. */
2579 : 18 : if (vinfo != first_element)
2580 : : {
2581 : 6 : dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2582 : 6 : dr_info2->target_alignment = dr_info->target_alignment;
2583 : 6 : int misalignment = dr_info->misalignment;
2584 : 6 : if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2585 : : {
2586 : 0 : HOST_WIDE_INT diff
2587 : 0 : = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2588 : 0 : - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2589 : 0 : unsigned HOST_WIDE_INT align_c
2590 : 0 : = dr_info->target_alignment.to_constant ();
2591 : 0 : misalignment = (misalignment + diff) % align_c;
2592 : : }
2593 : 6 : dr_info2->misalignment = misalignment;
2594 : : }
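 : : /* For example, a leader misaligned by 4 against a 16-byte target
 : : alignment, with a member whose DR_INIT is 8 bytes further along,
 : : gets misalignment (4 + 8) % 16 == 12. */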
2595 : : vinfo = next;
2596 : : }
2597 : : }
2598 : : }
2599 : : }
2600 : 117734 : }
2601 : :
2602 : : /* Determine if operating on full vectors for LOOP_VINFO might leave
2603 : : some scalar iterations still to do. If so, decide how we should
2604 : : handle those scalar iterations. The possibilities are:
2605 : :
2606 : : (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2607 : : In this case:
2608 : :
2609 : : LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2610 : : LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2611 : : LOOP_VINFO_PEELING_FOR_NITER == false
2612 : :
2613 : : (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2614 : : to handle the remaining scalar iterations. In this case:
2615 : :
2616 : : LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2617 : : LOOP_VINFO_PEELING_FOR_NITER == true
2618 : :
2619 : : There are two choices:
2620 : :
2621 : : (2a) Consider vectorizing the epilogue loop at the same VF as the
2622 : : main loop, but using partial vectors instead of full vectors.
2623 : : In this case:
2624 : :
2625 : : LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2626 : :
2627 : : (2b) Consider vectorizing the epilogue loop at lower VFs only.
2628 : : In this case:
2629 : :
2630 : : LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2631 : : */
2632 : :
2633 : : opt_result
2634 : 122847 : vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo)
2635 : : {
2636 : : /* Determine whether there would be any scalar iterations left over. */
2637 : 122847 : bool need_peeling_or_partial_vectors_p
2638 : 122847 : = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2639 : :
2640 : : /* Decide whether to vectorize the loop with partial vectors. */
2641 : 122847 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2642 : 122847 : LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2643 : 122847 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2644 : 25 : && LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo))
2645 : 0 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2646 : 122847 : else if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2647 : 25 : && need_peeling_or_partial_vectors_p)
2648 : : {
2649 : : /* For partial-vector-usage=1, try to push the handling of partial
2650 : : vectors to the epilogue, with the main loop continuing to operate
2651 : : on full vectors.
2652 : :
2653 : : If we are unrolling, we also do not want to use partial vectors. This
2654 : : is to avoid the overhead of generating multiple masks and also to
2655 : : avoid having to execute entire iterations of FALSE-masked instructions
2656 : : when dealing with one or fewer full iterations.
2657 : :
2658 : : ??? We could then end up failing to use partial vectors if we
2659 : : decide to peel iterations into a prologue, and if the main loop
2660 : : then ends up processing fewer than VF iterations. */
2661 : 20 : if ((param_vect_partial_vector_usage == 1
2662 : 6 : || loop_vinfo->suggested_unroll_factor > 1)
2663 : 14 : && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2664 : 30 : && !vect_known_niters_smaller_than_vf (loop_vinfo))
2665 : 2 : LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2666 : : else
2667 : 18 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2668 : : }
2669 : :
2670 : 122847 : if (LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo)
2671 : 0 : && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2672 : 0 : return opt_result::failure_at (vect_location,
2673 : : "not vectorized: loop needs but cannot "
2674 : : "use partial vectors\n");
2675 : :
2676 : 122847 : if (dump_enabled_p ())
2677 : 14332 : dump_printf_loc (MSG_NOTE, vect_location,
2678 : : "operating on %s vectors%s.\n",
2679 : 14332 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2680 : : ? "partial" : "full",
2681 : 14332 : LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2682 : : ? " for epilogue loop" : "");
2683 : :
2684 : 122847 : LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2685 : 245694 : = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2686 : 122847 : && need_peeling_or_partial_vectors_p);
2687 : :
2688 : : /* We set LOOP_VINFO_USING_SELECT_VL_P to true before loop vectorization
2689 : : analysis, when we do not yet know whether the loop will be vectorized
2690 : : with partial vectors (for details see tree-vect-loop-manip.cc).
2691 : :
2692 : : However, the SELECT_VL vectorization style should only be applied to
2693 : : partial vectorization, since SELECT_VL is the GIMPLE IR that computes
2694 : : the number of elements to be processed in each iteration.
2695 : :
2696 : : After loop vectorization analysis, clear LOOP_VINFO_USING_SELECT_VL_P
2697 : : if the loop is not vectorized with partial vectors. */
2698 : 122847 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2699 : 122829 : LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
2700 : :
2701 : 122847 : return opt_result::success ();
2702 : : }
2703 : :
2704 : : /* Function vect_analyze_loop_2.
2705 : :
2706 : : Apply a set of analyses on LOOP specified by LOOP_VINFO; the different
2707 : : analyses will record information in some members of LOOP_VINFO. FATAL
2708 : : indicates whether some analysis hit a fatal error. If the non-NULL
2709 : : pointer SUGGESTED_UNROLL_FACTOR is provided, it is intended to be
2710 : : filled with a worked-out suggested unroll factor, while a NULL pointer
2711 : : means the suggested unroll factor is going to be applied.
2712 : : SLP_DONE_FOR_SUGGESTED_UF holds the SLP decision made when the
2713 : : suggested unroll factor was worked out. */
2714 : : static opt_result
2715 : 418496 : vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2716 : : unsigned *suggested_unroll_factor,
2717 : : unsigned& slp_done_for_suggested_uf)
2718 : : {
2719 : 418496 : opt_result ok = opt_result::success ();
2720 : 418496 : int res;
2721 : 418496 : unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2722 : 418496 : poly_uint64 min_vf = 2;
2723 : 418496 : loop_vec_info orig_loop_vinfo = NULL;
2724 : :
2725 : : /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2726 : : loop_vec_info of the first vectorized loop. */
2727 : 418496 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2728 : 28735 : orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2729 : : else
2730 : : orig_loop_vinfo = loop_vinfo;
2731 : 28735 : gcc_assert (orig_loop_vinfo);
2732 : :
2733 : : /* The first group of checks is independent of the vector size. */
2734 : 418496 : fatal = true;
2735 : :
2736 : 418496 : if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2737 : 418496 : && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2738 : 5 : return opt_result::failure_at (vect_location,
2739 : : "not vectorized: simd if(0)\n");
2740 : :
2741 : : /* Find all data references in the loop (which correspond to vdefs/vuses)
2742 : : and analyze their evolution in the loop. */
2743 : :
2744 : 418491 : loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2745 : :
2746 : : /* Gather the data references and count stmts in the loop. */
2747 : 418491 : if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2748 : : {
2749 : 217494 : opt_result res
2750 : 217494 : = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2751 : : &LOOP_VINFO_DATAREFS (loop_vinfo));
2752 : 217494 : if (!res)
2753 : : {
2754 : 45840 : if (dump_enabled_p ())
2755 : 1465 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2756 : : "not vectorized: loop contains function "
2757 : : "calls or data references that cannot "
2758 : : "be analyzed\n");
2759 : 45840 : return res;
2760 : : }
2761 : 171654 : loop_vinfo->shared->save_datarefs ();
2762 : : }
2763 : : else
2764 : 200997 : loop_vinfo->shared->check_datarefs ();
2765 : :
2766 : : /* Analyze the data references and also adjust the minimal
2767 : : vectorization factor according to the loads and stores. */
2768 : :
2769 : 372651 : ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2770 : 372651 : if (!ok)
2771 : : {
2772 : 52843 : if (dump_enabled_p ())
2773 : 944 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2774 : : "bad data references.\n");
2775 : 52843 : return ok;
2776 : : }
2777 : :
2778 : : /* Check if we are applying unroll factor now. */
2779 : 319808 : bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2780 : 319808 : gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2781 : :
2782 : : /* If the SLP decision was false when the suggested unroll factor was
2783 : : worked out, and we are now applying that suggested unroll factor, we
2784 : : can simply skip all SLP-related analyses this time. */
2785 : 319808 : unsigned slp = !applying_suggested_uf ? 2 : slp_done_for_suggested_uf;
2786 : :
2787 : : /* Classify all cross-iteration scalar data-flow cycles.
2788 : : Cross-iteration cycles caused by virtual phis are analyzed separately. */
2789 : 319808 : vect_analyze_scalar_cycles (loop_vinfo, slp == 2);
2790 : :
2791 : 319808 : vect_pattern_recog (loop_vinfo);
2792 : :
2793 : 319808 : vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2794 : :
2795 : : /* Analyze the access patterns of the data-refs in the loop (consecutive,
2796 : : complex, etc.). FORNOW: Only handle consecutive access pattern. */
2797 : :
2798 : 319808 : ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2799 : 319808 : if (!ok)
2800 : : {
2801 : 6174 : if (dump_enabled_p ())
2802 : 260 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2803 : : "bad data access.\n");
2804 : 6174 : return ok;
2805 : : }
2806 : :
2807 : : /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2808 : :
2809 : 313634 : ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2810 : 313634 : if (!ok)
2811 : : {
2812 : 12741 : if (dump_enabled_p ())
2813 : 327 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2814 : : "unexpected pattern.\n");
2815 : 12741 : return ok;
2816 : : }
2817 : :
2818 : : /* The rest of the analysis below depends on the vector size in some way, so failures are no longer fatal. */
2819 : 300893 : fatal = false;
2820 : :
2821 : : /* Analyze data dependences between the data-refs in the loop
2822 : : and adjust the maximum vectorization factor according to
2823 : : the dependences.
2824 : : FORNOW: fail at the first data dependence that we encounter. */
2825 : :
2826 : 300893 : ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2827 : 300893 : if (!ok)
2828 : : {
2829 : 14017 : if (dump_enabled_p ())
2830 : 368 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2831 : : "bad data dependence.\n");
2832 : 14017 : return ok;
2833 : : }
2834 : 286876 : if (max_vf != MAX_VECTORIZATION_FACTOR
2835 : 286876 : && maybe_lt (max_vf, min_vf))
2836 : 48 : return opt_result::failure_at (vect_location, "bad data dependence.\n");
2837 : 286828 : LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2838 : :
2839 : 286828 : ok = vect_determine_vectorization_factor (loop_vinfo);
2840 : 286828 : if (!ok)
2841 : : {
2842 : 55886 : if (dump_enabled_p ())
2843 : 778 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2844 : : "can't determine vectorization factor.\n");
2845 : 55886 : return ok;
2846 : : }
2847 : :
2848 : : /* Compute the scalar iteration cost. */
2849 : 230942 : vect_compute_single_scalar_iteration_cost (loop_vinfo);
2850 : :
2851 : 230942 : poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2852 : 230942 : bool saved_can_use_partial_vectors_p
2853 : : = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2854 : :
2855 : : /* This is the point where we can re-start analysis with single-lane
2856 : : SLP forced. */
2857 : 360085 : start_over:
2858 : :
2859 : : /* Check the SLP opportunities in the loop, analyze and build
2860 : : SLP trees. */
2861 : 720170 : ok = vect_analyze_slp (loop_vinfo, loop_vinfo->stmt_vec_infos.length (),
2862 : : slp == 1);
2863 : 360085 : if (!ok)
2864 : 0 : return ok;
2865 : :
2866 : : /* If there are any SLP instances mark them as pure_slp. */
2867 : 360085 : if (vect_make_slp_decision (loop_vinfo))
2868 : : {
2869 : : /* Find stmts that need to be both vectorized and SLPed. */
2870 : 354890 : vect_detect_hybrid_slp (loop_vinfo);
2871 : :
2872 : : /* Update the vectorization factor based on the SLP decision. */
2873 : 354890 : vect_update_vf_for_slp (loop_vinfo);
2874 : :
2875 : : /* Optimize the SLP graph with the vectorization factor fixed. */
2876 : 354890 : vect_optimize_slp (loop_vinfo);
2877 : :
2878 : : /* Gather the loads reachable from the SLP graph entries. */
2879 : 354890 : vect_gather_slp_loads (loop_vinfo);
2880 : : }
2881 : :
2882 : : /* We don't expect to have to roll back to anything other than an empty
2883 : : set of rgroups. */
2884 : 360085 : gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2885 : :
2886 : : /* Apply the suggested unrolling factor; this was determined by the backend
2887 : : during finish_cost the first time we ran the analysis for this
2888 : : vector mode. */
2889 : 360085 : if (applying_suggested_uf)
2890 : 0 : LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2891 : :
2892 : : /* Now the vectorization factor is final. */
2893 : 360085 : poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2894 : 360085 : gcc_assert (known_ne (vectorization_factor, 0U));
2895 : :
2896 : 360085 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2897 : : {
2898 : 14925 : dump_printf_loc (MSG_NOTE, vect_location,
2899 : : "vectorization_factor = ");
2900 : 14925 : dump_dec (MSG_NOTE, vectorization_factor);
2901 : 14925 : dump_printf (MSG_NOTE, ", niters = %wd\n",
2902 : 14925 : LOOP_VINFO_INT_NITERS (loop_vinfo));
2903 : : }
2904 : :
2905 : 360085 : if (max_vf != MAX_VECTORIZATION_FACTOR
2906 : 360085 : && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2907 : 1 : return opt_result::failure_at (vect_location, "bad data dependence.\n");
2908 : :
2909 : 360084 : loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2910 : :
2911 : : /* Analyze the alignment of the data-refs in the loop.
2912 : : Fail if a data reference is found that cannot be vectorized. */
2913 : :
2914 : 360084 : ok = vect_analyze_data_refs_alignment (loop_vinfo);
2915 : 360084 : if (!ok)
2916 : : {
2917 : 0 : if (dump_enabled_p ())
2918 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2919 : : "bad data alignment.\n");
2920 : 0 : return ok;
2921 : : }
2922 : :
2923 : : /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2924 : : It is important to call pruning after vect_analyze_data_ref_accesses,
2925 : : since we use grouping information gathered by interleaving analysis. */
2926 : 360084 : ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2927 : 360084 : if (!ok)
2928 : 14243 : return ok;
2929 : :
2930 : : /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2931 : : vectorization, since we do not want to add extra peeling or
2932 : : add versioning for alignment. */
2933 : 345841 : if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2934 : : /* This pass will decide on using loop versioning and/or loop peeling in
2935 : : order to enhance the alignment of data references in the loop. */
2936 : 318920 : ok = vect_enhance_data_refs_alignment (loop_vinfo);
2937 : 345841 : if (!ok)
2938 : 0 : return ok;
2939 : :
2940 : : /* Analyze operations in the SLP instances. We can't simply
2941 : : remove unsupported SLP instances as this makes the above
2942 : : SLP kind detection invalid and might also affect the VF. */
2943 : 345841 : if (! vect_slp_analyze_operations (loop_vinfo))
2944 : : {
2945 : 228107 : ok = opt_result::failure_at (vect_location,
2946 : : "unsupported SLP instances\n");
2947 : 228107 : goto again;
2948 : : }
2949 : :
2950 : : /* Dissolve SLP-only groups. */
2951 : 117734 : vect_dissolve_slp_only_groups (loop_vinfo);
2952 : :
2953 : : /* Scan all the remaining operations in the loop that we did not catch
2954 : : during SLP build and make sure we fail. */
2955 : 117734 : ok = vect_analyze_loop_operations (loop_vinfo);
2956 : 117734 : if (!ok)
2957 : : {
2958 : 1396 : ok = opt_result::failure_at (vect_location,
2959 : : "bad operation or unsupported loop bound\n");
2960 : 1396 : goto again;
2961 : : }
2962 : :
2963 : : /* For now, we don't expect to mix both masking and length approaches for one
2964 : : loop, disable it if both are recorded. */
2965 : 116338 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2966 : 23 : && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2967 : 116361 : && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2968 : : {
2969 : 0 : if (dump_enabled_p ())
2970 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2971 : : "can't vectorize a loop with partial vectors"
2972 : : " because we don't expect to mix different"
2973 : : " approaches with partial vectors for the"
2974 : : " same loop.\n");
2975 : 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2976 : : }
2977 : :
2978 : : /* If we still have the option of using partial vectors,
2979 : : check whether we can generate the necessary loop controls. */
2980 : 116338 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
2981 : : {
2982 : 23 : if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
2983 : : {
2984 : 23 : if (!vect_verify_full_masking (loop_vinfo)
2985 : 23 : && !vect_verify_full_masking_avx512 (loop_vinfo))
2986 : 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2987 : : }
2988 : : else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
2989 : 0 : if (!vect_verify_loop_lens (loop_vinfo))
2990 : 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2991 : : }
2992 : :
2993 : : /* If we're vectorizing a loop that uses length "controls" and
2994 : : can iterate more than once, we apply decrementing IV approach
2995 : : in loop control. */
2996 : 116338 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2997 : 23 : && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
2998 : 0 : && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
2999 : 116338 : && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3000 : 0 : && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
3001 : : LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
3002 : 0 : LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
3003 : :
3004 : : /* If a loop uses length controls and has a decrementing loop control IV,
3005 : : we will normally pass that IV through a MIN_EXPR to calculate the
3006 : : basis for the length controls. E.g. in a loop that processes one
3007 : : element per scalar iteration, the number of elements would be
3008 : : MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
3009 : :
3010 : : This MIN_EXPR approach allows us to use pointer IVs with an invariant
3011 : : step, since only the final iteration of the vector loop can have
3012 : : inactive lanes.
3013 : :
3014 : : However, some targets have a dedicated instruction for calculating the
3015 : : preferred length, given the total number of elements that still need to
3016 : : be processed. This is encapsulated in the SELECT_VL internal function.
3017 : :
3018 : : If the target supports SELECT_VL, we can use it instead of MIN_EXPR
3019 : : to determine the basis for the length controls. However, unlike the
3020 : : MIN_EXPR calculation, the SELECT_VL calculation can decide to make
3021 : : lanes inactive in any iteration of the vector loop, not just the last
3022 : : iteration. This SELECT_VL approach therefore requires us to use pointer
3023 : : IVs with variable steps.
3024 : :
3025 : : Once we've decided how many elements should be processed by one
3026 : : iteration of the vector loop, we need to populate the rgroup controls.
3027 : : If a loop has multiple rgroups, we need to make sure that those rgroups
3028 : : "line up" (that is, they must be consistent about which elements are
3029 : : active and which aren't). This is done by vect_adjust_loop_lens_control.
3030 : :
3031 : : In principle, it would be possible to use vect_adjust_loop_lens_control
3032 : : on either the result of a MIN_EXPR or the result of a SELECT_VL.
3033 : : However:
3034 : :
3035 : : (1) In practice, it only makes sense to use SELECT_VL when a vector
3036 : : operation will be controlled directly by the result. It is not
3037 : : worth using SELECT_VL if it would only be the input to other
3038 : : calculations.
3039 : :
3040 : : (2) If we use SELECT_VL for an rgroup that has N controls, each associated
3041 : : pointer IV will need N updates by a variable amount (N-1 updates
3042 : : within the iteration and 1 update to move to the next iteration).
3043 : :
3044 : : Because of this, we prefer to use the MIN_EXPR approach whenever there
3045 : : is more than one length control.
3046 : :
3047 : : In addition, SELECT_VL always operates to a granularity of 1 unit.
3048 : : If we wanted to use it to control an SLP operation on N consecutive
3049 : : elements, we would need to make the SELECT_VL inputs measure scalar
3050 : : iterations (rather than elements) and then multiply the SELECT_VL
3051 : : result by N. But using SELECT_VL this way is inefficient because
3052 : : of (1) above.
3053 : :
3054 : : In addition, we do not apply SELECT_VL even to a single-rgroup loop when
3055 : : both of the following are satisfied:
3056 : :
3057 : : (1) LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
3058 : : (2) LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
3059 : :
3060 : : Since SELECT_VL introduces a variable step, it makes SCEV analysis fail
3061 : : and we would then lose the benefit of subsequent unroll optimizations.
3062 : : We prefer the MIN_EXPR approach in this situation. */
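 : :
 : : /* Illustrative sketch (added for exposition, not from the original
 : :    source): for a single-control loop processing one element per scalar
 : :    iteration, with REMAIN scalar iterations left, the two bases would be
 : :    computed roughly as
 : :
 : :      len = MIN_EXPR <REMAIN, VF>;      // MIN_EXPR approach
 : :      len = .SELECT_VL (REMAIN, VF);    // SELECT_VL approach
 : :
 : :    With MIN_EXPR only the final iteration can have inactive lanes, so
 : :    pointer IVs can keep an invariant step of VF elements; with SELECT_VL
 : :    any iteration may process fewer than VF elements, so pointer IVs must
 : :    be stepped by the variable len.  */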
3063 : 116338 : if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
3064 : : {
3065 : 0 : tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
3066 : 0 : if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
3067 : : OPTIMIZE_FOR_SPEED)
3068 : 0 : && LOOP_VINFO_LENS (loop_vinfo).length () == 1
3069 : 0 : && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1
3070 : 0 : && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3071 : : || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
3072 : 0 : LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
3073 : :
3074 : : /* If any of the SLP instances cover more than a single lane
3075 : : we cannot use .SELECT_VL at the moment, even if the number
3076 : : of lanes is uniform throughout the SLP graph. */
3077 : 0 : if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
3078 : 0 : for (slp_instance inst : LOOP_VINFO_SLP_INSTANCES (loop_vinfo))
3079 : 0 : if (SLP_TREE_LANES (SLP_INSTANCE_TREE (inst)) != 1
3080 : 0 : && !(SLP_INSTANCE_KIND (inst) == slp_inst_kind_store
3081 : 0 : && SLP_INSTANCE_TREE (inst)->ldst_lanes))
3082 : : {
3083 : 0 : LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
3084 : 0 : break;
3085 : : }
3086 : : }
3087 : :
3088 : : /* Decide whether this loop_vinfo should use partial vectors or peeling,
3089 : : assuming that the loop will be used as a main loop. We will redo
3090 : : this analysis later if we instead decide to use the loop as an
3091 : : epilogue loop. */
3092 : 116338 : ok = vect_determine_partial_vectors_and_peeling (loop_vinfo);
3093 : 116338 : if (!ok)
3094 : 0 : return ok;
3095 : :
3096 : : /* If we're vectorizing an epilogue loop, the vectorized loop either needs
3097 : : to be able to handle fewer than VF scalars, or needs to have a lower VF
3098 : : than the main loop. */
3099 : 116338 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3100 : 23220 : && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3101 : : {
3102 : 23218 : poly_uint64 unscaled_vf
3103 : 23218 : = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
3104 : : orig_loop_vinfo->suggested_unroll_factor);
3105 : 23218 : if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
3106 : 11657 : return opt_result::failure_at (vect_location,
3107 : : "Vectorization factor too high for"
3108 : : " epilogue loop.\n");
3109 : : }
3110 : :
3111 : : /* If the epilogue needs peeling for gaps but the main loop doesn't give
3112 : : up on the epilogue. */
3113 : 104681 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3114 : 11563 : && LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3115 : 54 : && (LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo)
3116 : : != LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)))
3117 : 0 : return opt_result::failure_at (vect_location,
3118 : : "Epilogue loop requires peeling for gaps "
3119 : : "but main loop does not.\n");
3120 : :
3121 : : /* If an epilogue loop is required make sure we can create one. */
3122 : 104681 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3123 : 103493 : || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
3124 : 31323 : || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3125 : : {
3126 : 74381 : if (dump_enabled_p ())
3127 : 4817 : dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
3128 : 74381 : if (!vect_can_advance_ivs_p (loop_vinfo)
3129 : 148295 : || !slpeel_can_duplicate_loop_p (loop,
3130 : : LOOP_VINFO_IV_EXIT (loop_vinfo),
3131 : 73914 : LOOP_VINFO_IV_EXIT (loop_vinfo)))
3132 : : {
3133 : 467 : ok = opt_result::failure_at (vect_location,
3134 : : "not vectorized: can't create required "
3135 : : "epilog loop\n");
3136 : 467 : goto again;
3137 : : }
3138 : : }
3139 : :
3140 : : /* Check the costings of the loop make vectorizing worthwhile. */
3141 : 104214 : res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
3142 : 104214 : if (res < 0)
3143 : : {
3144 : 29164 : ok = opt_result::failure_at (vect_location,
3145 : : "Loop costings may not be worthwhile.\n");
3146 : 29164 : goto again;
3147 : : }
3148 : 75050 : if (!res)
3149 : 18279 : return opt_result::failure_at (vect_location,
3150 : : "Loop costings not worthwhile.\n");
3151 : :
3152 : : /* During peeling, we need to check whether the number of loop iterations
3153 : : is enough for both the peeled prolog loop and the vector loop. This
3154 : : check can be merged with the threshold check of loop versioning, so
3155 : : increase the threshold for this case if necessary.
3156 : :
3157 : : If we are analyzing an epilogue we still want to check what its
3158 : : versioning threshold would be. If we decide to vectorize the epilogues we
3159 : : will want to use the lowest versioning threshold of all epilogues and main
3160 : : loop. This will enable us to enter a vectorized epilogue even when
3161 : : versioning the loop. We can't simply check whether the epilogue requires
3162 : : versioning though since we may have skipped some versioning checks when
3163 : : analyzing the epilogue. For instance, checks for alias versioning will be
3164 : : skipped when dealing with epilogues as we assume we already checked them
3165 : : for the main loop. So instead we always check the 'orig_loop_vinfo'. */
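 : :
 : : /* A worked example (added for exposition, not from the original source):
 : :    with a prolog peel of 3 iterations for alignment, VF = 4 and peeling
 : :    for gaps, the code below computes niters_th = 3 + 4 + 1 = 8 and then
 : :    raises it to the cost-model threshold TH if the runtime profitability
 : :    check applies and TH is larger.  */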
3166 : 56771 : if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
3167 : : {
3168 : 5449 : poly_uint64 niters_th = 0;
3169 : 5449 : unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3170 : :
3171 : 5449 : if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
3172 : : {
3173 : : /* Niters for peeled prolog loop. */
3174 : 5449 : if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3175 : : {
3176 : 80 : dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
3177 : 80 : tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
3178 : 80 : niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
3179 : : }
3180 : : else
3181 : 5369 : niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3182 : : }
3183 : :
3184 : : /* Niters for at least one iteration of vectorized loop. */
3185 : 5449 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3186 : 5449 : niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3187 : : /* One additional iteration because of peeling for gap. */
3188 : 5449 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3189 : 55 : niters_th += 1;
3190 : :
3191 : : /* Use the same condition as vect_transform_loop to decide when to use
3192 : : the cost to determine a versioning threshold. */
3193 : 5449 : if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3194 : 5449 : && ordered_p (th, niters_th))
3195 : 3694 : niters_th = ordered_max (poly_uint64 (th), niters_th);
3196 : :
3197 : 5449 : LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
3198 : : }
3199 : :
3200 : 56771 : gcc_assert (known_eq (vectorization_factor,
3201 : : LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
3202 : :
3203 : 56771 : slp_done_for_suggested_uf = slp;
3204 : :
3205 : : /* Ok to vectorize! */
3206 : 56771 : LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
3207 : 56771 : return opt_result::success ();
3208 : :
3209 : 259134 : again:
3210 : : /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
3211 : 259134 : gcc_assert (!ok);
3212 : :
3213 : : /* The only re-try available is with single-lane SLP forced; if we already did that, give up. */
3214 : 259134 : if (slp == 1)
3215 : 128565 : return ok;
3216 : :
3217 : : /* If we are applying suggested unroll factor, we don't need to
3218 : : re-try any more as we want to keep the SLP mode fixed. */
3219 : 130569 : if (applying_suggested_uf)
3220 : 0 : return ok;
3221 : :
3222 : : /* If there are reduction chains re-trying will fail anyway. */
3223 : 130569 : if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
3224 : 205 : return ok;
3225 : :
3226 : : /* Likewise if the grouped loads or stores in the SLP cannot be handled
3227 : : via interleaving or lane instructions. */
3228 : : slp_instance instance;
3229 : : slp_tree node;
3230 : : unsigned i, j;
3231 : 491083 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
3232 : : {
3233 : 361940 : if (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance)) != vect_internal_def)
3234 : 0 : continue;
3235 : :
3236 : 361940 : stmt_vec_info vinfo;
3237 : 361940 : vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
3238 : 361940 : if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
3239 : 359051 : continue;
3240 : 2889 : vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3241 : 2889 : unsigned int size = DR_GROUP_SIZE (vinfo);
3242 : 2889 : tree vectype = STMT_VINFO_VECTYPE (vinfo);
3243 : 2889 : if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST
3244 : 4925 : && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
3245 : 5643 : && ! vect_grouped_store_supported (vectype, size))
3246 : 718 : return opt_result::failure_at (vinfo->stmt,
3247 : : "unsupported grouped store\n");
3248 : 364477 : FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
3249 : : {
3250 : 2439 : vinfo = SLP_TREE_REPRESENTATIVE (node);
3251 : 2439 : if (STMT_VINFO_GROUPED_ACCESS (vinfo))
3252 : : {
3253 : 2097 : vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3254 : 2097 : bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
3255 : 2097 : size = DR_GROUP_SIZE (vinfo);
3256 : 2097 : vectype = STMT_VINFO_VECTYPE (vinfo);
3257 : 2097 : if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
3258 : 2097 : && ! vect_grouped_load_supported (vectype, single_element_p,
3259 : : size))
3260 : 503 : return opt_result::failure_at (vinfo->stmt,
3261 : : "unsupported grouped load\n");
3262 : : }
3263 : : }
3264 : : }
3265 : :
3266 : : /* Roll back state appropriately. Force single-lane SLP this time. */
3267 : 129143 : slp = 1;
3268 : 129143 : if (dump_enabled_p ())
3269 : 3822 : dump_printf_loc (MSG_NOTE, vect_location,
3270 : : "re-trying with single-lane SLP\n");
3271 : :
3272 : : /* Restore vectorization factor as it were without SLP. */
3273 : 129143 : LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
3274 : : /* Free the SLP instances. */
3275 : 489855 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
3276 : 360712 : vect_free_slp_instance (instance);
3277 : 129143 : LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
3278 : : /* Reset SLP type to loop_vect on all stmts. */
3279 : 492395 : for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
3280 : : {
3281 : 363252 : basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3282 : 363252 : for (gimple_stmt_iterator si = gsi_start_phis (bb);
3283 : 665450 : !gsi_end_p (si); gsi_next (&si))
3284 : : {
3285 : 302198 : stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3286 : 302198 : STMT_SLP_TYPE (stmt_info) = loop_vect;
3287 : 302198 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
3288 : 302198 : || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3289 : : {
3290 : : /* vectorizable_reduction adjusts reduction stmt def-types,
3291 : : restore them to that of the PHI. */
3292 : 18540 : STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
3293 : 18540 : = STMT_VINFO_DEF_TYPE (stmt_info);
3294 : 18540 : STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
3295 : : (STMT_VINFO_REDUC_DEF (stmt_info)))
3296 : 18540 : = STMT_VINFO_DEF_TYPE (stmt_info);
3297 : : }
3298 : : }
3299 : 726504 : for (gimple_stmt_iterator si = gsi_start_bb (bb);
3300 : 2210179 : !gsi_end_p (si); gsi_next (&si))
3301 : : {
3302 : 1846927 : if (is_gimple_debug (gsi_stmt (si)))
3303 : 651181 : continue;
3304 : 1195746 : stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3305 : 1195746 : STMT_SLP_TYPE (stmt_info) = loop_vect;
3306 : 1195746 : if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3307 : : {
3308 : 215985 : stmt_vec_info pattern_stmt_info
3309 : : = STMT_VINFO_RELATED_STMT (stmt_info);
3310 : 215985 : if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
3311 : 0 : STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
3312 : :
3313 : 215985 : gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
3314 : 215985 : STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
3315 : 215985 : for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
3316 : 439800 : !gsi_end_p (pi); gsi_next (&pi))
3317 : 223815 : STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
3318 : 223815 : = loop_vect;
3319 : : }
3320 : : }
3321 : : }
3322 : : /* Free optimized alias test DDRS. */
3323 : 129143 : LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
3324 : 129143 : LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
3325 : 129143 : LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
3326 : : /* Reset target cost data. */
3327 : 129143 : delete loop_vinfo->vector_costs;
3328 : 129143 : loop_vinfo->vector_costs = nullptr;
3329 : : /* Reset accumulated rgroup information. */
3330 : 129143 : LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
3331 : 129143 : release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
3332 : 129143 : release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
3333 : : /* Reset assorted flags. */
3334 : 129143 : LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
3335 : 129143 : LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
3336 : 129143 : LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
3337 : 129143 : LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
3338 : 129143 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3339 : 129143 : = saved_can_use_partial_vectors_p;
3340 : 129143 : LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3341 : 129143 : LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
3342 : 129143 : if (loop_vinfo->scan_map)
3343 : 122 : loop_vinfo->scan_map->empty ();
3344 : :
3345 : 129143 : goto start_over;
3346 : : }
3347 : :
3348 : : /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
3349 : : to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
3350 : : OLD_LOOP_VINFO is better unless something specifically indicates
3351 : : otherwise.
3352 : :
3353 : : Note that this deliberately isn't a partial order. */
3354 : :
3355 : : static bool
3356 : 0 : vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
3357 : : loop_vec_info old_loop_vinfo)
3358 : : {
3359 : 0 : struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
3360 : 0 : gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
3361 : :
3362 : 0 : poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
3363 : 0 : poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
3364 : :
3365 : : /* Always prefer a VF of loop->simdlen over any other VF. */
3366 : 0 : if (loop->simdlen)
3367 : : {
3368 : 0 : bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
3369 : 0 : bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
3370 : 0 : if (new_simdlen_p != old_simdlen_p)
3371 : : return new_simdlen_p;
3372 : : }
3373 : :
3374 : 0 : const auto *old_costs = old_loop_vinfo->vector_costs;
3375 : 0 : const auto *new_costs = new_loop_vinfo->vector_costs;
3376 : 0 : if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
3377 : 0 : return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
3378 : :
3379 : 0 : return new_costs->better_main_loop_than_p (old_costs);
3380 : : }
3381 : :
3382 : : /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
3383 : : true if we should. */
3384 : :
3385 : : static bool
3386 : 0 : vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
3387 : : loop_vec_info old_loop_vinfo)
3388 : : {
3389 : 0 : if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
3390 : : return false;
3391 : :
3392 : 0 : if (dump_enabled_p ())
3393 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
3394 : : "***** Preferring vector mode %s to vector mode %s\n",
3395 : 0 : GET_MODE_NAME (new_loop_vinfo->vector_mode),
3396 : 0 : GET_MODE_NAME (old_loop_vinfo->vector_mode));
3397 : : return true;
3398 : : }
3399 : :
3400 : : /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if ORIG_LOOP_VINFO is
3401 : : not NULL. Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance
3402 : : MODE_I to the next mode useful to analyze.
3403 : : Return the loop_vinfo on success and wrapped null on failure. */
3404 : :
3405 : : static opt_loop_vec_info
3406 : 418496 : vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
3407 : : const vect_loop_form_info *loop_form_info,
3408 : : loop_vec_info orig_loop_vinfo,
3409 : : const vector_modes &vector_modes, unsigned &mode_i,
3410 : : machine_mode &autodetected_vector_mode,
3411 : : bool &fatal)
3412 : : {
3413 : 418496 : loop_vec_info loop_vinfo
3414 : 418496 : = vect_create_loop_vinfo (loop, shared, loop_form_info, orig_loop_vinfo);
3415 : :
3416 : 418496 : machine_mode vector_mode = vector_modes[mode_i];
3417 : 418496 : loop_vinfo->vector_mode = vector_mode;
3418 : 418496 : unsigned int suggested_unroll_factor = 1;
3419 : 418496 : unsigned slp_done_for_suggested_uf = 0;
3420 : :
3421 : : /* Run the main analysis. */
3422 : 418496 : opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
3423 : : &suggested_unroll_factor,
3424 : : slp_done_for_suggested_uf);
3425 : 418496 : if (dump_enabled_p ())
3426 : 20872 : dump_printf_loc (MSG_NOTE, vect_location,
3427 : : "***** Analysis %s with vector mode %s\n",
3428 : 20872 : res ? "succeeded" : "failed",
3429 : 20872 : GET_MODE_NAME (loop_vinfo->vector_mode));
3430 : :
3431 : 418496 : if (res && !LOOP_VINFO_EPILOGUE_P (loop_vinfo) && suggested_unroll_factor > 1)
3432 : : {
3433 : 0 : if (dump_enabled_p ())
3434 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
3435 : : "***** Re-trying analysis for unrolling"
3436 : : " with unroll factor %d and slp %s.\n",
3437 : : suggested_unroll_factor,
3438 : 0 : slp_done_for_suggested_uf ? "on" : "off");
3439 : 0 : loop_vec_info unroll_vinfo
3440 : 0 : = vect_create_loop_vinfo (loop, shared, loop_form_info, NULL);
3441 : 0 : unroll_vinfo->vector_mode = vector_mode;
3442 : 0 : unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
3443 : 0 : opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
3444 : : slp_done_for_suggested_uf);
3445 : 0 : if (new_res)
3446 : : {
3447 : 0 : delete loop_vinfo;
3448 : 0 : loop_vinfo = unroll_vinfo;
3449 : : }
3450 : : else
3451 : 0 : delete unroll_vinfo;
3452 : : }
3453 : :
3454 : : /* Remember the autodetected vector mode. */
3455 : 418496 : if (vector_mode == VOIDmode)
3456 : 208756 : autodetected_vector_mode = loop_vinfo->vector_mode;
3457 : :
3458 : : /* Advance mode_i, first skipping modes that would result in the
3459 : : same analysis result. */
3460 : 1883098 : while (mode_i + 1 < vector_modes.length ()
3461 : 1317414 : && vect_chooses_same_modes_p (loop_vinfo,
3462 : 585113 : vector_modes[mode_i + 1]))
3463 : : {
3464 : 313805 : if (dump_enabled_p ())
3465 : 15448 : dump_printf_loc (MSG_NOTE, vect_location,
3466 : : "***** The result for vector mode %s would"
3467 : : " be the same\n",
3468 : 15448 : GET_MODE_NAME (vector_modes[mode_i + 1]));
3469 : 313805 : mode_i += 1;
3470 : : }
3471 : 418496 : if (mode_i + 1 < vector_modes.length ()
3472 : 271308 : && VECTOR_MODE_P (autodetected_vector_mode)
3473 : 542616 : && (related_vector_mode (vector_modes[mode_i + 1],
3474 : : GET_MODE_INNER (autodetected_vector_mode))
3475 : 271308 : == autodetected_vector_mode)
3476 : 689804 : && (related_vector_mode (autodetected_vector_mode,
3477 : 340 : GET_MODE_INNER (vector_modes[mode_i + 1]))
3478 : 680 : == vector_modes[mode_i + 1]))
3479 : : {
3480 : 340 : if (dump_enabled_p ())
3481 : 6 : dump_printf_loc (MSG_NOTE, vect_location,
3482 : : "***** Skipping vector mode %s, which would"
3483 : : " repeat the analysis for %s\n",
3484 : 6 : GET_MODE_NAME (vector_modes[mode_i + 1]),
3485 : 6 : GET_MODE_NAME (autodetected_vector_mode));
3486 : 340 : mode_i += 1;
3487 : : }
3488 : 418496 : mode_i++;
3489 : :
3490 : 418496 : if (!res)
3491 : : {
3492 : 361725 : delete loop_vinfo;
3493 : 361725 : if (fatal)
3494 : 64532 : gcc_checking_assert (orig_loop_vinfo == NULL);
3495 : 361725 : return opt_loop_vec_info::propagate_failure (res);
3496 : : }
3497 : :
3498 : 56771 : return opt_loop_vec_info::success (loop_vinfo);
3499 : : }
3500 : :
3501 : : /* Function vect_analyze_loop.
3502 : :
3503 : : Apply a set of analyses on LOOP, and create a loop_vec_info struct
3504 : : for it. The different analyses will record information in the
3505 : : loop_vec_info struct. */
3506 : : opt_loop_vec_info
3507 : 467008 : vect_analyze_loop (class loop *loop, gimple *loop_vectorized_call,
3508 : : vec_info_shared *shared)
3509 : : {
3510 : 467008 : DUMP_VECT_SCOPE ("analyze_loop_nest");
3511 : :
3512 : 467008 : if (loop_outer (loop)
3513 : 467008 : && loop_vec_info_for_loop (loop_outer (loop))
3514 : 467421 : && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
3515 : 413 : return opt_loop_vec_info::failure_at (vect_location,
3516 : : "outer-loop already vectorized.\n");
3517 : :
3518 : 466595 : if (!find_loop_nest (loop, &shared->loop_nest))
3519 : 21989 : return opt_loop_vec_info::failure_at
3520 : 21989 : (vect_location,
3521 : : "not vectorized: loop nest containing two or more consecutive inner"
3522 : : " loops cannot be vectorized\n");
3523 : :
3524 : : /* Analyze the loop form. */
3525 : 444606 : vect_loop_form_info loop_form_info;
3526 : 444606 : opt_result res = vect_analyze_loop_form (loop, loop_vectorized_call,
3527 : : &loop_form_info);
3528 : 444606 : if (!res)
3529 : : {
3530 : 235850 : if (dump_enabled_p ())
3531 : 1622 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3532 : : "bad loop form.\n");
3533 : 235850 : return opt_loop_vec_info::propagate_failure (res);
3534 : : }
3535 : 208756 : if (!integer_onep (loop_form_info.assumptions))
3536 : : {
3537 : : /* We consider vectorizing this loop by versioning it under
3538 : : some assumptions. In order to do this, we need to clear
3539 : : existing information computed by the scev and niter analyzers.
3540 : 10098 : scev_reset_htab ();
3541 : 10098 : free_numbers_of_iterations_estimates (loop);
3542 : : /* Also set flag for this loop so that following scev and niter
3543 : : analysis are done under the assumptions. */
3544 : 10098 : loop_constraint_set (loop, LOOP_C_FINITE);
3545 : : }
3546 : : else
3547 : : /* Clear the existing niter information to make sure the nonwrapping flag
3548 : : will be calculated and set properly. */
3549 : 198658 : free_numbers_of_iterations_estimates (loop);
3550 : :
3551 : 208756 : auto_vector_modes vector_modes;
3552 : : /* Autodetect first vector size we try. */
3553 : 208756 : vector_modes.safe_push (VOIDmode);
3554 : 208756 : unsigned int autovec_flags
3555 : 417512 : = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3556 : 208756 : loop->simdlen != 0);
3557 : 208756 : bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3558 : 208756 : && !unlimited_cost_model (loop));
3559 : 208756 : machine_mode autodetected_vector_mode = VOIDmode;
3560 : 208756 : opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3561 : 208756 : unsigned int mode_i = 0;
3562 : 208756 : unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3563 : :
3564 : : /* Keep track of the VF for each mode. Initialize all to 0 which indicates
3565 : : a mode has not been analyzed. */
3566 : 208756 : auto_vec<poly_uint64, 8> cached_vf_per_mode;
3567 : 2103496 : for (unsigned i = 0; i < vector_modes.length (); ++i)
3568 : 842992 : cached_vf_per_mode.safe_push (0);
3569 : :
3570 : : /* First determine the main loop vectorization mode, either the first
3571 : : one that works, starting with auto-detecting the vector mode and then
3572 : : following the target's order of preference, or the one with the
3573 : : lowest cost if pick_lowest_cost_p. */
3574 : 570766 : while (1)
3575 : : {
3576 : 389761 : bool fatal;
3577 : 389761 : unsigned int last_mode_i = mode_i;
3578 : : /* Set cached VF to -1 prior to analysis, which indicates a mode has
3579 : : failed. */
3580 : 389761 : cached_vf_per_mode[last_mode_i] = -1;
3581 : 389761 : opt_loop_vec_info loop_vinfo
3582 : 389761 : = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3583 : : NULL, vector_modes, mode_i,
3584 : : autodetected_vector_mode, fatal);
3585 : 389761 : if (fatal)
3586 : : break;
3587 : :
3588 : 325229 : if (loop_vinfo)
3589 : : {
3590 : : /* Analysis has been successful so update the VF value. The
3591 : : VF should always be a multiple of unroll_factor and we want to
3592 : : capture the original VF here. */
3593 : 50262 : cached_vf_per_mode[last_mode_i]
3594 : 50262 : = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3595 : 50262 : loop_vinfo->suggested_unroll_factor);
3596 : : /* Once we hit the desired simdlen for the first time,
3597 : : discard any previous attempts. */
3598 : 50262 : if (simdlen
3599 : 50262 : && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3600 : : {
3601 : 47 : delete first_loop_vinfo;
3602 : : first_loop_vinfo = opt_loop_vec_info::success (NULL);
3603 : : simdlen = 0;
3604 : : }
3605 : 50215 : else if (pick_lowest_cost_p
3606 : 0 : && first_loop_vinfo
3607 : 50215 : && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3608 : : {
3609 : : /* Pick loop_vinfo over first_loop_vinfo. */
3610 : 0 : delete first_loop_vinfo;
3611 : 0 : first_loop_vinfo = opt_loop_vec_info::success (NULL);
3612 : : }
3613 : 50262 : if (first_loop_vinfo == NULL)
3614 : : first_loop_vinfo = loop_vinfo;
3615 : : else
3616 : : {
3617 : 2 : delete loop_vinfo;
3618 : 2 : loop_vinfo = opt_loop_vec_info::success (NULL);
3619 : : }
3620 : :
3621 : : /* Commit to first_loop_vinfo if we have no reason to try
3622 : : alternatives. */
3623 : 50262 : if (!simdlen && !pick_lowest_cost_p)
3624 : : break;
3625 : : }
3626 : 274976 : if (mode_i == vector_modes.length ()
3627 : 274976 : || autodetected_vector_mode == VOIDmode)
3628 : : break;
3629 : :
3630 : : /* Try the next biggest vector size. */
3631 : 181005 : if (dump_enabled_p ())
3632 : 3753 : dump_printf_loc (MSG_NOTE, vect_location,
3633 : : "***** Re-trying analysis with vector mode %s\n",
3634 : 3753 : GET_MODE_NAME (vector_modes[mode_i]));
3635 : 181005 : }
3636 : 208756 : if (!first_loop_vinfo)
3637 : 158501 : return opt_loop_vec_info::propagate_failure (res);
3638 : :
3639 : 50255 : if (dump_enabled_p ())
3640 : 8741 : dump_printf_loc (MSG_NOTE, vect_location,
3641 : : "***** Choosing vector mode %s\n",
3642 : 8741 : GET_MODE_NAME (first_loop_vinfo->vector_mode));
3643 : :
3644 : : /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3645 : : enabled, SIMDUID is not set, it is the innermost loop and we have
3646 : : either already found the loop's SIMDLEN or there was no SIMDLEN to
3647 : : begin with.
3648 : : TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3649 : 50255 : bool vect_epilogues = (!simdlen
3650 : 50253 : && loop->inner == NULL
3651 : 49826 : && param_vect_epilogues_nomask
3652 : 48791 : && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3653 : : /* No code motion support for multiple epilogues so for now
3654 : : not supported when multiple exits. */
3655 : 24192 : && !LOOP_VINFO_EARLY_BREAKS (first_loop_vinfo)
3656 : 23822 : && !loop->simduid
3657 : 72667 : && loop_cost_model (loop) > VECT_COST_MODEL_VERY_CHEAP);
3658 : 50255 : if (!vect_epilogues)
3659 : 38193 : return first_loop_vinfo;
3660 : :
3661 : : /* Now analyze first_loop_vinfo for epilogue vectorization. */
3662 : :
3663 : : /* For epilogues start the analysis from the first mode. The motivation
3664 : : behind starting from the beginning comes from cases where the VECTOR_MODES
3665 : : array may contain length-agnostic and length-specific modes. Their
3666 : : ordering is not guaranteed, so we could end up picking a mode for the main
3667 : : loop that is after the epilogue's optimal mode. */
3668 : 12062 : if (!unlimited_cost_model (loop)
3669 : 12062 : && first_loop_vinfo->vector_costs->suggested_epilogue_mode () != VOIDmode)
3670 : : {
3671 : 0 : vector_modes[0]
3672 : 0 : = first_loop_vinfo->vector_costs->suggested_epilogue_mode ();
3673 : 0 : cached_vf_per_mode[0] = 0;
3674 : : }
3675 : : else
3676 : 12062 : vector_modes[0] = autodetected_vector_mode;
3677 : 12062 : mode_i = 0;
3678 : :
3679 : 12062 : bool supports_partial_vectors =
3680 : 12062 : partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3681 : 12062 : poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3682 : :
3683 : 12062 : loop_vec_info orig_loop_vinfo = first_loop_vinfo;
3684 : 41838 : do
3685 : : {
3686 : 41760 : while (1)
3687 : : {
3688 : : /* If the target does not support partial vectors we can shorten the
3689 : : number of modes to analyze for the epilogue as we know we can't
3690 : : pick a mode that would lead to a VF at least as big as the
3691 : : FIRST_VINFO_VF. */
3692 : 54763 : if (!supports_partial_vectors
3693 : 41760 : && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3694 : : {
3695 : 13025 : mode_i++;
3696 : 26050 : if (mode_i == vector_modes.length ())
3697 : : break;
3698 : 13003 : continue;
3699 : : }
3700 : :
3701 : 28735 : if (dump_enabled_p ())
3702 : 5067 : dump_printf_loc (MSG_NOTE, vect_location,
3703 : : "***** Re-trying epilogue analysis with vector "
3704 : 5067 : "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3705 : :
3706 : 28735 : bool fatal;
3707 : 28735 : opt_loop_vec_info loop_vinfo
3708 : 28735 : = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3709 : : orig_loop_vinfo,
3710 : : vector_modes, mode_i,
3711 : : autodetected_vector_mode, fatal);
3712 : 28735 : if (fatal)
3713 : : break;
3714 : :
3715 : 28735 : if (loop_vinfo)
3716 : : {
3717 : 6509 : if (pick_lowest_cost_p
3718 : 0 : && orig_loop_vinfo->epilogue_vinfo
3719 : 6509 : && vect_joust_loop_vinfos (loop_vinfo,
3720 : 0 : orig_loop_vinfo->epilogue_vinfo))
3721 : : {
3722 : 0 : gcc_assert (vect_epilogues);
3723 : 0 : delete orig_loop_vinfo->epilogue_vinfo;
3724 : 0 : orig_loop_vinfo->epilogue_vinfo = nullptr;
3725 : : }
3726 : 6509 : if (!orig_loop_vinfo->epilogue_vinfo)
3727 : 6509 : orig_loop_vinfo->epilogue_vinfo = loop_vinfo;
3728 : : else
3729 : : {
3730 : 0 : delete loop_vinfo;
3731 : 0 : loop_vinfo = opt_loop_vec_info::success (NULL);
3732 : : }
3733 : :
3734 : : /* For now only allow one epilogue loop, but allow
3735 : : pick_lowest_cost_p to replace it, so commit to the
3736 : : first epilogue if we have no reason to try alternatives. */
3737 : 6509 : if (!pick_lowest_cost_p)
3738 : : break;
3739 : : }
3740 : :
3741 : 44452 : if (mode_i == vector_modes.length ())
3742 : : break;
3743 : : }
3744 : :
3745 : 12140 : orig_loop_vinfo = orig_loop_vinfo->epilogue_vinfo;
3746 : 12140 : if (!orig_loop_vinfo)
3747 : : break;
3748 : :
3749 : : /* When we have selected a first vectorized epilogue, see if the target
3750 : : suggests having another one. */
3751 : 6509 : if (!unlimited_cost_model (loop)
3752 : 6509 : && (orig_loop_vinfo->vector_costs->suggested_epilogue_mode ()
3753 : : != VOIDmode))
3754 : : {
3755 : 156 : vector_modes[0]
3756 : 78 : = orig_loop_vinfo->vector_costs->suggested_epilogue_mode ();
3757 : 78 : cached_vf_per_mode[0] = 0;
3758 : 78 : mode_i = 0;
3759 : : }
3760 : : else
3761 : : break;
3762 : 78 : }
3763 : : while (1);
3764 : :
3765 : 12062 : if (first_loop_vinfo->epilogue_vinfo)
3766 : : {
3767 : 6432 : poly_uint64 lowest_th
3768 : 6432 : = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3769 : 6432 : loop_vec_info epilog_vinfo = first_loop_vinfo->epilogue_vinfo;
3770 : 6509 : do
3771 : : {
3772 : 6509 : poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (epilog_vinfo);
3773 : 6509 : gcc_assert (!LOOP_REQUIRES_VERSIONING (epilog_vinfo)
3774 : : || maybe_ne (lowest_th, 0U));
3775 : : /* Keep track of the known smallest versioning threshold. */
3776 : 6509 : if (ordered_p (lowest_th, th))
3777 : 6509 : lowest_th = ordered_min (lowest_th, th);
3778 : 6509 : epilog_vinfo = epilog_vinfo->epilogue_vinfo;
3779 : : }
3780 : 6509 : while (epilog_vinfo);
3781 : 6432 : LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3782 : 6432 : if (dump_enabled_p ())
3783 : 1275 : dump_printf_loc (MSG_NOTE, vect_location,
3784 : : "***** Choosing epilogue vector mode %s\n",
3785 : 1275 : GET_MODE_NAME
3786 : : (first_loop_vinfo->epilogue_vinfo->vector_mode));
3787 : : }
3788 : :
3789 : 12062 : return first_loop_vinfo;
3790 : 653362 : }
3791 : :
3792 : : /* Return true if there is an in-order reduction function for CODE, storing
3793 : : it in *REDUC_FN if so. */
3794 : :
3795 : : static bool
3796 : 5239 : fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3797 : : {
3798 : : /* We support MINUS_EXPR by negating the operand. This also preserves an
3799 : : initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
3800 : : (-0.0) = -0.0. */
3801 : 5239 : if (code == PLUS_EXPR || code == MINUS_EXPR)
3802 : : {
3803 : 4561 : *reduc_fn = IFN_FOLD_LEFT_PLUS;
3804 : 0 : return true;
3805 : : }
3806 : : return false;
3807 : : }
3808 : :
3809 : : /* Function reduction_fn_for_scalar_code
3810 : :
3811 : : Input:
3812 : : CODE - tree_code of a reduction operation.
3813 : :
3814 : : Output:
3815 : : REDUC_FN - the corresponding internal function to be used to reduce the
3816 : : vector of partial results into a single scalar result, or IFN_LAST
3817 : : if the operation is a supported reduction operation, but does not have
3818 : : such an internal function.
3819 : :
3820 : : Return FALSE if CODE currently cannot be vectorized as reduction. */
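 : :
 : : /* Illustrative use (a sketch added for exposition, not from this file):
 : :
 : :      internal_fn fn;
 : :      if (reduction_fn_for_scalar_code (PLUS_EXPR, &fn))
 : :        gcc_checking_assert (fn == IFN_REDUC_PLUS);
 : :
 : :    For MULT_EXPR the function also returns true but sets *REDUC_FN to
 : :    IFN_LAST, so the final reduction of the partial results has to be
 : :    open-coded.  */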
3821 : :
3822 : : bool
3823 : 1982588 : reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3824 : : {
3825 : 1982588 : if (code.is_tree_code ())
3826 : 1982534 : switch (tree_code (code))
3827 : : {
3828 : 14541 : case MAX_EXPR:
3829 : 14541 : *reduc_fn = IFN_REDUC_MAX;
3830 : 14541 : return true;
3831 : :
3832 : 50919 : case MIN_EXPR:
3833 : 50919 : *reduc_fn = IFN_REDUC_MIN;
3834 : 50919 : return true;
3835 : :
3836 : 1062397 : case PLUS_EXPR:
3837 : 1062397 : *reduc_fn = IFN_REDUC_PLUS;
3838 : 1062397 : return true;
3839 : :
3840 : 251218 : case BIT_AND_EXPR:
3841 : 251218 : *reduc_fn = IFN_REDUC_AND;
3842 : 251218 : return true;
3843 : :
3844 : 291246 : case BIT_IOR_EXPR:
3845 : 291246 : *reduc_fn = IFN_REDUC_IOR;
3846 : 291246 : return true;
3847 : :
3848 : 40296 : case BIT_XOR_EXPR:
3849 : 40296 : *reduc_fn = IFN_REDUC_XOR;
3850 : 40296 : return true;
3851 : :
3852 : 271917 : case MULT_EXPR:
3853 : 271917 : case MINUS_EXPR:
3854 : 271917 : *reduc_fn = IFN_LAST;
3855 : 271917 : return true;
3856 : :
3857 : : default:
3858 : : return false;
3859 : : }
3860 : : else
3861 : 54 : switch (combined_fn (code))
3862 : : {
3863 : 30 : CASE_CFN_FMAX:
3864 : 30 : *reduc_fn = IFN_REDUC_FMAX;
3865 : 30 : return true;
3866 : :
3867 : 24 : CASE_CFN_FMIN:
3868 : 24 : *reduc_fn = IFN_REDUC_FMIN;
3869 : 24 : return true;
3870 : :
3871 : : default:
3872 : : return false;
3873 : : }
3874 : : }
3875 : :
3876 : : /* If there is a neutral value X such that a reduction would not be affected
3877 : : by the introduction of additional X elements, return that X, otherwise
3878 : : return null. CODE is the code of the reduction and SCALAR_TYPE is the type
3879 : : of the scalar elements. If the reduction has just a single initial value
3880 : : then INITIAL_VALUE is that value, otherwise it is null.
3881 : : If AS_INITIAL is TRUE the value is to be used as the initial value;
3882 : : in that case no signed zero is returned. */
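 : :
 : : /* For exposition (not part of the original comment): the switch below
 : :    yields 0 for PLUS_EXPR, MINUS_EXPR and BIT_XOR_EXPR, 1 for MULT_EXPR,
 : :    an all-ones constant for BIT_AND_EXPR, and INITIAL_VALUE itself for
 : :    MIN_EXPR and MAX_EXPR.  For a PLUS_EXPR value that is not used as the
 : :    initial value and with signed zeros honored it yields -0.0, since
 : :    x + (-0.0) leaves every x, including -0.0, unchanged.  */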
3883 : :
3884 : : tree
3885 : 80741 : neutral_op_for_reduction (tree scalar_type, code_helper code,
3886 : : tree initial_value, bool as_initial)
3887 : : {
3888 : 80741 : if (code.is_tree_code ())
3889 : 80687 : switch (tree_code (code))
3890 : : {
3891 : 10211 : case DOT_PROD_EXPR:
3892 : 10211 : case SAD_EXPR:
3893 : 10211 : case MINUS_EXPR:
3894 : 10211 : case BIT_IOR_EXPR:
3895 : 10211 : case BIT_XOR_EXPR:
3896 : 10211 : return build_zero_cst (scalar_type);
3897 : 64671 : case WIDEN_SUM_EXPR:
3898 : 64671 : case PLUS_EXPR:
3899 : 64671 : if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
3900 : 15 : return build_real (scalar_type, dconstm0);
3901 : : else
3902 : 64656 : return build_zero_cst (scalar_type);
3903 : :
3904 : 1933 : case MULT_EXPR:
3905 : 1933 : return build_one_cst (scalar_type);
3906 : :
3907 : 1350 : case BIT_AND_EXPR:
3908 : 1350 : return build_all_ones_cst (scalar_type);
3909 : :
3910 : : case MAX_EXPR:
3911 : : case MIN_EXPR:
3912 : : return initial_value;
3913 : :
3914 : 387 : default:
3915 : 387 : return NULL_TREE;
3916 : : }
3917 : : else
3918 : 54 : switch (combined_fn (code))
3919 : : {
3920 : : CASE_CFN_FMIN:
3921 : : CASE_CFN_FMAX:
3922 : : return initial_value;
3923 : :
3924 : 0 : default:
3925 : 0 : return NULL_TREE;
3926 : : }
3927 : : }
3928 : :
3929 : : /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3930 : : STMT is printed with a message MSG. */
3931 : :
3932 : : static void
3933 : 471 : report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3934 : : {
3935 : 471 : dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3936 : 471 : }
3937 : :
3938 : : /* Return true if we need an in-order reduction for operation CODE
3939 : : on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
3940 : : overflow must wrap. */
3941 : :
3942 : : bool
3943 : 6298699 : needs_fold_left_reduction_p (tree type, code_helper code)
3944 : : {
3945 : : /* CHECKME: check for !flag_finite_math_only too? */
3946 : 6298699 : if (SCALAR_FLOAT_TYPE_P (type))
3947 : : {
3948 : 532855 : if (code.is_tree_code ())
3949 : 532806 : switch (tree_code (code))
3950 : : {
3951 : : case MIN_EXPR:
3952 : : case MAX_EXPR:
3953 : : return false;
3954 : :
3955 : 531297 : default:
3956 : 531297 : return !flag_associative_math;
3957 : : }
3958 : : else
3959 : 49 : switch (combined_fn (code))
3960 : : {
3961 : : CASE_CFN_FMIN:
3962 : : CASE_CFN_FMAX:
3963 : : return false;
3964 : :
3965 : 1 : default:
3966 : 1 : return !flag_associative_math;
3967 : : }
3968 : : }
3969 : :
3970 : 5765844 : if (INTEGRAL_TYPE_P (type))
3971 : 5765035 : return (!code.is_tree_code ()
3972 : 5765035 : || !operation_no_trapping_overflow (type, tree_code (code)));
3973 : :
3974 : 809 : if (SAT_FIXED_POINT_TYPE_P (type))
3975 : : return true;
3976 : :
3977 : : return false;
3978 : : }
3979 : :
3980 : : /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
3981 : : has a handled computation expression. Store the main reduction
3982 : : operation in *CODE. */
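 : :
 : : /* Illustrative example (added for exposition, not from the original
 : :    source): for
 : :
 : :      sum_1 = PHI <sum_0 (preheader), sum_2 (latch)>
 : :      ...
 : :      sum_2 = sum_1 + a_i;
 : :
 : :    the walk below starts at the latch argument sum_2, follows its
 : :    definition back to the use of sum_1 (the PHI result), records those
 : :    uses in PATH, sets *CODE to PLUS_EXPR and returns true.  */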
3983 : :
3984 : : static bool
3985 : 70281 : check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3986 : : tree loop_arg, code_helper *code,
3987 : : vec<std::pair<ssa_op_iter, use_operand_p> > &path,
3988 : : bool inner_loop_of_double_reduc)
3989 : : {
3990 : 70281 : auto_bitmap visited;
3991 : 70281 : tree lookfor = PHI_RESULT (phi);
3992 : 70281 : ssa_op_iter curri;
3993 : 70281 : use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3994 : 147364 : while (USE_FROM_PTR (curr) != loop_arg)
3995 : 6802 : curr = op_iter_next_use (&curri);
3996 : 70281 : curri.i = curri.numops;
3997 : 697908 : do
3998 : : {
3999 : 697908 : path.safe_push (std::make_pair (curri, curr));
4000 : 697908 : tree use = USE_FROM_PTR (curr);
4001 : 697908 : if (use == lookfor)
4002 : : break;
4003 : 627800 : gimple *def = SSA_NAME_DEF_STMT (use);
4004 : 627800 : if (gimple_nop_p (def)
4005 : 627800 : || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
4006 : : {
4007 : 536294 : pop:
4008 : 536294 : do
4009 : : {
4010 : 536294 : std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
4011 : 536294 : curri = x.first;
4012 : 536294 : curr = x.second;
4013 : 585229 : do
4014 : 585229 : curr = op_iter_next_use (&curri);
4015 : : /* Skip already visited or non-SSA operands (from iterating
4016 : : over PHI args). */
4017 : : while (curr != NULL_USE_OPERAND_P
4018 : 1170458 : && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
4019 : 201310 : || ! bitmap_set_bit (visited,
4020 : 201310 : SSA_NAME_VERSION
4021 : : (USE_FROM_PTR (curr)))));
4022 : : }
4023 : 1072588 : while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
4024 : 179129 : if (curr == NULL_USE_OPERAND_P)
4025 : : break;
4026 : : }
4027 : : else
4028 : : {
4029 : 522969 : if (gimple_code (def) == GIMPLE_PHI)
4030 : 52183 : curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
4031 : : else
4032 : 470786 : curr = op_iter_init_use (&curri, def, SSA_OP_USE);
4033 : : while (curr != NULL_USE_OPERAND_P
4034 : 624251 : && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
4035 : 546110 : || ! bitmap_set_bit (visited,
4036 : 546110 : SSA_NAME_VERSION
4037 : : (USE_FROM_PTR (curr)))))
4038 : 101282 : curr = op_iter_next_use (&curri);
4039 : 522969 : if (curr == NULL_USE_OPERAND_P)
4040 : 74298 : goto pop;
4041 : : }
4042 : : }
4043 : : while (1);
4044 : 70281 : if (dump_file && (dump_flags & TDF_DETAILS))
4045 : : {
4046 : 4069 : dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
4047 : 4069 : unsigned i;
4048 : 4069 : std::pair<ssa_op_iter, use_operand_p> *x;
4049 : 13893 : FOR_EACH_VEC_ELT (path, i, x)
4050 : 9824 : dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
4051 : 4069 : dump_printf (MSG_NOTE, "\n");
4052 : : }
4053 : :
4054 : : /* Check whether the reduction path detected is valid. */
4055 : 70281 : bool fail = path.length () == 0;
4056 : 70281 : bool neg = false;
4057 : 70281 : int sign = -1;
4058 : 70281 : *code = ERROR_MARK;
4059 : 152923 : for (unsigned i = 1; i < path.length (); ++i)
4060 : : {
4061 : 85579 : gimple *use_stmt = USE_STMT (path[i].second);
4062 : 85579 : gimple_match_op op;
4063 : 85579 : if (!gimple_extract_op (use_stmt, &op))
4064 : : {
4065 : : fail = true;
4066 : 2937 : break;
4067 : : }
4068 : 85027 : unsigned int opi = op.num_ops;
4069 : 85027 : if (gassign *assign = dyn_cast<gassign *> (use_stmt))
4070 : : {
4071 : : /* The following make sure we can compute the operand index
4072 : : easily plus it mostly disallows chaining via COND_EXPR condition
4073 : : operands. */
4074 : 135194 : for (opi = 0; opi < op.num_ops; ++opi)
4075 : 134255 : if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
4076 : : break;
4077 : : }
4078 : 3208 : else if (gcall *call = dyn_cast<gcall *> (use_stmt))
4079 : : {
4080 : 6410 : for (opi = 0; opi < op.num_ops; ++opi)
4081 : 6410 : if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
4082 : : break;
4083 : : }
4084 : 85027 : if (opi == op.num_ops)
4085 : : {
4086 : : fail = true;
4087 : : break;
4088 : : }
4089 : 84088 : op.code = canonicalize_code (op.code, op.type);
4090 : 84088 : if (op.code == MINUS_EXPR)
4091 : : {
4092 : 5132 : op.code = PLUS_EXPR;
4093 : : /* Track whether we negate the reduction value each iteration. */
4094 : 5132 : if (op.ops[1] == op.ops[opi])
4095 : 36 : neg = ! neg;
4096 : : }
4097 : 78956 : else if (op.code == IFN_COND_SUB)
4098 : : {
4099 : 3 : op.code = IFN_COND_ADD;
4100 : : /* Track whether we negate the reduction value each iteration. */
4101 : 3 : if (op.ops[2] == op.ops[opi])
4102 : 0 : neg = ! neg;
4103 : : }
4104 : 84088 : if (CONVERT_EXPR_CODE_P (op.code)
4105 : 84088 : && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
4106 : : ;
4107 : 79629 : else if (*code == ERROR_MARK)
4108 : : {
4109 : 68727 : *code = op.code;
4110 : 68727 : sign = TYPE_SIGN (op.type);
4111 : : }
4112 : 10902 : else if (op.code != *code)
4113 : : {
4114 : : fail = true;
4115 : : break;
4116 : : }
4117 : 9666 : else if ((op.code == MIN_EXPR
4118 : 9522 : || op.code == MAX_EXPR)
4119 : 9673 : && sign != TYPE_SIGN (op.type))
4120 : : {
4121 : : fail = true;
4122 : : break;
4123 : : }
4124 : : /* Check that there is only a single stmt the op is used on. For the
4125 : : non-value-changing tail and the last stmt allow out-of-loop uses,
4126 : : but not when this is the inner loop of a double reduction.
4127 : : ??? We could relax this and handle arbitrary live stmts by
4128 : : forcing a scalar epilogue for example. */
4129 : 82849 : imm_use_iterator imm_iter;
4130 : 82849 : use_operand_p use_p;
4131 : 82849 : gimple *op_use_stmt;
4132 : 82849 : unsigned cnt = 0;
4133 : 86055 : bool cond_fn_p = op.code.is_internal_fn ()
4134 : 3206 : && (conditional_internal_fn_code (internal_fn (op.code))
4135 : 82849 : != ERROR_MARK);
4136 : :
4137 : 193860 : FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
4138 : : {
4139 : : /* In case of a COND_OP (mask, op1, op2, op1) reduction we should
4140 : : have op1 twice (once as definition, once as else) in the same
4141 : : operation. Enforce this. */
4142 : 111011 : if (cond_fn_p && op_use_stmt == use_stmt)
4143 : : {
4144 : 3150 : gcall *call = as_a<gcall *> (use_stmt);
4145 : 3150 : unsigned else_pos
4146 : 3150 : = internal_fn_else_index (internal_fn (op.code));
4147 : 3150 : if (gimple_call_arg (call, else_pos) != op.ops[opi])
4148 : : {
4149 : : fail = true;
4150 : : break;
4151 : : }
4152 : 15750 : for (unsigned int j = 0; j < gimple_call_num_args (call); ++j)
4153 : : {
4154 : 12600 : if (j == else_pos)
4155 : 3150 : continue;
4156 : 9450 : if (gimple_call_arg (call, j) == op.ops[opi])
4157 : 3150 : cnt++;
4158 : : }
4159 : : }
4160 : 107861 : else if (!is_gimple_debug (op_use_stmt)
4161 : 107861 : && ((*code != ERROR_MARK || inner_loop_of_double_reduc)
4162 : 2280 : || flow_bb_inside_loop_p (loop,
4163 : 2280 : gimple_bb (op_use_stmt))))
4164 : 159955 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4165 : 79982 : cnt++;
4166 : 82849 : }
4167 : :
4168 : 82849 : if (cnt != 1)
4169 : : {
4170 : : fail = true;
4171 : : break;
4172 : : }
4173 : : }
4174 : 73400 : return ! fail && ! neg && *code != ERROR_MARK;
4175 : 70281 : }
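 : :
 : : /* Illustrative example (hypothetical statements, not from a testcase):
 : :    for a loop body containing
 : :      s1 = s0 + a[i];
 : :      s2 = s1 + b[i];
 : :    the path walked from the PHI result s0 to the latch value s2 is
 : :    s0 -> s1 -> s2 and *CODE ends up as PLUS_EXPR.  If a statement on the
 : :    path used a different code, say MIN_EXPR, or one of the intermediate
 : :    values had more than one use inside the loop, the checks above make
 : :    the function return false.  */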
4176 : :
4177 : : bool
4178 : 19 : check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
4179 : : tree loop_arg, enum tree_code code)
4180 : : {
4181 : 19 : auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4182 : 19 : code_helper code_;
4183 : 19 : return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path, false)
4184 : 19 : && code_ == code);
4185 : 19 : }
4186 : :
4187 : :
4188 : :
4189 : : /* Function vect_is_simple_reduction
4190 : :
4191 : : (1) Detect a cross-iteration def-use cycle that represents a simple
4192 : : reduction computation. We look for the following pattern:
4193 : :
4194 : : loop_header:
4195 : : a1 = phi < a0, a2 >
4196 : : a3 = ...
4197 : : a2 = operation (a3, a1)
4198 : :
4199 : : or
4200 : :
4201 : : a3 = ...
4202 : : loop_header:
4203 : : a1 = phi < a0, a2 >
4204 : : a2 = operation (a3, a1)
4205 : :
4206 : : such that:
4207 : : 1. operation is commutative and associative and it is safe to
4208 : : change the order of the computation
4209 : : 2. no uses for a2 in the loop (a2 is used out of the loop)
4210 : : 3. no uses of a1 in the loop besides the reduction operation
4211 : : 4. no uses of a1 outside the loop.
4212 : :
4213 : : Conditions 1,4 are tested here.
4214 : : Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
4215 : :
4216 : : (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
4217 : : nested cycles.
4218 : :
4219 : : (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
4220 : : reductions:
4221 : :
4222 : : a1 = phi < a0, a2 >
4223 : : inner loop (def of a3)
4224 : : a2 = phi < a3 >
4225 : :
4226 : : (4) Detect condition expressions, i.e.:
4227 : : for (int i = 0; i < N; i++)
4228 : : if (a[i] < val)
4229 : : ret_val = a[i];
4230 : :
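 : :    Illustrative source-level sketch of (3), not taken from a particular
 : :    testcase:
 : :
 : :      int sum = 0;
 : :      for (j = 0; j < M; j++)
 : :        for (i = 0; i < N; i++)
 : :          sum = sum + a[j][i];
 : :
 : :    Here the outer-loop PHI for sum plays the role of a1 and the
 : :    loop-closed PHI of the inner-loop reduction plays the role of a2.
 : :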
4231 : : */
4232 : :
4233 : : static stmt_vec_info
4234 : 92220 : vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
4235 : : bool *double_reduc, bool *reduc_chain_p, bool slp)
4236 : : {
4237 : 92220 : gphi *phi = as_a <gphi *> (phi_info->stmt);
4238 : 92220 : gimple *phi_use_stmt = NULL;
4239 : 92220 : imm_use_iterator imm_iter;
4240 : 92220 : use_operand_p use_p;
4241 : :
4242 : 92220 : *double_reduc = false;
4243 : 92220 : *reduc_chain_p = false;
4244 : 92220 : STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
4245 : :
4246 : 92220 : tree phi_name = PHI_RESULT (phi);
4247 : : /* ??? If there are no uses of the PHI result the inner loop reduction
4248 : : won't be detected as possibly double-reduction by vectorizable_reduction
4249 : : because that tries to walk the PHI arg from the preheader edge which
4250 : : can be constant. See PR60382. */
4251 : 92220 : if (has_zero_uses (phi_name))
4252 : : return NULL;
4253 : 92117 : class loop *loop = (gimple_bb (phi))->loop_father;
4254 : 92117 : unsigned nphi_def_loop_uses = 0;
4255 : 227603 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
4256 : : {
4257 : 140031 : gimple *use_stmt = USE_STMT (use_p);
4258 : 140031 : if (is_gimple_debug (use_stmt))
4259 : 34560 : continue;
4260 : :
4261 : 105471 : if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4262 : : {
4263 : 4545 : if (dump_enabled_p ())
4264 : 53 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4265 : : "intermediate value used outside loop.\n");
4266 : :
4267 : 4545 : return NULL;
4268 : : }
4269 : :
4270 : : /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4271 : : op1 twice (once as definition, once as else) in the same operation.
4272 : : Only count it as one. */
4273 : 100926 : if (use_stmt != phi_use_stmt)
4274 : : {
4275 : 97415 : nphi_def_loop_uses++;
4276 : 97415 : phi_use_stmt = use_stmt;
4277 : : }
4278 : : }
4279 : :
4280 : 87572 : tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
4281 : 87572 : if (TREE_CODE (latch_def) != SSA_NAME)
4282 : : {
4283 : 1210 : if (dump_enabled_p ())
4284 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4285 : : "reduction: not ssa_name: %T\n", latch_def);
4286 : 1210 : return NULL;
4287 : : }
4288 : :
4289 : 86362 : stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
4290 : 86362 : if (!def_stmt_info
4291 : 86362 : || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
4292 : 135 : return NULL;
4293 : :
4294 : 86227 : bool nested_in_vect_loop
4295 : 86227 : = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
4296 : 86227 : unsigned nlatch_def_loop_uses = 0;
4297 : 86227 : auto_vec<gphi *, 3> lcphis;
4298 : 86227 : bool inner_loop_of_double_reduc = false;
4299 : 327545 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
4300 : : {
4301 : 241318 : gimple *use_stmt = USE_STMT (use_p);
4302 : 241318 : if (is_gimple_debug (use_stmt))
4303 : 69191 : continue;
4304 : 172127 : if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4305 : 94667 : nlatch_def_loop_uses++;
4306 : : else
4307 : : {
4308 : : /* We can have more than one loop-closed PHI. */
4309 : 77460 : lcphis.safe_push (as_a <gphi *> (use_stmt));
4310 : 77460 : if (nested_in_vect_loop
4311 : 77460 : && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
4312 : : == vect_double_reduction_def))
4313 : : inner_loop_of_double_reduc = true;
4314 : : }
4315 : : }
4316 : :
4317 : : /* If we are vectorizing an inner reduction we are executing that
4318 : : /* If we are vectorizing an inner reduction we are executing it
4319 : : in the original order only when we are not dealing with a
4320 : 86227 : if (nested_in_vect_loop && !inner_loop_of_double_reduc)
4321 : : {
4322 : 2137 : if (dump_enabled_p ())
4323 : 370 : report_vect_op (MSG_NOTE, def_stmt_info->stmt,
4324 : : "detected nested cycle: ");
4325 : 2137 : return def_stmt_info;
4326 : : }
4327 : :
4328 : : /* When the inner loop of a double reduction ends up with more than
4329 : : one loop-closed PHI we have failed to classify alternate such
4330 : : PHIs as double reduction, leading to wrong code. See PR103237. */
4331 : 85044 : if (inner_loop_of_double_reduc && lcphis.length () != 1)
4332 : : {
4333 : 1 : if (dump_enabled_p ())
4334 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4335 : : "unhandled double reduction\n");
4336 : 1 : return NULL;
4337 : : }
4338 : :
4339 : : /* If this isn't a nested cycle or if the nested cycle reduction value
4340 : : is used ouside of the inner loop we cannot handle uses of the reduction
4341 : : is used outside of the inner loop we cannot handle uses of the reduction
4342 : 84089 : if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
4343 : : {
4344 : 12695 : if (dump_enabled_p ())
4345 : 311 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4346 : : "reduction used in loop.\n");
4347 : 12695 : return NULL;
4348 : : }
4349 : :
4350 : : /* If DEF_STMT is a phi node itself, we expect it to have a single argument
4351 : : defined in the inner loop. */
4352 : 71394 : if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
4353 : : {
4354 : 1132 : tree op1 = PHI_ARG_DEF (def_stmt, 0);
4355 : 1132 : if (gimple_phi_num_args (def_stmt) != 1
4356 : 1132 : || TREE_CODE (op1) != SSA_NAME)
4357 : : {
4358 : 52 : if (dump_enabled_p ())
4359 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4360 : : "unsupported phi node definition.\n");
4361 : :
4362 : 52 : return NULL;
4363 : : }
4364 : :
4365 : : /* Verify there is an inner cycle composed of the PHI phi_use_stmt
4366 : : and the latch definition op1. */
4367 : 1080 : gimple *def1 = SSA_NAME_DEF_STMT (op1);
4368 : 1080 : if (gimple_bb (def1)
4369 : 1080 : && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4370 : 1080 : && loop->inner
4371 : 1072 : && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
4372 : 1072 : && (is_gimple_assign (def1) || is_gimple_call (def1))
4373 : 1063 : && is_a <gphi *> (phi_use_stmt)
4374 : 1052 : && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
4375 : 1052 : && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
4376 : : loop_latch_edge (loop->inner)))
4377 : 2130 : && lcphis.length () == 1)
4378 : : {
4379 : 966 : if (dump_enabled_p ())
4380 : 101 : report_vect_op (MSG_NOTE, def_stmt,
4381 : : "detected double reduction: ");
4382 : :
4383 : 966 : *double_reduc = true;
4384 : 966 : return def_stmt_info;
4385 : : }
4386 : :
4387 : 114 : return NULL;
4388 : : }
4389 : :
4390 : : /* Look for the expression computing latch_def from the loop PHI result. */
4391 : 70262 : auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4392 : 70262 : code_helper code;
4393 : 70262 : if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
4394 : : path, inner_loop_of_double_reduc))
4395 : : {
4396 : 67143 : STMT_VINFO_REDUC_CODE (phi_info) = code;
4397 : 67143 : if (code == COND_EXPR && !nested_in_vect_loop)
4398 : 4136 : STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
4399 : :
4400 : : /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
4401 : : reduction chain for which the additional restriction is that
4402 : : all operations in the chain are the same. */
4403 : 67143 : auto_vec<stmt_vec_info, 8> reduc_chain;
4404 : 67143 : unsigned i;
4405 : 67143 : bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
4406 : 215356 : for (i = path.length () - 1; i >= 1; --i)
4407 : : {
4408 : 81070 : gimple *stmt = USE_STMT (path[i].second);
4409 : 81070 : stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
4410 : 81070 : gimple_match_op op;
4411 : 81070 : if (!gimple_extract_op (stmt, &op))
4412 : 0 : gcc_unreachable ();
4413 : 81070 : if (gassign *assign = dyn_cast<gassign *> (stmt))
4414 : 77882 : STMT_VINFO_REDUC_IDX (stmt_info)
4415 : 77882 : = path[i].second->use - gimple_assign_rhs1_ptr (assign);
4416 : : else
4417 : : {
4418 : 3188 : gcall *call = as_a<gcall *> (stmt);
4419 : 3188 : STMT_VINFO_REDUC_IDX (stmt_info)
4420 : 3188 : = path[i].second->use - gimple_call_arg_ptr (call, 0);
4421 : : }
4422 : 81070 : bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
4423 : 81070 : && (i == 1 || i == path.length () - 1));
4424 : 9409 : if ((op.code != code && !leading_conversion)
4425 : : /* We can only handle the final value in epilogue
4426 : : generation for reduction chains. */
4427 : 85427 : || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
4428 : : is_slp_reduc = false;
4429 : : /* For reduction chains we support trailing/leading
4430 : : conversions. We do not store those in the actual chain. */
4431 : 81070 : if (leading_conversion)
4432 : 4357 : continue;
4433 : 76713 : reduc_chain.safe_push (stmt_info);
4434 : : }
4435 : 124213 : if (slp && is_slp_reduc && reduc_chain.length () > 1)
4436 : : {
4437 : 3673 : for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
4438 : : {
4439 : 2846 : REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
4440 : 2846 : REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
4441 : : }
4442 : 827 : REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
4443 : 827 : REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
4444 : :
4445 : : /* Save the chain for further analysis in SLP detection. */
4446 : 827 : LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
4447 : 1654 : REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
4448 : :
4449 : 827 : *reduc_chain_p = true;
4450 : 827 : if (dump_enabled_p ())
4451 : 298 : dump_printf_loc (MSG_NOTE, vect_location,
4452 : : "reduction: detected reduction chain\n");
4453 : : }
4454 : 66316 : else if (dump_enabled_p ())
4455 : 3699 : dump_printf_loc (MSG_NOTE, vect_location,
4456 : : "reduction: detected reduction\n");
4457 : :
4458 : 67143 : return def_stmt_info;
4459 : 67143 : }
4460 : :
4461 : 3119 : if (dump_enabled_p ())
4462 : 91 : dump_printf_loc (MSG_NOTE, vect_location,
4463 : : "reduction: unknown pattern\n");
4464 : :
4465 : : return NULL;
4466 : 156489 : }
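 : :
 : : /* Illustrative example (hypothetical source, not from a testcase):
 : :    a loop body such as
 : :      s = s + a[i];
 : :      s = s + b[i];
 : :      s = s + c[i];
 : :    yields a reduction path of three PLUS_EXPR statements with the same
 : :    code; when SLP is requested they are linked via the REDUC_GROUP
 : :    fields into a reduction chain of size 3 and recorded in
 : :    LOOP_VINFO_REDUCTION_CHAINS for later SLP detection.  */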
4467 : :
4468 : : /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
4469 : : PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
4470 : : or -1 if not known. */
4471 : :
4472 : : static int
4473 : 360375 : vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
4474 : : {
4475 : 360375 : int assumed_vf = vect_vf_for_cost (loop_vinfo);
4476 : 360375 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
4477 : : {
4478 : 140488 : if (dump_enabled_p ())
4479 : 2737 : dump_printf_loc (MSG_NOTE, vect_location,
4480 : : "cost model: epilogue peel iters set to vf/2 "
4481 : : "because loop iterations are unknown.\n");
4482 : 140488 : return assumed_vf / 2;
4483 : : }
4484 : : else
4485 : : {
4486 : 219887 : int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
4487 : 219887 : peel_iters_prologue = MIN (niters, peel_iters_prologue);
4488 : 219887 : int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
4489 : : /* If we need to peel for gaps, but no peeling is required, we have to
4490 : : peel VF iterations. */
4491 : 219887 : if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
4492 : 219887 : peel_iters_epilogue = assumed_vf;
4493 : 219887 : return peel_iters_epilogue;
4494 : : }
4495 : : }
4496 : :
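 : :
 : : /* Worked example (numbers made up): with a known NITERS of 100, an
 : :    assumed VF of 8 and PEEL_ITERS_PROLOGUE = 3, the epilogue peels
 : :    (100 - 3) % 8 = 1 iteration.  If that remainder were 0 but
 : :    LOOP_VINFO_PEELING_FOR_GAPS were set, a full VF of 8 iterations
 : :    would be assumed instead.  */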
4497 : : /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
4498 : : int
4499 : 281958 : vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
4500 : : int *peel_iters_epilogue,
4501 : : stmt_vector_for_cost *scalar_cost_vec,
4502 : : stmt_vector_for_cost *prologue_cost_vec,
4503 : : stmt_vector_for_cost *epilogue_cost_vec)
4504 : : {
4505 : 281958 : int retval = 0;
4506 : :
4507 : 281958 : *peel_iters_epilogue
4508 : 281958 : = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
4509 : :
4510 : 281958 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
4511 : : {
4512 : : /* If peeled iterations are known but the number of scalar loop
4513 : : iterations is unknown, count a taken branch per peeled loop. */
4514 : 93900 : if (peel_iters_prologue > 0)
4515 : 54363 : retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
4516 : : vect_prologue);
4517 : 93900 : if (*peel_iters_epilogue > 0)
4518 : 93828 : retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
4519 : : vect_epilogue);
4520 : : }
4521 : :
4522 : 281958 : stmt_info_for_cost *si;
4523 : 281958 : int j;
4524 : 281958 : if (peel_iters_prologue)
4525 : 685017 : FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4526 : 569404 : retval += record_stmt_cost (prologue_cost_vec,
4527 : 569404 : si->count * peel_iters_prologue,
4528 : : si->kind, si->stmt_info, si->misalign,
4529 : : vect_prologue);
4530 : 281958 : if (*peel_iters_epilogue)
4531 : 1113374 : FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4532 : 924170 : retval += record_stmt_cost (epilogue_cost_vec,
4533 : 924170 : si->count * *peel_iters_epilogue,
4534 : : si->kind, si->stmt_info, si->misalign,
4535 : : vect_epilogue);
4536 : :
4537 : 281958 : return retval;
4538 : : }
4539 : :
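 : :
 : : /* Illustrative sketch (hypothetical counts): with PEEL_ITERS_PROLOGUE = 2
 : :    and a scalar cost vector holding three entries of count 1 each, the
 : :    loops above record 2 copies of every entry against the prologue and
 : :    *PEEL_ITERS_EPILOGUE copies against the epilogue.  */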
4540 : : /* Function vect_estimate_min_profitable_iters
4541 : :
4542 : : Return the number of iterations required for the vector version of the
4543 : : loop to be profitable relative to the cost of the scalar version of the
4544 : : loop.
4545 : :
4546 : : *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4547 : : of iterations for vectorization. -1 value means loop vectorization
4548 : : is not profitable. This returned value may be used for dynamic
4549 : : profitability check.
4550 : :
4551 : : *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4552 : : for static check against estimated number of iterations. */
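 : :
 : : /* Illustrative use, as a sketch rather than a description of a specific
 : :    caller: the runtime threshold is intended for a guard of the form
 : :      if (niters >= min_profitable_niters) goto vector_loop;
 : :    while the static estimate is compared against the estimated trip
 : :    count to reject vectorization at compile time.  */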
4553 : :
4554 : : static void
4555 : 94367 : vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
4556 : : int *ret_min_profitable_niters,
4557 : : int *ret_min_profitable_estimate,
4558 : : unsigned *suggested_unroll_factor)
4559 : : {
4560 : 94367 : int min_profitable_iters;
4561 : 94367 : int min_profitable_estimate;
4562 : 94367 : int peel_iters_prologue;
4563 : 94367 : int peel_iters_epilogue;
4564 : 94367 : unsigned vec_inside_cost = 0;
4565 : 94367 : int vec_outside_cost = 0;
4566 : 94367 : unsigned vec_prologue_cost = 0;
4567 : 94367 : unsigned vec_epilogue_cost = 0;
4568 : 94367 : int scalar_single_iter_cost = 0;
4569 : 94367 : int scalar_outside_cost = 0;
4570 : 94367 : int assumed_vf = vect_vf_for_cost (loop_vinfo);
4571 : 94367 : int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
4572 : 94367 : vector_costs *target_cost_data = loop_vinfo->vector_costs;
4573 : :
4574 : : /* Cost model disabled. */
4575 : 94367 : if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
4576 : : {
4577 : 15828 : if (dump_enabled_p ())
4578 : 9640 : dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4579 : 15828 : *ret_min_profitable_niters = 0;
4580 : 15828 : *ret_min_profitable_estimate = 0;
4581 : 15828 : return;
4582 : : }
4583 : :
4584 : : /* Requires loop versioning tests to handle misalignment. */
4585 : 78539 : if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4586 : : {
4587 : : /* FIXME: Make cost depend on complexity of individual check. */
4588 : 16 : unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4589 : 16 : (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4590 : 16 : if (dump_enabled_p ())
4591 : 1 : dump_printf (MSG_NOTE,
4592 : : "cost model: Adding cost of checks for loop "
4593 : : "versioning to treat misalignment.\n");
4594 : : }
4595 : :
4596 : : /* Requires loop versioning with alias checks. */
4597 : 78539 : if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4598 : : {
4599 : : /* FIXME: Make cost depend on complexity of individual check. */
4600 : 3972 : unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4601 : 3972 : (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4602 : 3972 : len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4603 : 0 : if (len)
4604 : : /* Count LEN - 1 ANDs and LEN comparisons. */
4605 : 0 : (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4606 : : scalar_stmt, vect_prologue);
4607 : 3972 : len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4608 : 1102 : if (len)
4609 : : {
4610 : : /* Count LEN - 1 ANDs and LEN comparisons. */
4611 : 1102 : unsigned int nstmts = len * 2 - 1;
4612 : : /* +1 for each bias that needs adding. */
4613 : 2204 : for (unsigned int i = 0; i < len; ++i)
4614 : 1102 : if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4615 : 119 : nstmts += 1;
4616 : 1102 : (void) add_stmt_cost (target_cost_data, nstmts,
4617 : : scalar_stmt, vect_prologue);
4618 : : }
4619 : 3972 : if (dump_enabled_p ())
4620 : 14 : dump_printf (MSG_NOTE,
4621 : : "cost model: Adding cost of checks for loop "
4622 : : "versioning aliasing.\n");
4623 : : }
4624 : :
4625 : : /* Requires loop versioning with niter checks. */
4626 : 78539 : if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4627 : : {
4628 : : /* FIXME: Make cost depend on complexity of individual check. */
4629 : 671 : (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4630 : : NULL, NULL, NULL_TREE, 0, vect_prologue);
4631 : 671 : if (dump_enabled_p ())
4632 : 1 : dump_printf (MSG_NOTE,
4633 : : "cost model: Adding cost of checks for loop "
4634 : : "versioning niters.\n");
4635 : : }
4636 : :
4637 : 78539 : if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4638 : 4643 : (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4639 : : vect_prologue);
4640 : :
4641 : : /* Count statements in scalar loop. Using this as scalar cost for a single
4642 : : iteration for now.
4643 : :
4644 : : TODO: Add outer loop support.
4645 : :
4646 : : TODO: Consider assigning different costs to different scalar
4647 : : statements. */
4648 : :
4649 : 78539 : scalar_single_iter_cost = (loop_vinfo->scalar_costs->total_cost ()
4650 : 78539 : * param_vect_scalar_cost_multiplier) / 100;
4651 : :
4652 : : /* Add additional cost for the peeled instructions in prologue and epilogue
4653 : : loop. (For fully-masked loops there will be no peeling.)
4654 : :
4655 : : FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4656 : : at compile time, we assume it's vf/2 (the worst case would be vf-1).
4657 : :
4658 : : TODO: Build an expression that represents peel_iters for prologue and
4659 : : epilogue to be used in a run-time test. */
4660 : :
4661 : 78539 : bool prologue_need_br_taken_cost = false;
4662 : 78539 : bool prologue_need_br_not_taken_cost = false;
4663 : :
4664 : : /* Calculate peel_iters_prologue. */
4665 : 78539 : if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4666 : : peel_iters_prologue = 0;
4667 : 78539 : else if (npeel < 0)
4668 : : {
4669 : 107 : peel_iters_prologue = assumed_vf / 2;
4670 : 107 : if (dump_enabled_p ())
4671 : 4 : dump_printf (MSG_NOTE, "cost model: "
4672 : : "prologue peel iters set to vf/2.\n");
4673 : :
4674 : : /* If peeled iterations are unknown, count a taken branch and a not taken
4675 : : branch per peeled loop. Even if scalar loop iterations are known,
4676 : : vector iterations are not known since peeled prologue iterations are
4677 : : not known. Hence guards remain the same. */
4678 : : prologue_need_br_taken_cost = true;
4679 : : prologue_need_br_not_taken_cost = true;
4680 : : }
4681 : : else
4682 : : {
4683 : 78432 : peel_iters_prologue = npeel;
4684 : 78432 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4685 : : /* If peeled iterations are known but the number of scalar loop
4686 : : iterations is unknown, count a taken branch per peeled loop. */
4687 : 78539 : prologue_need_br_taken_cost = true;
4688 : : }
4689 : :
4690 : 78539 : bool epilogue_need_br_taken_cost = false;
4691 : 78539 : bool epilogue_need_br_not_taken_cost = false;
4692 : :
4693 : : /* Calculate peel_iters_epilogue. */
4694 : 78539 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4695 : : /* We need to peel exactly one iteration for gaps. */
4696 : 15 : peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4697 : 78524 : else if (npeel < 0)
4698 : : {
4699 : : /* If peeling for alignment is unknown, the loop bound of the main loop
4700 : : becomes unknown. */
4701 : 107 : peel_iters_epilogue = assumed_vf / 2;
4702 : 107 : if (dump_enabled_p ())
4703 : 4 : dump_printf (MSG_NOTE, "cost model: "
4704 : : "epilogue peel iters set to vf/2 because "
4705 : : "peeling for alignment is unknown.\n");
4706 : :
4707 : : /* See the same reason above in peel_iters_prologue calculation. */
4708 : : epilogue_need_br_taken_cost = true;
4709 : : epilogue_need_br_not_taken_cost = true;
4710 : : }
4711 : : else
4712 : : {
4713 : 78417 : peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4714 : 78417 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4715 : : /* If peeled iterations are known but the number of scalar loop
4716 : : iterations is unknown, count a taken branch per peeled loop. */
4717 : 78539 : epilogue_need_br_taken_cost = true;
4718 : : }
4719 : :
4720 : 78539 : stmt_info_for_cost *si;
4721 : 78539 : int j;
4722 : : /* Add costs associated with peel_iters_prologue. */
4723 : 78539 : if (peel_iters_prologue)
4724 : 496 : FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4725 : : {
4726 : 380 : (void) add_stmt_cost (target_cost_data,
4727 : 380 : si->count * peel_iters_prologue, si->kind,
4728 : : si->stmt_info, si->node, si->vectype,
4729 : : si->misalign, vect_prologue);
4730 : : }
4731 : :
4732 : : /* Add costs associated with peel_iters_epilogue. */
4733 : 78539 : if (peel_iters_epilogue)
4734 : 267534 : FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4735 : : {
4736 : 212428 : (void) add_stmt_cost (target_cost_data,
4737 : 212428 : si->count * peel_iters_epilogue, si->kind,
4738 : : si->stmt_info, si->node, si->vectype,
4739 : : si->misalign, vect_epilogue);
4740 : : }
4741 : :
4742 : : /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4743 : :
4744 : 78539 : if (prologue_need_br_taken_cost)
4745 : 108 : (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4746 : : vect_prologue);
4747 : :
4748 : 78539 : if (prologue_need_br_not_taken_cost)
4749 : 107 : (void) add_stmt_cost (target_cost_data, 1,
4750 : : cond_branch_not_taken, vect_prologue);
4751 : :
4752 : 78539 : if (epilogue_need_br_taken_cost)
4753 : 46157 : (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4754 : : vect_epilogue);
4755 : :
4756 : 78539 : if (epilogue_need_br_not_taken_cost)
4757 : 107 : (void) add_stmt_cost (target_cost_data, 1,
4758 : : cond_branch_not_taken, vect_epilogue);
4759 : :
4760 : : /* Take care of special costs for rgroup controls of partial vectors. */
4761 : 15 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4762 : 78554 : && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4763 : : == vect_partial_vectors_avx512))
4764 : : {
4765 : : /* Calculate how many masks we need to generate. */
4766 : 15 : unsigned int num_masks = 0;
4767 : 15 : bool need_saturation = false;
4768 : 62 : for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
4769 : 17 : if (rgm.type)
4770 : : {
4771 : 15 : unsigned nvectors = rgm.factor;
4772 : 15 : num_masks += nvectors;
4773 : 15 : if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
4774 : 15 : < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
4775 : 4 : need_saturation = true;
4776 : : }
4777 : :
4778 : : /* ??? The target isn't able to identify the costs below as
4779 : : producing masks so it cannot penalize cases where we'd run
4780 : : out of mask registers for example. */
4781 : :
4782 : : /* ??? We are also failing to account for smaller vector masks
4783 : : we generate by splitting larger masks in vect_get_loop_mask. */
4784 : :
4785 : : /* In the worst case, we need to generate each mask in the prologue
4786 : : and in the loop body. We need one splat per group and one
4787 : : compare per mask.
4788 : :
4789 : : Sometimes the prologue mask will fold to a constant,
4790 : : so the actual prologue cost might be smaller. However, it's
4791 : : simpler and safer to use the worst-case cost; if this ends up
4792 : : being the tie-breaker between vectorizing or not, then it's
4793 : : probably better not to vectorize. */
4794 : 15 : (void) add_stmt_cost (target_cost_data,
4795 : : num_masks
4796 : 15 : + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4797 : : vector_stmt, NULL, NULL, NULL_TREE, 0,
4798 : : vect_prologue);
4799 : 30 : (void) add_stmt_cost (target_cost_data,
4800 : : num_masks
4801 : 30 : + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4802 : : vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);
4803 : :
4804 : : /* When we need saturation we need it both in the prologue and
4805 : : the epilogue. */
4806 : 15 : if (need_saturation)
4807 : : {
4808 : 4 : (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4809 : : NULL, NULL, NULL_TREE, 0, vect_prologue);
4810 : 4 : (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4811 : : NULL, NULL, NULL_TREE, 0, vect_body);
4812 : : }
4813 : : }
4814 : 0 : else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4815 : 78524 : && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4816 : : == vect_partial_vectors_while_ult))
4817 : : {
4818 : : /* Calculate how many masks we need to generate. */
4819 : : unsigned int num_masks = 0;
4820 : : rgroup_controls *rgm;
4821 : : unsigned int num_vectors_m1;
4822 : 0 : FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
4823 : : num_vectors_m1, rgm)
4824 : 0 : if (rgm->type)
4825 : 0 : num_masks += num_vectors_m1 + 1;
4826 : 0 : gcc_assert (num_masks > 0);
4827 : :
4828 : : /* In the worst case, we need to generate each mask in the prologue
4829 : : and in the loop body. One of the loop body mask instructions
4830 : : replaces the comparison in the scalar loop, and since we don't
4831 : : count the scalar comparison against the scalar body, we shouldn't
4832 : : count that vector instruction against the vector body either.
4833 : :
4834 : : Sometimes we can use unpacks instead of generating prologue
4835 : : masks and sometimes the prologue mask will fold to a constant,
4836 : : so the actual prologue cost might be smaller. However, it's
4837 : : simpler and safer to use the worst-case cost; if this ends up
4838 : : being the tie-breaker between vectorizing or not, then it's
4839 : : probably better not to vectorize. */
4840 : 0 : (void) add_stmt_cost (target_cost_data, num_masks,
4841 : : vector_stmt, NULL, NULL, NULL_TREE, 0,
4842 : : vect_prologue);
4843 : 0 : (void) add_stmt_cost (target_cost_data, num_masks - 1,
4844 : : vector_stmt, NULL, NULL, NULL_TREE, 0,
4845 : : vect_body);
4846 : : }
4847 : 78524 : else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4848 : : {
4849 : : /* Referring to the functions vect_set_loop_condition_partial_vectors
4850 : : and vect_set_loop_controls_directly, we need to generate each
4851 : : length in the prologue and in the loop body if required. Although
4852 : : there are some possible optimizations, we consider the worst case
4853 : : here. */
4854 : :
4855 : 0 : bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4856 : 0 : signed char partial_load_store_bias
4857 : : = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4858 : 0 : bool need_iterate_p
4859 : 0 : = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4860 : 0 : && !vect_known_niters_smaller_than_vf (loop_vinfo));
4861 : :
4862 : : /* Calculate how many statements to be added. */
4863 : 0 : unsigned int prologue_stmts = 0;
4864 : 0 : unsigned int body_stmts = 0;
4865 : :
4866 : 0 : rgroup_controls *rgc;
4867 : 0 : unsigned int num_vectors_m1;
4868 : 0 : FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4869 : 0 : if (rgc->type)
4870 : : {
4871 : : /* May need one SHIFT for nitems_total computation. */
4872 : 0 : unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4873 : 0 : if (nitems != 1 && !niters_known_p)
4874 : 0 : prologue_stmts += 1;
4875 : :
4876 : : /* May need one MAX and one MINUS for wrap around. */
4877 : 0 : if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4878 : 0 : prologue_stmts += 2;
4879 : :
4880 : : /* Need one MAX and one MINUS for each batch limit except for
4881 : : the first one. */
4882 : 0 : prologue_stmts += num_vectors_m1 * 2;
4883 : :
4884 : 0 : unsigned int num_vectors = num_vectors_m1 + 1;
4885 : :
4886 : : /* Need to set up lengths in prologue, only one MIN required
4887 : : for each since start index is zero. */
4888 : 0 : prologue_stmts += num_vectors;
4889 : :
4890 : : /* If we have a non-zero partial load bias, we need one PLUS
4891 : : to adjust the load length. */
4892 : 0 : if (partial_load_store_bias != 0)
4893 : 0 : body_stmts += 1;
4894 : :
4895 : 0 : unsigned int length_update_cost = 0;
4896 : 0 : if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
4897 : : /* For the decrement IV style, each length only needs a single
4898 : : SELECT_VL or MIN to calculate the number of elements to be
4899 : : processed in the current iteration. */
4900 : : length_update_cost = 1;
4901 : : else
4902 : : /* For the increment IV style, each length may need two MINs and one
4903 : : MINUS to update the lengths in the body for the next iteration. */
4904 : 0 : length_update_cost = 3;
4905 : :
4906 : 0 : if (need_iterate_p)
4907 : 0 : body_stmts += length_update_cost * num_vectors;
4908 : : }
4909 : :
4910 : 0 : (void) add_stmt_cost (target_cost_data, prologue_stmts,
4911 : : scalar_stmt, vect_prologue);
4912 : 0 : (void) add_stmt_cost (target_cost_data, body_stmts,
4913 : : scalar_stmt, vect_body);
4914 : : }
4915 : :
4916 : : /* FORNOW: The scalar outside cost is incremented in one of the
4917 : : following ways:
4918 : :
4919 : : 1. The vectorizer checks for alignment and aliasing and generates
4920 : : a condition that allows dynamic vectorization. A cost model
4921 : : check is ANDED with the versioning condition. Hence scalar code
4922 : : path now has the added cost of the versioning check.
4923 : :
4924 : : if (cost > th & versioning_check)
4925 : : jmp to vector code
4926 : :
4927 : : Hence run-time scalar is incremented by not-taken branch cost.
4928 : : Hence the run-time scalar cost is incremented by the not-taken branch cost.
4929 : : 2. The vectorizer then checks if a prologue is required. If the
4930 : : cost model check was not done before during versioning, it has to
4931 : : be done before the prologue check.
4932 : :
4933 : : if (cost <= th)
4934 : : prologue = scalar_iters
4935 : : if (prologue == 0)
4936 : : jmp to vector code
4937 : : else
4938 : : execute prologue
4939 : : if (prologue == num_iters)
4940 : : go to exit
4941 : :
4942 : : Hence the run-time scalar cost is incremented by a taken branch,
4943 : : plus a not-taken branch, plus a taken branch cost.
4944 : :
4945 : : 3. The vectorizer then checks if an epilogue is required. If the
4946 : : cost model check was not done before during prologue check, it
4947 : : has to be done with the epilogue check.
4948 : :
4949 : : if (prologue == 0)
4950 : : jmp to vector code
4951 : : else
4952 : : execute prologue
4953 : : if (prologue == num_iters)
4954 : : go to exit
4955 : : vector code:
4956 : : if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4957 : : jmp to epilogue
4958 : :
4959 : : Hence the run-time scalar cost should be incremented by 2 taken
4960 : : branches.
4961 : :
4962 : : TODO: The back end may reorder the BBS's differently and reverse
4963 : : TODO: The back end may reorder the BBs differently and reverse
4964 : : something more reasonable. */
4965 : :
4966 : : /* If the number of iterations is known and we do not do versioning, we can
4967 : : decide whether to vectorize at compile time. Hence the scalar version
4968 : : does not carry cost model guard costs. */
4969 : 31846 : if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4970 : 110385 : || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4971 : : {
4972 : : /* Cost model check occurs at versioning. */
4973 : 47278 : if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4974 : 4643 : scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4975 : : else
4976 : : {
4977 : : /* Cost model check occurs at prologue generation. */
4978 : 42635 : if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4979 : 26 : scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4980 : 26 : + vect_get_stmt_cost (cond_branch_not_taken);
4981 : : /* Cost model check occurs at epilogue generation. */
4982 : : else
4983 : 42609 : scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4984 : : }
4985 : : }
4986 : :
4987 : : /* Complete the target-specific cost calculations. */
4988 : 78539 : loop_vinfo->vector_costs->finish_cost (loop_vinfo->scalar_costs);
4989 : 78539 : vec_prologue_cost = loop_vinfo->vector_costs->prologue_cost ();
4990 : 78539 : vec_inside_cost = loop_vinfo->vector_costs->body_cost ();
4991 : 78539 : vec_epilogue_cost = loop_vinfo->vector_costs->epilogue_cost ();
4992 : 78539 : if (suggested_unroll_factor)
4993 : 78539 : *suggested_unroll_factor
4994 : 78539 : = loop_vinfo->vector_costs->suggested_unroll_factor ();
4995 : :
4996 : 78539 : if (suggested_unroll_factor && *suggested_unroll_factor > 1
4997 : 0 : && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
4998 : 0 : && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
4999 : : *suggested_unroll_factor,
5000 : : LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
5001 : : {
5002 : 0 : if (dump_enabled_p ())
5003 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5004 : : "can't unroll as unrolled vectorization factor larger"
5005 : : " than maximum vectorization factor: "
5006 : : HOST_WIDE_INT_PRINT_UNSIGNED "\n",
5007 : : LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
5008 : 0 : *suggested_unroll_factor = 1;
5009 : : }
5010 : :
5011 : 78539 : vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
5012 : :
5013 : 78539 : if (dump_enabled_p ())
5014 : : {
5015 : 604 : dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
5016 : 604 : dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
5017 : : vec_inside_cost);
5018 : 604 : dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
5019 : : vec_prologue_cost);
5020 : 604 : dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
5021 : : vec_epilogue_cost);
5022 : 604 : dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
5023 : : scalar_single_iter_cost);
5024 : 604 : dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
5025 : : scalar_outside_cost);
5026 : 604 : dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
5027 : : vec_outside_cost);
5028 : 604 : dump_printf (MSG_NOTE, " prologue iterations: %d\n",
5029 : : peel_iters_prologue);
5030 : 604 : dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
5031 : : peel_iters_epilogue);
5032 : : }
5033 : :
5034 : : /* Calculate number of iterations required to make the vector version
5035 : : profitable, relative to the loop bodies only. The following condition
5036 : : must hold true:
5037 : : SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
5038 : : where
5039 : : SIC = scalar iteration cost, VIC = vector iteration cost,
5040 : : VOC = vector outside cost, VF = vectorization factor,
5041 : : NPEEL = prologue iterations + epilogue iterations,
5042 : : SOC = scalar outside cost for run time cost model check. */
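 : :
 : : /* Worked example with made-up costs: SIC = 4, VIC = 10, VF = 4,
 : :    NPEEL = 2, SOC = 0 and VOC = 20 give
 : :      4 * niters > 10 * (niters - 2) / 4 + 20,
 : :    which first holds for niters > 10, so the threshold computed below
 : :    comes out at roughly 10-11 scalar iterations.  */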
5043 : :
5044 : 78539 : int saving_per_viter = (scalar_single_iter_cost * assumed_vf
5045 : 78539 : - vec_inside_cost);
5046 : 78539 : if (saving_per_viter <= 0)
5047 : : {
5048 : 25019 : if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
5049 : 0 : warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
5050 : : "vectorization did not happen for a simd loop");
5051 : :
5052 : 25019 : if (dump_enabled_p ())
5053 : 18 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5054 : : "cost model: the vector iteration cost = %d "
5055 : : "divided by the scalar iteration cost = %d "
5056 : : "is greater than or equal to the vectorization factor = %d"
5057 : : ".\n",
5058 : : vec_inside_cost, scalar_single_iter_cost, assumed_vf);
5059 : 25019 : *ret_min_profitable_niters = -1;
5060 : 25019 : *ret_min_profitable_estimate = -1;
5061 : 25019 : return;
5062 : : }
5063 : :
5064 : : /* ??? The "if" arm is written to handle all cases; see below for what
5065 : : we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
5066 : 53520 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5067 : : {
5068 : : /* Rewriting the condition above in terms of the number of
5069 : : vector iterations (vniters) rather than the number of
5070 : : scalar iterations (niters) gives:
5071 : :
5072 : : SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
5073 : :
5074 : : <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
5075 : :
5076 : : For integer N, X and Y when X > 0:
5077 : :
5078 : : N * X > Y <==> N >= (Y /[floor] X) + 1. */
5079 : 9 : int outside_overhead = (vec_outside_cost
5080 : 9 : - scalar_single_iter_cost * peel_iters_prologue
5081 : 9 : - scalar_single_iter_cost * peel_iters_epilogue
5082 : : - scalar_outside_cost);
5083 : : /* We're only interested in cases that require at least one
5084 : : vector iteration. */
5085 : 9 : int min_vec_niters = 1;
5086 : 9 : if (outside_overhead > 0)
5087 : 7 : min_vec_niters = outside_overhead / saving_per_viter + 1;
5088 : :
5089 : 9 : if (dump_enabled_p ())
5090 : 2 : dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
5091 : : min_vec_niters);
5092 : :
5093 : 9 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5094 : : {
5095 : : /* Now that we know the minimum number of vector iterations,
5096 : : find the minimum niters for which the scalar cost is larger:
5097 : :
5098 : : SIC * niters > VIC * vniters + VOC - SOC
5099 : :
5100 : : We know that the minimum niters is no more than
5101 : : vniters * VF + NPEEL, but it might be (and often is) less
5102 : : than that if a partial vector iteration is cheaper than the
5103 : : equivalent scalar code. */
5104 : 9 : int threshold = (vec_inside_cost * min_vec_niters
5105 : 9 : + vec_outside_cost
5106 : 9 : - scalar_outside_cost);
5107 : 9 : if (threshold <= 0)
5108 : : min_profitable_iters = 1;
5109 : : else
5110 : 9 : min_profitable_iters = threshold / scalar_single_iter_cost + 1;
5111 : : }
5112 : : else
5113 : : /* Convert the number of vector iterations into a number of
5114 : : scalar iterations. */
5115 : 0 : min_profitable_iters = (min_vec_niters * assumed_vf
5116 : 0 : + peel_iters_prologue
5117 : : + peel_iters_epilogue);
5118 : : }
5119 : : else
5120 : : {
5121 : 53511 : min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
5122 : 53511 : * assumed_vf
5123 : 53511 : - vec_inside_cost * peel_iters_prologue
5124 : 53511 : - vec_inside_cost * peel_iters_epilogue);
5125 : 53511 : if (min_profitable_iters <= 0)
5126 : : min_profitable_iters = 0;
5127 : : else
5128 : : {
5129 : 44725 : min_profitable_iters /= saving_per_viter;
5130 : :
5131 : 44725 : if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
5132 : 44725 : <= (((int) vec_inside_cost * min_profitable_iters)
5133 : 44725 : + (((int) vec_outside_cost - scalar_outside_cost)
5134 : : * assumed_vf)))
5135 : 44725 : min_profitable_iters++;
5136 : : }
5137 : : }
5138 : :
5139 : 53520 : if (dump_enabled_p ())
5140 : 586 : dump_printf (MSG_NOTE,
5141 : : " Calculated minimum iters for profitability: %d\n",
5142 : : min_profitable_iters);
5143 : :
5144 : 53520 : if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
5145 : 53511 : && min_profitable_iters < (assumed_vf + peel_iters_prologue))
5146 : : /* We want the vectorized loop to execute at least once. */
5147 : : min_profitable_iters = assumed_vf + peel_iters_prologue;
5148 : 9711 : else if (min_profitable_iters < peel_iters_prologue)
5149 : : /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
5150 : : vectorized loop executes at least once. */
5151 : : min_profitable_iters = peel_iters_prologue;
5152 : :
5153 : 53520 : if (dump_enabled_p ())
5154 : 586 : dump_printf_loc (MSG_NOTE, vect_location,
5155 : : " Runtime profitability threshold = %d\n",
5156 : : min_profitable_iters);
5157 : :
5158 : 53520 : *ret_min_profitable_niters = min_profitable_iters;
5159 : :
5160 : : /* Calculate number of iterations required to make the vector version
5161 : : profitable, relative to the loop bodies only.
5162 : :
5163 : : Non-vectorized variant is SIC * niters and it must win over vector
5164 : : variant on the expected loop trip count. The following condition must hold true:
5165 : : SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
5166 : :
5167 : 53520 : if (vec_outside_cost <= 0)
5168 : : min_profitable_estimate = 0;
5169 : : /* ??? This "else if" arm is written to handle all cases; see below for
5170 : : what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
5171 : 48292 : else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5172 : : {
5173 : : /* This is a repeat of the code above, but with + SOC rather
5174 : : than - SOC. */
5175 : 9 : int outside_overhead = (vec_outside_cost
5176 : 9 : - scalar_single_iter_cost * peel_iters_prologue
5177 : 9 : - scalar_single_iter_cost * peel_iters_epilogue
5178 : : + scalar_outside_cost);
5179 : 9 : int min_vec_niters = 1;
5180 : 9 : if (outside_overhead > 0)
5181 : 9 : min_vec_niters = outside_overhead / saving_per_viter + 1;
5182 : :
5183 : 9 : if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5184 : : {
5185 : 9 : int threshold = (vec_inside_cost * min_vec_niters
5186 : 9 : + vec_outside_cost
5187 : 9 : + scalar_outside_cost);
5188 : 9 : min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
5189 : : }
5190 : : else
5191 : : min_profitable_estimate = (min_vec_niters * assumed_vf
5192 : : + peel_iters_prologue
5193 : : + peel_iters_epilogue);
5194 : : }
5195 : : else
5196 : : {
5197 : 48283 : min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
5198 : 48283 : * assumed_vf
5199 : 48283 : - vec_inside_cost * peel_iters_prologue
5200 : 48283 : - vec_inside_cost * peel_iters_epilogue)
5201 : 48283 : / ((scalar_single_iter_cost * assumed_vf)
5202 : : - vec_inside_cost);
5203 : : }
5204 : 53520 : min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
5205 : 53520 : if (dump_enabled_p ())
5206 : 586 : dump_printf_loc (MSG_NOTE, vect_location,
5207 : : " Static estimate profitability threshold = %d\n",
5208 : : min_profitable_estimate);
5209 : :
5210 : 53520 : *ret_min_profitable_estimate = min_profitable_estimate;
5211 : : }
5212 : :
5213 : : /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
5214 : : vector elements (not bits) for a vector with NELT elements. */
5215 : : static void
5216 : 2024 : calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
5217 : : vec_perm_builder *sel)
5218 : : {
5219 : : /* The encoding is a single stepped pattern. Any wrap-around is handled
5220 : : by vec_perm_indices. */
5221 : 2024 : sel->new_vector (nelt, 1, 3);
5222 : 8096 : for (unsigned int i = 0; i < 3; i++)
5223 : 6072 : sel->quick_push (i + offset);
5224 : 2024 : }
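 : :
 : : /* Illustrative example: OFFSET = 2 and NELT = 8 encode the stepped
 : :    selector {2, 3, 4, ...}, which vec_perm_indices extends to
 : :    {2, 3, 4, 5, 6, 7, 8, 9}; indices 8 and 9 select from the second
 : :    permute operand, so with a zero second operand the permute behaves
 : :    as a whole-vector shift down by two elements.  */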
5225 : :
5226 : : /* Checks whether the target supports whole-vector shifts for vectors of mode
5227 : : MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
5228 : : it supports vec_perm_const with masks for all necessary shift amounts. */
5229 : : static bool
5230 : 6880 : have_whole_vector_shift (machine_mode mode)
5231 : : {
5232 : 6880 : if (can_implement_p (vec_shr_optab, mode))
5233 : : return true;
5234 : :
5235 : : /* Variable-length vectors should be handled via the optab. */
5236 : 56 : unsigned int nelt;
5237 : 112 : if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5238 : : return false;
5239 : :
5240 : 56 : vec_perm_builder sel;
5241 : 56 : vec_perm_indices indices;
5242 : 289 : for (unsigned int i = nelt / 2; i >= 1; i /= 2)
5243 : : {
5244 : 233 : calc_vec_perm_mask_for_shift (i, nelt, &sel);
5245 : 233 : indices.new_vector (sel, 2, nelt);
5246 : 233 : if (!can_vec_perm_const_p (mode, mode, indices, false))
5247 : : return false;
5248 : : }
5249 : : return true;
5250 : 56 : }
5251 : :
5252 : : /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
5253 : : multiplication operands have differing signs and (b) we intend
5254 : : to emulate the operation using a series of signed DOT_PROD_EXPRs.
5255 : : See vect_emulate_mixed_dot_prod for the actual sequence used. */
5256 : :
5257 : : static bool
5258 : 1875 : vect_is_emulated_mixed_dot_prod (stmt_vec_info stmt_info)
5259 : : {
5260 : 1875 : gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
5261 : 1607 : if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
5262 : : return false;
5263 : :
5264 : 524 : tree rhs1 = gimple_assign_rhs1 (assign);
5265 : 524 : tree rhs2 = gimple_assign_rhs2 (assign);
5266 : 524 : if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
5267 : : return false;
5268 : :
5269 : 120 : gcc_assert (STMT_VINFO_REDUC_VECTYPE_IN (stmt_info));
5270 : 120 : return !directly_supported_p (DOT_PROD_EXPR,
5271 : : STMT_VINFO_VECTYPE (stmt_info),
5272 : : STMT_VINFO_REDUC_VECTYPE_IN (stmt_info),
5273 : 120 : optab_vector_mixed_sign);
5274 : : }
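 : :
 : : /* Illustrative case (hypothetical types): a DOT_PROD_EXPR whose
 : :    multiplication operands are signed char and unsigned char has mixed
 : :    signs; if the target provides no mixed-sign dot-product support the
 : :    function returns true and the operation is later emulated with
 : :    signed DOT_PROD_EXPRs (see vect_emulate_mixed_dot_prod).  */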
5275 : :
5276 : : /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
5277 : : functions. Design better to avoid maintenance issues. */
5278 : :
5279 : : /* Function vect_model_reduction_cost.
5280 : :
5281 : : Models cost for a reduction operation, including the vector ops
5282 : : generated within the strip-mine loop in some cases, the initial
5283 : : definition before the loop, and the epilogue code that must be generated. */
5284 : :
5285 : : static void
5286 : 51006 : vect_model_reduction_cost (loop_vec_info loop_vinfo,
5287 : : stmt_vec_info stmt_info, internal_fn reduc_fn,
5288 : : vect_reduction_type reduction_type,
5289 : : int ncopies, stmt_vector_for_cost *cost_vec)
5290 : : {
5291 : 51006 : int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
5292 : 51006 : tree vectype;
5293 : 51006 : machine_mode mode;
5294 : 51006 : class loop *loop = NULL;
5295 : :
5296 : 51006 : if (loop_vinfo)
5297 : 51006 : loop = LOOP_VINFO_LOOP (loop_vinfo);
5298 : :
5299 : : /* Condition reductions generate two reductions in the loop. */
5300 : 51006 : if (reduction_type == COND_REDUCTION)
5301 : 224 : ncopies *= 2;
5302 : :
5303 : 51006 : vectype = STMT_VINFO_VECTYPE (stmt_info);
5304 : 51006 : mode = TYPE_MODE (vectype);
5305 : 51006 : stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5306 : :
5307 : 51006 : gimple_match_op op;
5308 : 51006 : if (!gimple_extract_op (orig_stmt_info->stmt, &op))
5309 : 0 : gcc_unreachable ();
5310 : :
5311 : 51006 : if (reduction_type == EXTRACT_LAST_REDUCTION)
5312 : : /* No extra instructions are needed in the prologue. The loop body
5313 : : operations are costed in vectorizable_condition. */
5314 : : inside_cost = 0;
5315 : 51006 : else if (reduction_type == FOLD_LEFT_REDUCTION)
5316 : : {
5317 : : /* No extra instructions needed in the prologue. */
5318 : 4462 : prologue_cost = 0;
5319 : :
5320 : 4462 : if (reduc_fn != IFN_LAST)
5321 : : /* Count one reduction-like operation per vector. */
5322 : 0 : inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
5323 : : stmt_info, 0, vect_body);
5324 : : else
5325 : : {
5326 : : /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
5327 : 4462 : unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
5328 : 4462 : inside_cost = record_stmt_cost (cost_vec, nelements,
5329 : : vec_to_scalar, stmt_info, 0,
5330 : : vect_body);
5331 : 4462 : inside_cost += record_stmt_cost (cost_vec, nelements,
5332 : : scalar_stmt, stmt_info, 0,
5333 : : vect_body);
5334 : : }
5335 : : }
5336 : : else
5337 : : {
5338 : : /* Add in the cost of the initial definitions. */
5339 : 46544 : int prologue_stmts;
5340 : 46544 : if (reduction_type == COND_REDUCTION)
5341 : : /* For cond reductions we have four vectors: initial index, step,
5342 : : initial result of the data reduction, initial value of the index
5343 : : reduction. */
5344 : : prologue_stmts = 4;
5345 : : else
5346 : : /* We need the initial reduction value. */
5347 : 46320 : prologue_stmts = 1;
5348 : 46544 : prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
5349 : : scalar_to_vec, stmt_info, 0,
5350 : : vect_prologue);
5351 : : }
5352 : :
5353 : : /* Determine cost of epilogue code.
5354 : :
5355 : : We have a reduction operator that will reduce the vector in one statement.
5356 : : Also requires scalar extract. */
5357 : :
5358 : 51006 : if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
5359 : : {
5360 : 50857 : if (reduc_fn != IFN_LAST)
5361 : : {
5362 : 37886 : if (reduction_type == COND_REDUCTION)
5363 : : {
5364 : : /* An EQ stmt and a COND_EXPR stmt. */
5365 : 9 : epilogue_cost += record_stmt_cost (cost_vec, 2,
5366 : : vector_stmt, stmt_info, 0,
5367 : : vect_epilogue);
5368 : : /* Reduction of the max index and a reduction of the found
5369 : : values. */
5370 : 9 : epilogue_cost += record_stmt_cost (cost_vec, 2,
5371 : : vec_to_scalar, stmt_info, 0,
5372 : : vect_epilogue);
5373 : : /* A broadcast of the max value. */
5374 : 9 : epilogue_cost += record_stmt_cost (cost_vec, 1,
5375 : : scalar_to_vec, stmt_info, 0,
5376 : : vect_epilogue);
5377 : : }
5378 : : else
5379 : : {
5380 : 37877 : epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
5381 : : stmt_info, 0, vect_epilogue);
5382 : 37877 : epilogue_cost += record_stmt_cost (cost_vec, 1,
5383 : : vec_to_scalar, stmt_info, 0,
5384 : : vect_epilogue);
5385 : : }
5386 : : }
5387 : 12971 : else if (reduction_type == COND_REDUCTION)
5388 : : {
5389 : 215 : unsigned estimated_nunits = vect_nunits_for_cost (vectype);
5390 : : /* Extraction of scalar elements. */
5391 : 430 : epilogue_cost += record_stmt_cost (cost_vec,
5392 : 215 : 2 * estimated_nunits,
5393 : : vec_to_scalar, stmt_info, 0,
5394 : : vect_epilogue);
5395 : : /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
5396 : 215 : epilogue_cost += record_stmt_cost (cost_vec,
5397 : 215 : 2 * estimated_nunits - 3,
5398 : : scalar_stmt, stmt_info, 0,
5399 : : vect_epilogue);
5400 : : }
5401 : 12756 : else if (reduction_type == EXTRACT_LAST_REDUCTION
5402 : 12756 : || reduction_type == FOLD_LEFT_REDUCTION)
5403 : : /* No extra instructions needed in the epilogue. */
5404 : : ;
5405 : : else
5406 : : {
5407 : 8294 : int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5408 : 8294 : tree bitsize = TYPE_SIZE (op.type);
5409 : 8294 : int element_bitsize = tree_to_uhwi (bitsize);
5410 : 8294 : int nelements = vec_size_in_bits / element_bitsize;
5411 : :
5412 : 8294 : if (op.code == COND_EXPR)
5413 : 28 : op.code = MAX_EXPR;
5414 : :
5415 : : /* We have a whole vector shift available. */
5416 : 765 : if (VECTOR_MODE_P (mode)
5417 : 8294 : && directly_supported_p (op.code, vectype)
5418 : 13466 : && have_whole_vector_shift (mode))
5419 : : {
5420 : : /* Final reduction via vector shifts and the reduction operator.
5421 : : Also requires scalar extract. */
5422 : 15516 : epilogue_cost += record_stmt_cost (cost_vec,
5423 : 10344 : exact_log2 (nelements) * 2,
5424 : : vector_stmt, stmt_info, 0,
5425 : : vect_epilogue);
5426 : 5172 : epilogue_cost += record_stmt_cost (cost_vec, 1,
5427 : : vec_to_scalar, stmt_info, 0,
5428 : : vect_epilogue);
5429 : : }
5430 : : else
5431 : : /* Use extracts and reduction op for final reduction. For N
5432 : : elements, we have N extracts and N-1 reduction ops. */
5433 : 3122 : epilogue_cost += record_stmt_cost (cost_vec,
5434 : 3122 : nelements + nelements - 1,
5435 : : vector_stmt, stmt_info, 0,
5436 : : vect_epilogue);
5437 : : }
5438 : : }
5439 : :
5440 : 51006 : if (dump_enabled_p ())
5441 : 2999 : dump_printf (MSG_NOTE,
5442 : : "vect_model_reduction_cost: inside_cost = %d, "
5443 : : "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
5444 : : prologue_cost, epilogue_cost);
5445 : 51006 : }
5446 : :
5447 : : /* SEQ is a sequence of instructions that initialize the reduction
5448 : : described by REDUC_INFO. Emit them in the appropriate place. */
5449 : :
5450 : : static void
5451 : 397 : vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
5452 : : stmt_vec_info reduc_info, gimple *seq)
5453 : : {
5454 : 397 : if (reduc_info->reused_accumulator)
5455 : : {
5456 : : /* When reusing an accumulator from the main loop, we only need
5457 : : initialization instructions if the main loop can be skipped.
5458 : : In that case, emit the initialization instructions at the end
5459 : : of the guard block that does the skip. */
5460 : 21 : edge skip_edge = loop_vinfo->skip_main_loop_edge;
5461 : 21 : gcc_assert (skip_edge);
5462 : 21 : gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
5463 : 21 : gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5464 : : }
5465 : : else
5466 : : {
5467 : : /* The normal case: emit the initialization instructions on the
5468 : : preheader edge. */
5469 : 376 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5470 : 376 : gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
5471 : : }
5472 : 397 : }
5473 : :
5474 : : /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
5475 : : which performs a reduction involving GROUP_SIZE scalar statements.
5476 : : NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
5477 : : is nonnull, introducing extra elements of that value will not change the
5478 : : result. */
5479 : :
5480 : : static void
5481 : 20861 : get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
5482 : : stmt_vec_info reduc_info,
5483 : : vec<tree> *vec_oprnds,
5484 : : unsigned int number_of_vectors,
5485 : : unsigned int group_size, tree neutral_op)
5486 : : {
5487 : 20861 : vec<tree> &initial_values = reduc_info->reduc_initial_values;
5488 : 20861 : unsigned HOST_WIDE_INT nunits;
5489 : 20861 : unsigned j, number_of_places_left_in_vector;
5490 : 20861 : tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
5491 : 20861 : unsigned int i;
5492 : :
5493 : 41722 : gcc_assert (group_size == initial_values.length () || neutral_op);
5494 : :
5495 : : /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5496 : : created vectors. It is greater than 1 if unrolling is performed.
5497 : :
5498 : : For example, we have two scalar operands, s1 and s2 (e.g., group of
5499 : : strided accesses of size two), while NUNITS is four (i.e., four scalars
5500 : : of this type can be packed in a vector). The output vector will contain
5501 : : two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
5502 : : will be 2).
5503 : :
5504 : : If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
5505 : : vectors containing the operands.
5506 : :
5507 : : For example, NUNITS is four as before, and the group size is 8
5508 : : (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
5509 : : {s5, s6, s7, s8}. */
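 : :    For illustration, with NUNITS == 4 and GROUP_SIZE == 2 as above:
 : :    when a neutral value exists (e.g. 0 for PLUS) only the first
 : :    occurrence of each scalar keeps its initial value and the rest is
 : :    padded, giving {s1, s2, 0, 0}; without a neutral value (e.g. MIN
 : :    or MAX) every copy must carry the initial value, giving
 : :    {s1, s2, s1, s2}.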
5510 : :
5511 : 20861 : if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5512 : : nunits = group_size;
5513 : :
5514 : 20861 : number_of_places_left_in_vector = nunits;
5515 : 20861 : bool constant_p = true;
5516 : 20861 : tree_vector_builder elts (vector_type, nunits, 1);
5517 : 20861 : elts.quick_grow (nunits);
5518 : 20861 : gimple_seq ctor_seq = NULL;
5519 : 20861 : if (neutral_op
5520 : 41635 : && !useless_type_conversion_p (TREE_TYPE (vector_type),
5521 : 20774 : TREE_TYPE (neutral_op)))
5522 : 1 : neutral_op = gimple_convert (&ctor_seq,
5523 : 1 : TREE_TYPE (vector_type),
5524 : : neutral_op);
5525 : 196169 : for (j = 0; j < nunits * number_of_vectors; ++j)
5526 : : {
5527 : 175308 : tree op;
5528 : 175308 : i = j % group_size;
5529 : :
5530 : :       /* Get the def before the loop.  In a reduction chain we have only one
5531 : :          initial value; otherwise we have as many as there are PHIs in the group. */
5532 : 175308 : if (i >= initial_values.length () || (j > i && neutral_op))
5533 : : op = neutral_op;
5534 : : else
5535 : : {
5536 : 43046 : if (!useless_type_conversion_p (TREE_TYPE (vector_type),
5537 : 21523 : TREE_TYPE (initial_values[i])))
5538 : 6 : initial_values[i] = gimple_convert (&ctor_seq,
5539 : 3 : TREE_TYPE (vector_type),
5540 : 3 : initial_values[i]);
5541 : 21523 : op = initial_values[i];
5542 : : }
5543 : :
5544 : : /* Create 'vect_ = {op0,op1,...,opn}'. */
5545 : 175308 : number_of_places_left_in_vector--;
5546 : 175308 : elts[nunits - number_of_places_left_in_vector - 1] = op;
5547 : 175308 : if (!CONSTANT_CLASS_P (op))
5548 : 2199 : constant_p = false;
5549 : :
5550 : 175308 : if (number_of_places_left_in_vector == 0)
5551 : : {
5552 : 21413 : tree init;
5553 : 42826 : if (constant_p && !neutral_op
5554 : 42775 : ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5555 : 21413 : : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5556 : : /* Build the vector directly from ELTS. */
5557 : 21413 : init = gimple_build_vector (&ctor_seq, &elts);
5558 : 0 : else if (neutral_op)
5559 : : {
5560 : : /* Build a vector of the neutral value and shift the
5561 : : other elements into place. */
5562 : 0 : init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5563 : : neutral_op);
5564 : 0 : int k = nunits;
5565 : 0 : while (k > 0 && operand_equal_p (elts[k - 1], neutral_op))
5566 : : k -= 1;
5567 : 0 : while (k > 0)
5568 : : {
5569 : 0 : k -= 1;
5570 : 0 : init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5571 : 0 : vector_type, init, elts[k]);
5572 : : }
5573 : : }
5574 : : else
5575 : : {
5576 : : /* First time round, duplicate ELTS to fill the
5577 : : required number of vectors. */
5578 : 0 : duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5579 : : elts, number_of_vectors, *vec_oprnds);
5580 : 0 : break;
5581 : : }
5582 : 21413 : vec_oprnds->quick_push (init);
5583 : :
5584 : 21413 : number_of_places_left_in_vector = nunits;
5585 : 21413 : elts.new_vector (vector_type, nunits, 1);
5586 : 21413 : elts.quick_grow (nunits);
5587 : 21413 : constant_p = true;
5588 : : }
5589 : : }
5590 : 20861 : if (ctor_seq != NULL)
5591 : 397 : vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
5592 : 20861 : }
5593 : :
5594 : : /* For a statement STMT_INFO taking part in a reduction operation,
5595 : :    return the stmt_vec_info that the meta information is stored on. */
5596 : :
5597 : : stmt_vec_info
5598 : 132208 : info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
5599 : : {
5600 : 132208 : stmt_info = vect_orig_stmt (stmt_info);
5601 : 132208 : gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
5602 : 132208 : if (!is_a <gphi *> (stmt_info->stmt)
5603 : 132208 : || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
5604 : 53802 : stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5605 : 132208 : gphi *phi = as_a <gphi *> (stmt_info->stmt);
5606 : 132208 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5607 : : {
5608 : 580 : if (gimple_phi_num_args (phi) == 1)
5609 : 241 : stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5610 : : }
5611 : 131628 : else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5612 : : {
5613 : 2586 : stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
5614 : 2586 : if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
5615 : 132208 : stmt_info = info;
5616 : : }
5617 : 132208 : return stmt_info;
5618 : : }
5619 : :
5620 : : /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5621 : : REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
5622 : : return false. */
5623 : :
5624 : : static bool
5625 : 20869 : vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5626 : : stmt_vec_info reduc_info)
5627 : : {
5628 : 20869 : loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5629 : 20869 : if (!main_loop_vinfo)
5630 : : return false;
5631 : :
5632 : 4720 : if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5633 : : return false;
5634 : :
5635 : 4702 : unsigned int num_phis = reduc_info->reduc_initial_values.length ();
5636 : 4702 : auto_vec<tree, 16> main_loop_results (num_phis);
5637 : 4702 : auto_vec<tree, 16> initial_values (num_phis);
5638 : 4702 : if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5639 : : {
5640 : : /* The epilogue loop can be entered either from the main loop or
5641 : : from an earlier guard block. */
5642 : 4521 : edge skip_edge = loop_vinfo->skip_main_loop_edge;
5643 : 18102 : for (tree incoming_value : reduc_info->reduc_initial_values)
5644 : : {
5645 : : /* Look for:
5646 : :
5647 : : INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5648 : : INITIAL_VALUE(guard block)>. */
5649 : 4539 : gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5650 : :
5651 : 4539 : gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5652 : 4539 : gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5653 : :
5654 : 4539 : tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5655 : 4539 : tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5656 : :
5657 : 4539 : main_loop_results.quick_push (from_main_loop);
5658 : 4539 : initial_values.quick_push (from_skip);
5659 : : }
5660 : : }
5661 : : else
5662 : : /* The main loop dominates the epilogue loop. */
5663 : 181 : main_loop_results.splice (reduc_info->reduc_initial_values);
5664 : :
5665 : : /* See if the main loop has the kind of accumulator we need. */
5666 : 4702 : vect_reusable_accumulator *accumulator
5667 : 4702 : = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5668 : 4702 : if (!accumulator
5669 : 9390 : || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
5670 : 14092 : || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5671 : : accumulator->reduc_info->reduc_scalar_results.begin ()))
5672 : : return false;
5673 : :
5674 : : /* Handle the case where we can reduce wider vectors to narrower ones. */
5675 : 4689 : tree vectype = STMT_VINFO_VECTYPE (reduc_info);
5676 : 4689 : tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5677 : 4689 : unsigned HOST_WIDE_INT m;
5678 : 4689 : if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5679 : 4689 : TYPE_VECTOR_SUBPARTS (vectype), &m))
5680 : 0 : return false;
5681 : : /* Check the intermediate vector types and operations are available. */
5682 : 4689 : tree prev_vectype = old_vectype;
5683 : 4689 : poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5684 : 13547 : while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5685 : : {
5686 : 4689 : intermediate_nunits = exact_div (intermediate_nunits, 2);
5687 : 4689 : tree intermediate_vectype = get_related_vectype_for_scalar_type
5688 : 4689 : (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5689 : 4689 : if (!intermediate_vectype
5690 : 4689 : || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
5691 : : intermediate_vectype)
5692 : 8858 : || !can_vec_extract (TYPE_MODE (prev_vectype),
5693 : 4169 : TYPE_MODE (intermediate_vectype)))
5694 : : return false;
5695 : : prev_vectype = intermediate_vectype;
5696 : : }
5697 : :
5698 : : /* Non-SLP reductions might apply an adjustment after the reduction
5699 : : operation, in order to simplify the initialization of the accumulator.
5700 : : If the epilogue loop carries on from where the main loop left off,
5701 : : it should apply the same adjustment to the final reduction result.
5702 : :
5703 : : If the epilogue loop can also be entered directly (rather than via
5704 : : the main loop), we need to be able to handle that case in the same way,
5705 : : with the same adjustment. (In principle we could add a PHI node
5706 : : to select the correct adjustment, but in practice that shouldn't be
5707 : : necessary.) */
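 : :   /* For illustration, take "sum = 10; for (...) sum += a[i];": the main
 : :      loop starts its vector accumulator at {0, ..., 0} and records 10 as
 : :      the epilogue adjustment.  The guard value entering the epilogue is
 : :      then also 10, so below we check that it matches the adjustment and
 : :      replace it with the neutral value 0, ensuring the adjustment is
 : :      applied exactly once.  */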
5708 : 4169 : tree main_adjustment
5709 : 4169 : = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5710 : 4169 : if (loop_vinfo->main_loop_edge && main_adjustment)
5711 : : {
5712 : 3603 : gcc_assert (num_phis == 1);
5713 : 3603 : tree initial_value = initial_values[0];
5714 : : /* Check that we can use INITIAL_VALUE as the adjustment and
5715 : : initialize the accumulator with a neutral value instead. */
5716 : 3603 : if (!operand_equal_p (initial_value, main_adjustment))
5717 : 106 : return false;
5718 : 3497 : code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5719 : 3497 : initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5720 : : code, initial_value);
5721 : : }
5722 : 4063 : STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5723 : 4063 : reduc_info->reduc_initial_values.truncate (0);
5724 : 4063 : reduc_info->reduc_initial_values.splice (initial_values);
5725 : 4063 : reduc_info->reused_accumulator = accumulator;
5726 : 4063 : return true;
5727 : 4702 : }
5728 : :
5729 : : /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5730 : :    CODE, appending the generated stmts to SEQ.  Returns a vector def of VECTYPE. */
5731 : :
5732 : : static tree
5733 : 5769 : vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5734 : : gimple_seq *seq)
5735 : : {
5736 : 5769 : unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5737 : 5769 : unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5738 : 5769 : tree stype = TREE_TYPE (vectype);
5739 : 5769 : tree new_temp = vec_def;
5740 : 9867 : while (nunits > nunits1)
5741 : : {
5742 : 4098 : nunits /= 2;
5743 : 4098 : tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5744 : 4098 : stype, nunits);
5745 : 4098 : unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5746 : :
5747 : :       /* The target has to make sure we support lowpart/highpart
5748 : :          extraction, either via direct vector extract or through
5749 : :          integer mode punning.  */
5750 : 4098 : tree dst1, dst2;
5751 : 4098 : gimple *epilog_stmt;
5752 : 4098 : if (convert_optab_handler (vec_extract_optab,
5753 : 4098 : TYPE_MODE (TREE_TYPE (new_temp)),
5754 : 4098 : TYPE_MODE (vectype1))
5755 : : != CODE_FOR_nothing)
5756 : : {
5757 : : /* Extract sub-vectors directly once vec_extract becomes
5758 : : a conversion optab. */
5759 : 2665 : dst1 = make_ssa_name (vectype1);
5760 : 2665 : epilog_stmt
5761 : 5330 : = gimple_build_assign (dst1, BIT_FIELD_REF,
5762 : : build3 (BIT_FIELD_REF, vectype1,
5763 : 2665 : new_temp, TYPE_SIZE (vectype1),
5764 : : bitsize_int (0)));
5765 : 2665 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5766 : 2665 : dst2 = make_ssa_name (vectype1);
5767 : 2665 : epilog_stmt
5768 : 2665 : = gimple_build_assign (dst2, BIT_FIELD_REF,
5769 : : build3 (BIT_FIELD_REF, vectype1,
5770 : 2665 : new_temp, TYPE_SIZE (vectype1),
5771 : 2665 : bitsize_int (bitsize)));
5772 : 2665 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5773 : : }
5774 : : else
5775 : : {
5776 : : /* Extract via punning to appropriately sized integer mode
5777 : : vector. */
5778 : 1433 : tree eltype = build_nonstandard_integer_type (bitsize, 1);
5779 : 1433 : tree etype = build_vector_type (eltype, 2);
5780 : 2866 : gcc_assert (convert_optab_handler (vec_extract_optab,
5781 : : TYPE_MODE (etype),
5782 : : TYPE_MODE (eltype))
5783 : : != CODE_FOR_nothing);
5784 : 1433 : tree tem = make_ssa_name (etype);
5785 : 1433 : epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5786 : : build1 (VIEW_CONVERT_EXPR,
5787 : : etype, new_temp));
5788 : 1433 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5789 : 1433 : new_temp = tem;
5790 : 1433 : tem = make_ssa_name (eltype);
5791 : 1433 : epilog_stmt
5792 : 2866 : = gimple_build_assign (tem, BIT_FIELD_REF,
5793 : : build3 (BIT_FIELD_REF, eltype,
5794 : 1433 : new_temp, TYPE_SIZE (eltype),
5795 : : bitsize_int (0)));
5796 : 1433 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5797 : 1433 : dst1 = make_ssa_name (vectype1);
5798 : 1433 : epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5799 : : build1 (VIEW_CONVERT_EXPR,
5800 : : vectype1, tem));
5801 : 1433 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5802 : 1433 : tem = make_ssa_name (eltype);
5803 : 1433 : epilog_stmt
5804 : 1433 : = gimple_build_assign (tem, BIT_FIELD_REF,
5805 : : build3 (BIT_FIELD_REF, eltype,
5806 : 1433 : new_temp, TYPE_SIZE (eltype),
5807 : 1433 : bitsize_int (bitsize)));
5808 : 1433 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5809 : 1433 : dst2 = make_ssa_name (vectype1);
5810 : 1433 : epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5811 : : build1 (VIEW_CONVERT_EXPR,
5812 : : vectype1, tem));
5813 : 1433 : gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5814 : : }
5815 : :
5816 : 4098 : new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5817 : : }
5818 : :
5819 : 5769 : return new_temp;
5820 : : }
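 : :
 : : /* For illustration, reducing a hypothetical V8SI vector V to a V4SI
 : :    VECTYPE with CODE == PLUS_EXPR generates, conceptually,
 : :      lo = V[0..3];      // BIT_FIELD_REF or VIEW_CONVERT_EXPR punning
 : :      hi = V[4..7];
 : :      result = lo + hi;  // element-wise PLUS on V4SI
 : :    halving the width once per iteration of the loop above until it
 : :    matches VECTYPE.  */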
5821 : :
5822 : : /* Function vect_create_epilog_for_reduction
5823 : :
5824 : : Create code at the loop-epilog to finalize the result of a reduction
5825 : : computation.
5826 : :
5827 : : STMT_INFO is the scalar reduction stmt that is being vectorized.
5828 : : SLP_NODE is an SLP node containing a group of reduction statements. The
5829 : : first one in this group is STMT_INFO.
5830 : : SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5831 : : REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5832 : : (counting from 0)
5833 : : LOOP_EXIT is the edge to update in the merge block. In the case of a single
5834 : : exit this edge is always the main loop exit.
5835 : :
5836 : : This function:
5837 : : 1. Completes the reduction def-use cycles.
5838 : : 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5839 : : by calling the function specified by REDUC_FN if available, or by
5840 : : other means (whole-vector shifts or a scalar loop).
5841 : : The function also creates a new phi node at the loop exit to preserve
5842 : : loop-closed form, as illustrated below.
5843 : :
5844 : : The flow at the entry to this function:
5845 : :
5846 : : loop:
5847 : : vec_def = phi <vec_init, null> # REDUCTION_PHI
5848 : : VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5849 : : s_loop = scalar_stmt # (scalar) STMT_INFO
5850 : : loop_exit:
5851 : : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5852 : : use <s_out0>
5853 : : use <s_out0>
5854 : :
5855 : : The above is transformed by this function into:
5856 : :
5857 : : loop:
5858 : : vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5859 : : VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5860 : : s_loop = scalar_stmt # (scalar) STMT_INFO
5861 : : loop_exit:
5862 : : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5863 : : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5864 : : v_out2 = reduce <v_out1>
5865 : : s_out3 = extract_field <v_out2, 0>
5866 : : s_out4 = adjust_result <s_out3>
5867 : : use <s_out4>
5868 : : use <s_out4>
5869 : : */
5870 : :
5871 : : static void
5872 : 21180 : vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5873 : : stmt_vec_info stmt_info,
5874 : : slp_tree slp_node,
5875 : : slp_instance slp_node_instance,
5876 : : edge loop_exit)
5877 : : {
5878 : 21180 : stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5879 : 21180 : gcc_assert (reduc_info->is_reduc_info);
5880 : : /* For double reductions we need to get at the inner loop reduction
5881 : : stmt which has the meta info attached. Our stmt_info is that of the
5882 : : loop-closed PHI of the inner loop which we remember as
5883 : : def for the reduction PHI generation. */
5884 : 21180 : bool double_reduc = false;
5885 : 21180 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5886 : : {
5887 : 66 : double_reduc = true;
5888 : 66 : stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5889 : 66 : (stmt_info->stmt, 0));
5890 : 66 : stmt_info = vect_stmt_to_vectorize (stmt_info);
5891 : : }
5892 : 21180 : code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5893 : 21180 : internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5894 : 21180 : tree vectype;
5895 : 21180 : machine_mode mode;
5896 : 21180 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5897 : 21180 : basic_block exit_bb;
5898 : 21180 : tree scalar_dest;
5899 : 21180 : tree scalar_type;
5900 : 21180 : gimple *new_phi = NULL, *phi = NULL;
5901 : 21180 : gimple_stmt_iterator exit_gsi;
5902 : 21180 : tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5903 : 21180 : gimple *epilog_stmt = NULL;
5904 : 21180 : gimple *exit_phi;
5905 : 21180 : tree bitsize;
5906 : 21180 : tree def;
5907 : 21180 : tree orig_name, scalar_result;
5908 : 21180 : imm_use_iterator imm_iter, phi_imm_iter;
5909 : 21180 : use_operand_p use_p, phi_use_p;
5910 : 21180 : gimple *use_stmt;
5911 : 21180 : auto_vec<tree> reduc_inputs;
5912 : 21180 : int j, i;
5913 : 21180 : vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5914 : 21180 : unsigned int k;
5915 : : /* SLP reduction without reduction chain, e.g.,
5916 : : # a1 = phi <a2, a0>
5917 : : # b1 = phi <b2, b0>
5918 : : a2 = operation (a1)
5919 : : b2 = operation (b1) */
5920 : 21180 : bool slp_reduc
5921 : 21180 : = !REDUC_GROUP_FIRST_ELEMENT (STMT_VINFO_REDUC_DEF (reduc_info));
5922 : 21180 : bool direct_slp_reduc;
5923 : 21180 : tree induction_index = NULL_TREE;
5924 : :
5925 : 21180 : unsigned int group_size = SLP_TREE_LANES (slp_node);
5926 : :
5927 : 21180 : if (nested_in_vect_loop_p (loop, stmt_info))
5928 : : {
5929 : 66 : outer_loop = loop;
5930 : 66 : loop = loop->inner;
5931 : 66 : gcc_assert (double_reduc);
5932 : : }
5933 : :
5934 : 21180 : vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5935 : 21180 : gcc_assert (vectype);
5936 : 21180 : mode = TYPE_MODE (vectype);
5937 : :
5938 : 21180 : tree induc_val = NULL_TREE;
5939 : 21180 : tree adjustment_def = NULL;
5940 : : /* Optimize: for induction condition reduction, if we can't use zero
5941 : : for induc_val, use initial_def. */
5942 : 21180 : if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5943 : 66 : induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5944 : 21114 : else if (double_reduc)
5945 : : ;
5946 : : else
5947 : 21048 : adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5948 : :
5949 : 21180 : stmt_vec_info single_live_out_stmt[] = { stmt_info };
5950 : 21180 : array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5951 : 21180 : if (slp_reduc)
5952 : : /* All statements produce live-out values. */
5953 : 42026 : live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5954 : :
5955 : 21180 : unsigned vec_num
5956 : 21180 : = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length ();
5957 : :
5958 : : /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5959 : : which is updated with the current index of the loop for every match of
5960 : : the original loop's cond_expr (VEC_STMT). This results in a vector
5961 : : containing the last time the condition passed for that vector lane.
5962 : : The first match will be a 1 to allow 0 to be used for non-matching
5963 : : indexes. If there are no matches at all then the vector will be all
5964 : : zeroes.
5965 : :
5966 : : PR92772: This algorithm is broken for architectures that support
5967 : : masked vectors, but do not provide fold_extract_last. */
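 : :   /* Conceptually, per vector lane the index tracking below amounts to
 : :        last[lane] = 0;
 : :        for (iv = lane + 1; more iterations; iv += nunits)
 : :          if (condition matched in this lane)
 : :            last[lane] = iv;
 : :      so each lane remembers the 1-based position of its last match, or 0
 : :      if it never matched.  */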
5968 : 21180 : if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
5969 : : {
5970 : 73 : auto_vec<std::pair<tree, bool>, 2> ccompares;
5971 : 73 : slp_tree cond_node = slp_node_instance->root;
5972 : 167 : while (cond_node != slp_node_instance->reduc_phis)
5973 : : {
5974 : 94 : stmt_vec_info cond_info = SLP_TREE_REPRESENTATIVE (cond_node);
5975 : 94 : int slp_reduc_idx;
5976 : 94 : if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5977 : : {
5978 : 82 : gimple *vec_stmt
5979 : 82 : = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (cond_node)[0]);
5980 : 82 : gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5981 : 82 : ccompares.safe_push
5982 : 82 : (std::make_pair (gimple_assign_rhs1 (vec_stmt),
5983 : 82 : STMT_VINFO_REDUC_IDX (cond_info) == 2));
5984 : : /* ??? We probably want to have REDUC_IDX on the SLP node?
5985 : :              COND_EXPR nodes have either three or four children,
5986 : :              depending on whether the comparison is still embedded
5987 : :              as GENERIC.  So work backwards.  */
5988 : 82 : slp_reduc_idx = (SLP_TREE_CHILDREN (cond_node).length () - 3
5989 : 82 : + STMT_VINFO_REDUC_IDX (cond_info));
5990 : : }
5991 : : else
5992 : 12 : slp_reduc_idx = STMT_VINFO_REDUC_IDX (cond_info);
5993 : 94 : cond_node = SLP_TREE_CHILDREN (cond_node)[slp_reduc_idx];
5994 : : }
5995 : 73 : gcc_assert (ccompares.length () != 0);
5996 : :
5997 : 73 : tree indx_before_incr, indx_after_incr;
5998 : 73 : poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5999 : 73 : int scalar_precision
6000 : 73 : = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
6001 : 73 : tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
6002 : 73 : tree cr_index_vector_type = get_related_vectype_for_scalar_type
6003 : 73 : (TYPE_MODE (vectype), cr_index_scalar_type,
6004 : : TYPE_VECTOR_SUBPARTS (vectype));
6005 : :
6006 : : /* First we create a simple vector induction variable which starts
6007 : : with the values {1,2,3,...} (SERIES_VECT) and increments by the
6008 : : vector size (STEP). */
6009 : :
6010 : : /* Create a {1,2,3,...} vector. */
6011 : 73 : tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
6012 : :
6013 : : /* Create a vector of the step value. */
6014 : 73 : tree step = build_int_cst (cr_index_scalar_type, nunits_out);
6015 : 73 : tree vec_step = build_vector_from_val (cr_index_vector_type, step);
6016 : :
6017 : : /* Create an induction variable. */
6018 : 73 : gimple_stmt_iterator incr_gsi;
6019 : 73 : bool insert_after;
6020 : 73 : vect_iv_increment_position (LOOP_VINFO_IV_EXIT (loop_vinfo),
6021 : : &incr_gsi, &insert_after);
6022 : 73 : create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
6023 : : insert_after, &indx_before_incr, &indx_after_incr);
6024 : :
6025 : : /* Next create a new phi node vector (NEW_PHI_TREE) which starts
6026 : : filled with zeros (VEC_ZERO). */
6027 : :
6028 : : /* Create a vector of 0s. */
6029 : 73 : tree zero = build_zero_cst (cr_index_scalar_type);
6030 : 73 : tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
6031 : :
6032 : : /* Create a vector phi node. */
6033 : 73 : tree new_phi_tree = make_ssa_name (cr_index_vector_type);
6034 : 73 : new_phi = create_phi_node (new_phi_tree, loop->header);
6035 : 73 : add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
6036 : : loop_preheader_edge (loop), UNKNOWN_LOCATION);
6037 : :
6038 : :       /* Now take the condition from the loop's original cond_exprs
6039 : :          and produce a new cond_expr (INDEX_COND_EXPR) which for
6040 : :          every match uses values from the induction variable
6041 : :          (INDEX_BEFORE_INCR) and otherwise uses values from the phi node
6042 : :          (NEW_PHI_TREE).
6043 : : Finally, we update the phi (NEW_PHI_TREE) to take the value of
6044 : : the new cond_expr (INDEX_COND_EXPR). */
6045 : 73 : gimple_seq stmts = NULL;
6046 : 228 : for (int i = ccompares.length () - 1; i != -1; --i)
6047 : : {
6048 : 82 : tree ccompare = ccompares[i].first;
6049 : 82 : if (ccompares[i].second)
6050 : 69 : new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6051 : : cr_index_vector_type,
6052 : : ccompare,
6053 : : indx_before_incr, new_phi_tree);
6054 : : else
6055 : 13 : new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6056 : : cr_index_vector_type,
6057 : : ccompare,
6058 : : new_phi_tree, indx_before_incr);
6059 : : }
6060 : 73 : gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
6061 : :
6062 : : /* Update the phi with the vec cond. */
6063 : 73 : induction_index = new_phi_tree;
6064 : 73 : add_phi_arg (as_a <gphi *> (new_phi), induction_index,
6065 : : loop_latch_edge (loop), UNKNOWN_LOCATION);
6066 : 73 : }
6067 : :
6068 : : /* 2. Create epilog code.
6069 : : The reduction epilog code operates across the elements of the vector
6070 : : of partial results computed by the vectorized loop.
6071 : : The reduction epilog code consists of:
6072 : :
6073 : : step 1: compute the scalar result in a vector (v_out2)
6074 : : step 2: extract the scalar result (s_out3) from the vector (v_out2)
6075 : : step 3: adjust the scalar result (s_out3) if needed.
6076 : :
6077 : :         Step 1 can be accomplished using one of the following three schemes:
6078 : : (scheme 1) using reduc_fn, if available.
6079 : : (scheme 2) using whole-vector shifts, if available.
6080 : : (scheme 3) using a scalar loop. In this case steps 1+2 above are
6081 : : combined.
6082 : :
6083 : : The overall epilog code looks like this:
6084 : :
6085 : : s_out0 = phi <s_loop> # original EXIT_PHI
6086 : : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6087 : : v_out2 = reduce <v_out1> # step 1
6088 : : s_out3 = extract_field <v_out2, 0> # step 2
6089 : : s_out4 = adjust_result <s_out3> # step 3
6090 : :
6091 : : (step 3 is optional, and steps 1 and 2 may be combined).
6092 : : Lastly, the uses of s_out0 are replaced by s_out4. */
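 : :
 : :      /* As an illustration, for a plain integer sum with a 4-element vector
 : :         of partial results v_out1, step 1 is one of
 : :           (scheme 1)  s = .REDUC_PLUS (v_out1);
 : :           (scheme 2)  v = v_out1 + vec_shift <v_out1, 2>;
 : :                       v = v + vec_shift <v, 1>;  s = v[0];
 : :           (scheme 3)  s = v_out1[0] + v_out1[1] + v_out1[2] + v_out1[3];
 : :         followed, when needed, by the scalar adjustment of step 3.  */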
6093 : :
6094 : :
6095 : : /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
6096 : : v_out1 = phi <VECT_DEF>
6097 : : Store them in NEW_PHIS. */
6098 : 21180 : if (double_reduc)
6099 : 66 : loop = outer_loop;
6100 : : /* We need to reduce values in all exits. */
6101 : 21180 : exit_bb = loop_exit->dest;
6102 : 21180 : exit_gsi = gsi_after_labels (exit_bb);
6103 : 21180 : reduc_inputs.create (vec_num);
6104 : 42916 : for (unsigned i = 0; i < vec_num; i++)
6105 : : {
6106 : 21736 : gimple_seq stmts = NULL;
6107 : 21736 : def = vect_get_slp_vect_def (slp_node, i);
6108 : 21736 : tree new_def = copy_ssa_name (def);
6109 : 21736 : phi = create_phi_node (new_def, exit_bb);
6110 : 21736 : if (LOOP_VINFO_IV_EXIT (loop_vinfo) == loop_exit)
6111 : 21709 : SET_PHI_ARG_DEF (phi, loop_exit->dest_idx, def);
6112 : : else
6113 : : {
6114 : 57 : for (unsigned k = 0; k < gimple_phi_num_args (phi); k++)
6115 : 30 : SET_PHI_ARG_DEF (phi, k, def);
6116 : : }
6117 : 21736 : new_def = gimple_convert (&stmts, vectype, new_def);
6118 : 21736 : reduc_inputs.quick_push (new_def);
6119 : 21736 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6120 : : }
6121 : :
6122 : : /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
6123 : : (i.e. when reduc_fn is not available) and in the final adjustment
6124 : : code (if needed). Also get the original scalar reduction variable as
6125 : : defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
6126 : : represents a reduction pattern), the tree-code and scalar-def are
6127 : : taken from the original stmt that the pattern-stmt (STMT) replaces.
6128 : : Otherwise (it is a regular reduction) - the tree-code and scalar-def
6129 : : are taken from STMT. */
6130 : :
6131 : 21180 : stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6132 : 21180 : if (orig_stmt_info != stmt_info)
6133 : : {
6134 : : /* Reduction pattern */
6135 : 609 : gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6136 : 609 : gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
6137 : : }
6138 : :
6139 : 21180 : scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
6140 : 21180 : scalar_type = TREE_TYPE (scalar_dest);
6141 : 21180 : scalar_results.truncate (0);
6142 : 21180 : scalar_results.reserve_exact (group_size);
6143 : 21180 : new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
6144 : 21180 : bitsize = TYPE_SIZE (scalar_type);
6145 : :
6146 : : /* True if we should implement SLP_REDUC using native reduction operations
6147 : : instead of scalar operations. */
6148 : 42360 : direct_slp_reduc = (reduc_fn != IFN_LAST
6149 : 21180 : && slp_reduc
6150 : 21180 : && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
6151 : :
6152 : : /* In case of reduction chain, e.g.,
6153 : : # a1 = phi <a3, a0>
6154 : : a2 = operation (a1)
6155 : : a3 = operation (a2),
6156 : :
6157 : : we may end up with more than one vector result. Here we reduce them
6158 : : to one vector.
6159 : :
6160 : : The same is true for a SLP reduction, e.g.,
6161 : : # a1 = phi <a2, a0>
6162 : : # b1 = phi <b2, b0>
6163 : : a2 = operation (a1)
6164 : :        b2 = operation (b1),
6165 : :
6166 : : where we can end up with more than one vector as well. We can
6167 : : easily accumulate vectors when the number of vector elements is
6168 : : a multiple of the SLP group size.
6169 : :
6170 : :      The same is true if we couldn't use a single def-use cycle.  */
6171 : 21180 : if (REDUC_GROUP_FIRST_ELEMENT (STMT_VINFO_REDUC_DEF (reduc_info))
6172 : : || direct_slp_reduc
6173 : 21180 : || (slp_reduc
6174 : 21013 : && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size)))
6175 : : {
6176 : 21154 : gimple_seq stmts = NULL;
6177 : 21154 : tree single_input = reduc_inputs[0];
6178 : 21665 : for (k = 1; k < reduc_inputs.length (); k++)
6179 : 1022 : single_input = gimple_build (&stmts, code, vectype,
6180 : 511 : single_input, reduc_inputs[k]);
6181 : 21154 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6182 : :
6183 : 21154 : reduc_inputs.truncate (0);
6184 : 21154 : reduc_inputs.safe_push (single_input);
6185 : : }
6186 : :
6187 : 21180 : tree orig_reduc_input = reduc_inputs[0];
6188 : :
6189 : : /* If this loop is an epilogue loop that can be skipped after the
6190 : : main loop, we can only share a reduction operation between the
6191 : : main loop and the epilogue if we put it at the target of the
6192 : : skip edge.
6193 : :
6194 : : We can still reuse accumulators if this check fails. Doing so has
6195 : : the minor(?) benefit of making the epilogue loop's scalar result
6196 : : independent of the main loop's scalar result. */
6197 : 21180 : bool unify_with_main_loop_p = false;
6198 : 21180 : if (reduc_info->reused_accumulator
6199 : 4063 : && loop_vinfo->skip_this_loop_edge
6200 : 3869 : && single_succ_p (exit_bb)
6201 : 21193 : && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
6202 : : {
6203 : 13 : unify_with_main_loop_p = true;
6204 : :
6205 : 13 : basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
6206 : 13 : reduc_inputs[0] = make_ssa_name (vectype);
6207 : 13 : gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
6208 : 13 : add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
6209 : : UNKNOWN_LOCATION);
6210 : 13 : add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
6211 : : loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
6212 : 13 : exit_gsi = gsi_after_labels (reduc_block);
6213 : : }
6214 : :
6215 : : /* Shouldn't be used beyond this point. */
6216 : 21180 : exit_bb = nullptr;
6217 : :
6218 : 21180 : if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6219 : 73 : && reduc_fn != IFN_LAST)
6220 : : {
6221 : : /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
6222 : : various data values where the condition matched and another vector
6223 : : (INDUCTION_INDEX) containing all the indexes of those matches. We
6224 : : need to extract the last matching index (which will be the index with
6225 : : highest value) and use this to index into the data vector.
6226 : : For the case where there were no matches, the data vector will contain
6227 : : all default values and the index vector will be all zeros. */
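 : :
 : :       /* Conceptually the statements below compute
 : :            max_idx = .REDUC_MAX (induction_index);
 : :            picked = (induction_index == max_idx) ? reduc_inputs[0] : 0;
 : :            result = .REDUC_MAX (picked reinterpreted as unsigned);
 : :          selecting the data lane of the last match (at most one non-zero
 : :          lane) and reducing it out with an unsigned MAX; when nothing
 : :          matched all lanes hold the identical default value.  */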
6228 : :
6229 : : /* Get various versions of the type of the vector of indexes. */
6230 : 4 : tree index_vec_type = TREE_TYPE (induction_index);
6231 : 4 : gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
6232 : 4 : tree index_scalar_type = TREE_TYPE (index_vec_type);
6233 : 4 : tree index_vec_cmp_type = truth_type_for (index_vec_type);
6234 : :
6235 : : /* Get an unsigned integer version of the type of the data vector. */
6236 : 4 : int scalar_precision
6237 : 4 : = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6238 : 4 : tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
6239 : 4 : tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
6240 : : vectype);
6241 : :
6242 : : /* First we need to create a vector (ZERO_VEC) of zeros and another
6243 : : vector (MAX_INDEX_VEC) filled with the last matching index, which we
6244 : : can create using a MAX reduction and then expanding.
6245 : : In the case where the loop never made any matches, the max index will
6246 : : be zero. */
6247 : :
6248 : : /* Vector of {0, 0, 0,...}. */
6249 : 4 : tree zero_vec = build_zero_cst (vectype);
6250 : :
6251 : : /* Find maximum value from the vector of found indexes. */
6252 : 4 : tree max_index = make_ssa_name (index_scalar_type);
6253 : 4 : gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6254 : : 1, induction_index);
6255 : 4 : gimple_call_set_lhs (max_index_stmt, max_index);
6256 : 4 : gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
6257 : :
6258 : : /* Vector of {max_index, max_index, max_index,...}. */
6259 : 4 : tree max_index_vec = make_ssa_name (index_vec_type);
6260 : 4 : tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
6261 : : max_index);
6262 : 4 : gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
6263 : : max_index_vec_rhs);
6264 : 4 : gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
6265 : :
6266 : : /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
6267 : : with the vector (INDUCTION_INDEX) of found indexes, choosing values
6268 : : from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
6269 : : otherwise. Only one value should match, resulting in a vector
6270 : : (VEC_COND) with one data value and the rest zeros.
6271 : : In the case where the loop never made any matches, every index will
6272 : : match, resulting in a vector with all data values (which will all be
6273 : : the default value). */
6274 : :
6275 : : /* Compare the max index vector to the vector of found indexes to find
6276 : : the position of the max value. */
6277 : 4 : tree vec_compare = make_ssa_name (index_vec_cmp_type);
6278 : 4 : gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
6279 : : induction_index,
6280 : : max_index_vec);
6281 : 4 : gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
6282 : :
6283 : : /* Use the compare to choose either values from the data vector or
6284 : : zero. */
6285 : 4 : tree vec_cond = make_ssa_name (vectype);
6286 : 4 : gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
6287 : : vec_compare,
6288 : 4 : reduc_inputs[0],
6289 : : zero_vec);
6290 : 4 : gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
6291 : :
6292 : : /* Finally we need to extract the data value from the vector (VEC_COND)
6293 : :          into a scalar (MATCHED_DATA_REDUC).  Logically we want to do an OR
6294 : : reduction, but because this doesn't exist, we can use a MAX reduction
6295 : : instead. The data value might be signed or a float so we need to cast
6296 : : it first.
6297 : : In the case where the loop never made any matches, the data values are
6298 : : all identical, and so will reduce down correctly. */
6299 : :
6300 : : /* Make the matched data values unsigned. */
6301 : 4 : tree vec_cond_cast = make_ssa_name (vectype_unsigned);
6302 : 4 : tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
6303 : : vec_cond);
6304 : 4 : gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
6305 : : VIEW_CONVERT_EXPR,
6306 : : vec_cond_cast_rhs);
6307 : 4 : gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
6308 : :
6309 : : /* Reduce down to a scalar value. */
6310 : 4 : tree data_reduc = make_ssa_name (scalar_type_unsigned);
6311 : 4 : gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6312 : : 1, vec_cond_cast);
6313 : 4 : gimple_call_set_lhs (data_reduc_stmt, data_reduc);
6314 : 4 : gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
6315 : :
6316 : : /* Convert the reduced value back to the result type and set as the
6317 : : result. */
6318 : 4 : gimple_seq stmts = NULL;
6319 : 4 : new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
6320 : : data_reduc);
6321 : 4 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6322 : 4 : scalar_results.safe_push (new_temp);
6323 : 4 : }
6324 : 21176 : else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6325 : 69 : && reduc_fn == IFN_LAST)
6326 : : {
6327 : : /* Condition reduction without supported IFN_REDUC_MAX. Generate
6328 : : idx = 0;
6329 : : idx_val = induction_index[0];
6330 : : val = data_reduc[0];
6331 : : for (idx = 0, val = init, i = 0; i < nelts; ++i)
6332 : : if (induction_index[i] > idx_val)
6333 : : val = data_reduc[i], idx_val = induction_index[i];
6334 : : return val; */
6335 : :
6336 : 69 : tree data_eltype = TREE_TYPE (vectype);
6337 : 69 : tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
6338 : 69 : unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
6339 : 69 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
6340 : : /* Enforced by vectorizable_reduction, which ensures we have target
6341 : : support before allowing a conditional reduction on variable-length
6342 : : vectors. */
6343 : 69 : unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
6344 : 69 : tree idx_val = NULL_TREE, val = NULL_TREE;
6345 : 461 : for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
6346 : : {
6347 : 392 : tree old_idx_val = idx_val;
6348 : 392 : tree old_val = val;
6349 : 392 : idx_val = make_ssa_name (idx_eltype);
6350 : 392 : epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
6351 : : build3 (BIT_FIELD_REF, idx_eltype,
6352 : : induction_index,
6353 : 392 : bitsize_int (el_size),
6354 : 392 : bitsize_int (off)));
6355 : 392 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6356 : 392 : val = make_ssa_name (data_eltype);
6357 : 784 : epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
6358 : : build3 (BIT_FIELD_REF,
6359 : : data_eltype,
6360 : 392 : reduc_inputs[0],
6361 : 392 : bitsize_int (el_size),
6362 : 392 : bitsize_int (off)));
6363 : 392 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6364 : 392 : if (off != 0)
6365 : : {
6366 : 323 : tree new_idx_val = idx_val;
6367 : 323 : if (off != v_size - el_size)
6368 : : {
6369 : 254 : new_idx_val = make_ssa_name (idx_eltype);
6370 : 254 : epilog_stmt = gimple_build_assign (new_idx_val,
6371 : : MAX_EXPR, idx_val,
6372 : : old_idx_val);
6373 : 254 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6374 : : }
6375 : 323 : tree cond = make_ssa_name (boolean_type_node);
6376 : 323 : epilog_stmt = gimple_build_assign (cond, GT_EXPR,
6377 : : idx_val, old_idx_val);
6378 : 323 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6379 : 323 : tree new_val = make_ssa_name (data_eltype);
6380 : 323 : epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
6381 : : cond, val, old_val);
6382 : 323 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6383 : 323 : idx_val = new_idx_val;
6384 : 323 : val = new_val;
6385 : : }
6386 : : }
6387 : : /* Convert the reduced value back to the result type and set as the
6388 : : result. */
6389 : 69 : gimple_seq stmts = NULL;
6390 : 69 : val = gimple_convert (&stmts, scalar_type, val);
6391 : 69 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6392 : 69 : scalar_results.safe_push (val);
6393 : 69 : }
6394 : :
6395 : : /* 2.3 Create the reduction code, using one of the three schemes described
6396 : : above. In SLP we simply need to extract all the elements from the
6397 : : vector (without reducing them), so we use scalar shifts. */
6398 : 21107 : else if (reduc_fn != IFN_LAST && (!slp_reduc || group_size == 1))
6399 : : {
6400 : 19399 : tree tmp;
6401 : 19399 : tree vec_elem_type;
6402 : :
6403 : : /* Case 1: Create:
6404 : : v_out2 = reduc_expr <v_out1> */
6405 : :
6406 : 19399 : if (dump_enabled_p ())
6407 : 1272 : dump_printf_loc (MSG_NOTE, vect_location,
6408 : : "Reduce using direct vector reduction.\n");
6409 : :
6410 : 19399 : gimple_seq stmts = NULL;
6411 : 19399 : vec_elem_type = TREE_TYPE (vectype);
6412 : 19399 : new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
6413 : 19399 : vec_elem_type, reduc_inputs[0]);
6414 : 19399 : new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6415 : 19399 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6416 : :
6417 : 19399 : if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6418 : 66 : && induc_val)
6419 : : {
6420 : :          /* Earlier we set the initial value to be a vector of induc_val
6421 : :             values.  Check the result and if it is induc_val then replace it
6422 : :             with the original initial value, unless induc_val is
6423 : : the same as initial_def already. */
6424 : 63 : tree zcompare = make_ssa_name (boolean_type_node);
6425 : 63 : epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6426 : : new_temp, induc_val);
6427 : 63 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6428 : 63 : tree initial_def = reduc_info->reduc_initial_values[0];
6429 : 63 : tmp = make_ssa_name (new_scalar_dest);
6430 : 63 : epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6431 : : initial_def, new_temp);
6432 : 63 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6433 : 63 : new_temp = tmp;
6434 : : }
6435 : :
6436 : 19399 : scalar_results.safe_push (new_temp);
6437 : 19399 : }
6438 : 1562 : else if (direct_slp_reduc)
6439 : : {
6440 : : /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
6441 : : with the elements for other SLP statements replaced with the
6442 : : neutral value. We can then do a normal reduction on each vector. */
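 : :
 : :       /* For illustration, with GROUP_SIZE == 2, a partial-result vector
 : :          {a0, b0, a1, b1}, CODE == PLUS_EXPR and neutral value 0, the loop
 : :          below builds
 : :            vec_a = {a0, 0, a1, 0}  -> scalar_a = .REDUC_PLUS (vec_a)
 : :            vec_b = {0, b0, 0, b1}  -> scalar_b = .REDUC_PLUS (vec_b)
 : :          by comparing the lane index masked with GROUP_SIZE - 1 against
 : :          each group member.  */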
6443 : :
6444 : : /* Enforced by vectorizable_reduction. */
6445 : : gcc_assert (reduc_inputs.length () == 1);
6446 : : gcc_assert (pow2p_hwi (group_size));
6447 : :
6448 : : gimple_seq seq = NULL;
6449 : :
6450 : : /* Build a vector {0, 1, 2, ...}, with the same number of elements
6451 : : and the same element size as VECTYPE. */
6452 : : tree index = build_index_vector (vectype, 0, 1);
6453 : : tree index_type = TREE_TYPE (index);
6454 : : tree index_elt_type = TREE_TYPE (index_type);
6455 : : tree mask_type = truth_type_for (index_type);
6456 : :
6457 : : /* Create a vector that, for each element, identifies which of
6458 : : the REDUC_GROUP_SIZE results should use it. */
6459 : : tree index_mask = build_int_cst (index_elt_type, group_size - 1);
6460 : : index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
6461 : : build_vector_from_val (index_type, index_mask));
6462 : :
6463 : : /* Get a neutral vector value. This is simply a splat of the neutral
6464 : : scalar value if we have one, otherwise the initial scalar value
6465 : : is itself a neutral value. */
6466 : : tree vector_identity = NULL_TREE;
6467 : : tree neutral_op = NULL_TREE;
6468 : : if (1)
6469 : : {
6470 : : tree initial_value = NULL_TREE;
6471 : : if (REDUC_GROUP_FIRST_ELEMENT (STMT_VINFO_REDUC_DEF (reduc_info)))
6472 : : initial_value = reduc_info->reduc_initial_values[0];
6473 : : neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
6474 : : initial_value, false);
6475 : : }
6476 : : if (neutral_op)
6477 : : vector_identity = gimple_build_vector_from_val (&seq, vectype,
6478 : : neutral_op);
6479 : : for (unsigned int i = 0; i < group_size; ++i)
6480 : : {
6481 : :          /* If there's no universal neutral value, we can use the
6482 : : initial scalar value from the original PHI. This is used
6483 : : for MIN and MAX reduction, for example. */
6484 : : if (!neutral_op)
6485 : : {
6486 : : tree scalar_value = reduc_info->reduc_initial_values[i];
6487 : : scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
6488 : : scalar_value);
6489 : : vector_identity = gimple_build_vector_from_val (&seq, vectype,
6490 : : scalar_value);
6491 : : }
6492 : :
6493 : : /* Calculate the equivalent of:
6494 : :
6495 : : sel[j] = (index[j] == i);
6496 : :
6497 : : which selects the elements of REDUC_INPUTS[0] that should
6498 : : be included in the result. */
6499 : : tree compare_val = build_int_cst (index_elt_type, i);
6500 : : compare_val = build_vector_from_val (index_type, compare_val);
6501 : : tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
6502 : : index, compare_val);
6503 : :
6504 : : /* Calculate the equivalent of:
6505 : :
6506 : :               vec = sel ? reduc_inputs[0] : vector_identity;
6507 : :
6508 : : VEC is now suitable for a full vector reduction. */
6509 : : tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
6510 : : sel, reduc_inputs[0], vector_identity);
6511 : :
6512 : : /* Do the reduction and convert it to the appropriate type. */
6513 : : tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
6514 : : TREE_TYPE (vectype), vec);
6515 : : scalar = gimple_convert (&seq, scalar_type, scalar);
6516 : : scalar_results.safe_push (scalar);
6517 : : }
6518 : : gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
6519 : : }
6520 : : else
6521 : : {
6522 : 1562 : bool reduce_with_shift;
6523 : 1562 : tree vec_temp;
6524 : :
6525 : 1562 : gcc_assert (slp_reduc || reduc_inputs.length () == 1);
6526 : :
6527 : : /* See if the target wants to do the final (shift) reduction
6528 : : in a vector mode of smaller size and first reduce upper/lower
6529 : : halves against each other. */
6530 : 1708 : enum machine_mode mode1 = mode;
6531 : 1708 : tree stype = TREE_TYPE (vectype);
6532 : 1708 : unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6533 : 1708 : unsigned nunits1 = nunits;
6534 : 1708 : if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
6535 : 1708 : && reduc_inputs.length () == 1)
6536 : : {
6537 : 37 : nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6538 : :           /* For SLP reductions we have to make sure lanes match up, but
6539 : :              since we're doing an individual-element final reduction, reducing
6540 : :              the vector width here is even more important.
6541 : :              ??? We can also separate lanes with permutes; for the common
6542 : :              case of a power-of-two group size, odd/even extracts would work. */
6543 : 37 : if (slp_reduc && nunits != nunits1)
6544 : : {
6545 : 37 : nunits1 = least_common_multiple (nunits1, group_size);
6546 : 74 : gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6547 : : }
6548 : : }
6549 : 1708 : if (!slp_reduc
6550 : 1708 : && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6551 : 0 : nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6552 : :
6553 : 1708 : tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6554 : 1708 : stype, nunits1);
6555 : 1708 : reduce_with_shift = have_whole_vector_shift (mode1);
6556 : 700 : if (!VECTOR_MODE_P (mode1)
6557 : 2408 : || !directly_supported_p (code, vectype1))
6558 : : reduce_with_shift = false;
6559 : :
6560 : :       /* First reduce the vector to the desired vector size on which we
6561 : :          should do the shift reduction, by combining upper and lower halves. */
6562 : 1708 : gimple_seq stmts = NULL;
6563 : 1708 : new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6564 : : code, &stmts);
6565 : 1708 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6566 : 1708 : reduc_inputs[0] = new_temp;
6567 : :
6568 : 1708 : if (reduce_with_shift && (!slp_reduc || group_size == 1))
6569 : : {
6570 : 1521 : int element_bitsize = tree_to_uhwi (bitsize);
6571 : : /* Enforced by vectorizable_reduction, which disallows SLP reductions
6572 : : for variable-length vectors and also requires direct target support
6573 : : for loop reductions. */
6574 : 1521 : int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6575 : 1521 : int nelements = vec_size_in_bits / element_bitsize;
6576 : 1521 : vec_perm_builder sel;
6577 : 1521 : vec_perm_indices indices;
6578 : :
6579 : 1521 : int elt_offset;
6580 : :
6581 : 1521 : tree zero_vec = build_zero_cst (vectype1);
6582 : : /* Case 2: Create:
6583 : : for (offset = nelements/2; offset >= 1; offset/=2)
6584 : : {
6585 : : Create: va' = vec_shift <va, offset>
6586 : : Create: va = vop <va, va'>
6587 : : } */
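 : :
 : :           /* For illustration, with nelements == 4 this emits two shift/op
 : :              pairs,
 : :                va = vop <va, vec_shift <va, 2>>
 : :                va = vop <va, vec_shift <va, 1>>
 : :              after which lane 0 of VA holds the reduced value and is
 : :              extracted with a single BIT_FIELD_REF below.  */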
6588 : :
6589 : 1521 : tree rhs;
6590 : :
6591 : 1521 : if (dump_enabled_p ())
6592 : 313 : dump_printf_loc (MSG_NOTE, vect_location,
6593 : : "Reduce using vector shifts\n");
6594 : :
6595 : 1521 : gimple_seq stmts = NULL;
6596 : 1521 : new_temp = gimple_convert (&stmts, vectype1, new_temp);
6597 : 1521 : for (elt_offset = nelements / 2;
6598 : 3312 : elt_offset >= 1;
6599 : 1791 : elt_offset /= 2)
6600 : : {
6601 : 1791 : calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6602 : 1791 : indices.new_vector (sel, 2, nelements);
6603 : 1791 : tree mask = vect_gen_perm_mask_any (vectype1, indices);
6604 : 1791 : new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6605 : : new_temp, zero_vec, mask);
6606 : 1791 : new_temp = gimple_build (&stmts, code,
6607 : : vectype1, new_name, new_temp);
6608 : : }
6609 : 1521 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6610 : :
6611 : : /* 2.4 Extract the final scalar result. Create:
6612 : : s_out3 = extract_field <v_out2, bitpos> */
6613 : :
6614 : 1521 : if (dump_enabled_p ())
6615 : 313 : dump_printf_loc (MSG_NOTE, vect_location,
6616 : : "extract scalar result\n");
6617 : :
6618 : 1521 : rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6619 : : bitsize, bitsize_zero_node);
6620 : 1521 : epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
6621 : 1521 : new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
6622 : 1521 : gimple_assign_set_lhs (epilog_stmt, new_temp);
6623 : 1521 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6624 : 1521 : scalar_results.safe_push (new_temp);
6625 : 1521 : }
6626 : : else
6627 : : {
6628 : : /* Case 3: Create:
6629 : : s = extract_field <v_out2, 0>
6630 : : for (offset = element_size;
6631 : : offset < vector_size;
6632 : : offset += element_size;)
6633 : : {
6634 : : Create: s' = extract_field <v_out2, offset>
6635 : : Create: s = op <s, s'> // For non SLP cases
6636 : : } */
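 : :
 : :           /* For illustration, for a 4-element vector this performs 4
 : :              BIT_FIELD_REF extracts and, in the non-SLP case, 3 scalar ops:
 : :                s = v[0]; s = op <s, v[1]>; s = op <s, v[2]>; s = op <s, v[3]>;
 : :              in the SLP case the extracted values are instead collected in
 : :              SCALAR_RESULTS and combined modulo the group size afterwards.  */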
6637 : :
6638 : 187 : if (dump_enabled_p ())
6639 : 112 : dump_printf_loc (MSG_NOTE, vect_location,
6640 : : "Reduce using scalar code.\n");
6641 : :
6642 : 187 : int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6643 : 187 : int element_bitsize = tree_to_uhwi (bitsize);
6644 : 187 : tree compute_type = TREE_TYPE (vectype);
6645 : 187 : gimple_seq stmts = NULL;
6646 : 419 : FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6647 : : {
6648 : 232 : int bit_offset;
6649 : 464 : new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6650 : 232 : vec_temp, bitsize, bitsize_zero_node);
6651 : :
6652 : :              /* In SLP we don't need to apply the reduction operation, so we just
6653 : : collect s' values in SCALAR_RESULTS. */
6654 : 232 : if (slp_reduc)
6655 : 222 : scalar_results.safe_push (new_temp);
6656 : :
6657 : 512 : for (bit_offset = element_bitsize;
6658 : 744 : bit_offset < vec_size_in_bits;
6659 : 512 : bit_offset += element_bitsize)
6660 : : {
6661 : 512 : tree bitpos = bitsize_int (bit_offset);
6662 : 512 : new_name = gimple_build (&stmts, BIT_FIELD_REF,
6663 : : compute_type, vec_temp,
6664 : : bitsize, bitpos);
6665 : 512 : if (slp_reduc)
6666 : : {
6667 : :                      /* In SLP we don't need to apply the reduction operation, so
6668 : : we just collect s' values in SCALAR_RESULTS. */
6669 : 502 : new_temp = new_name;
6670 : 502 : scalar_results.safe_push (new_name);
6671 : : }
6672 : : else
6673 : 10 : new_temp = gimple_build (&stmts, code, compute_type,
6674 : : new_name, new_temp);
6675 : : }
6676 : : }
6677 : :
6678 : :           /* The only case where we need to reduce scalar results in SLP is
6679 : :              unrolling.  If the size of SCALAR_RESULTS is greater than
6680 : :              REDUC_GROUP_SIZE, we reduce them by combining elements modulo
6681 : : REDUC_GROUP_SIZE. */
6682 : 187 : if (slp_reduc)
6683 : : {
6684 : 177 : tree res, first_res, new_res;
6685 : :
6686 : : /* Reduce multiple scalar results in case of SLP unrolling. */
6687 : 424 : for (j = group_size; scalar_results.iterate (j, &res);
6688 : : j++)
6689 : : {
6690 : 247 : first_res = scalar_results[j % group_size];
6691 : 247 : new_res = gimple_build (&stmts, code, compute_type,
6692 : : first_res, res);
6693 : 247 : scalar_results[j % group_size] = new_res;
6694 : : }
6695 : 177 : scalar_results.truncate (group_size);
6696 : 831 : for (k = 0; k < group_size; k++)
6697 : 954 : scalar_results[k] = gimple_convert (&stmts, scalar_type,
6698 : 477 : scalar_results[k]);
6699 : : }
6700 : : else
6701 : : {
6702 : : /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
6703 : 10 : new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6704 : 10 : scalar_results.safe_push (new_temp);
6705 : : }
6706 : :
6707 : 187 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6708 : : }
6709 : :
6710 : 1708 : if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6711 : 0 : && induc_val)
6712 : : {
6713 : :          /* Earlier we set the initial value to be a vector of induc_val
6714 : :             values.  Check the result and if it is induc_val then replace it
6715 : :             with the original initial value, unless induc_val is
6716 : : the same as initial_def already. */
6717 : 0 : tree zcompare = make_ssa_name (boolean_type_node);
6718 : 0 : epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6719 : 0 : scalar_results[0], induc_val);
6720 : 0 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6721 : 0 : tree initial_def = reduc_info->reduc_initial_values[0];
6722 : 0 : tree tmp = make_ssa_name (new_scalar_dest);
6723 : 0 : epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6724 : 0 : initial_def, scalar_results[0]);
6725 : 0 : gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6726 : 0 : scalar_results[0] = tmp;
6727 : : }
6728 : : }
6729 : :
6730 : : /* 2.5 Adjust the final result by the initial value of the reduction
6731 : : variable. (When such adjustment is not needed, then
6732 : : 'adjustment_def' is zero). For example, if code is PLUS we create:
6733 : : new_temp = loop_exit_def + adjustment_def */
6734 : :
6735 : 21180 : if (adjustment_def)
6736 : : {
6737 : 15673 : gcc_assert (!slp_reduc || group_size == 1);
6738 : 15673 : gimple_seq stmts = NULL;
6739 : 15673 : if (double_reduc)
6740 : : {
6741 : 0 : gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6742 : 0 : adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6743 : 0 : new_temp = gimple_build (&stmts, code, vectype,
6744 : 0 : reduc_inputs[0], adjustment_def);
6745 : : }
6746 : : else
6747 : : {
6748 : 15673 : new_temp = scalar_results[0];
6749 : 15673 : gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6750 : 15673 : adjustment_def = gimple_convert (&stmts, TREE_TYPE (vectype),
6751 : : adjustment_def);
6752 : 15673 : new_temp = gimple_convert (&stmts, TREE_TYPE (vectype), new_temp);
6753 : 15673 : new_temp = gimple_build (&stmts, code, TREE_TYPE (vectype),
6754 : : new_temp, adjustment_def);
6755 : 15673 : new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6756 : : }
6757 : :
6758 : 15673 : epilog_stmt = gimple_seq_last_stmt (stmts);
6759 : 15673 : gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6760 : 15673 : scalar_results[0] = new_temp;
6761 : : }
6762 : :
6763 : : /* Record this operation if it could be reused by the epilogue loop. */
6764 : 21180 : if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
6765 : 21180 : && reduc_inputs.length () == 1)
6766 : 21006 : loop_vinfo->reusable_accumulators.put (scalar_results[0],
6767 : : { orig_reduc_input, reduc_info });
6768 : :
6769 : 21180 : if (double_reduc)
6770 : 66 : loop = outer_loop;
6771 : :
6772 : : /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6773 : : phis with new adjusted scalar results, i.e., replace use <s_out0>
6774 : : with use <s_out4>.
6775 : :
6776 : : Transform:
6777 : : loop_exit:
6778 : : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6779 : : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6780 : : v_out2 = reduce <v_out1>
6781 : : s_out3 = extract_field <v_out2, 0>
6782 : : s_out4 = adjust_result <s_out3>
6783 : : use <s_out0>
6784 : : use <s_out0>
6785 : :
6786 : : into:
6787 : :
6788 : : loop_exit:
6789 : : s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6790 : : v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6791 : : v_out2 = reduce <v_out1>
6792 : : s_out3 = extract_field <v_out2, 0>
6793 : : s_out4 = adjust_result <s_out3>
6794 : : use <s_out4>
6795 : : use <s_out4> */
6796 : :
6797 : 42360 : gcc_assert (live_out_stmts.size () == scalar_results.length ());
6798 : 21180 : auto_vec<gimple *> phis;
6799 : 42660 : for (k = 0; k < live_out_stmts.size (); k++)
6800 : : {
6801 : 21480 : stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6802 : 21480 : scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6803 : :
6804 : : /* Find the loop-closed-use at the loop exit of the original scalar
6805 : : result. (The reduction result is expected to have two immediate uses,
6806 : : one at the latch block, and one at the loop exit). For double
6807 : : reductions we are looking for exit phis of the outer loop. */
6808 : 88613 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6809 : : {
6810 : 67133 : if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6811 : : {
6812 : 21475 : if (!is_gimple_debug (USE_STMT (use_p))
6813 : 21475 : && gimple_bb (USE_STMT (use_p)) == loop_exit->dest)
6814 : 21467 : phis.safe_push (USE_STMT (use_p));
6815 : : }
6816 : : else
6817 : : {
6818 : 45658 : if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6819 : : {
6820 : 66 : tree phi_res = PHI_RESULT (USE_STMT (use_p));
6821 : :
6822 : 132 : FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6823 : : {
6824 : 66 : if (!flow_bb_inside_loop_p (loop,
6825 : 66 : gimple_bb (USE_STMT (phi_use_p)))
6826 : 66 : && !is_gimple_debug (USE_STMT (phi_use_p)))
6827 : 0 : phis.safe_push (USE_STMT (phi_use_p));
6828 : : }
6829 : : }
6830 : : }
6831 : : }
6832 : :
6833 : 42947 : FOR_EACH_VEC_ELT (phis, i, exit_phi)
6834 : : {
6835 : : /* Replace the uses: */
6836 : 21467 : orig_name = PHI_RESULT (exit_phi);
6837 : :
6838 : : /* Look for a single use at the target of the skip edge. */
6839 : 21467 : if (unify_with_main_loop_p)
6840 : : {
6841 : 27 : use_operand_p use_p;
6842 : 27 : gimple *user;
6843 : 27 : if (!single_imm_use (orig_name, &use_p, &user))
6844 : 0 : gcc_unreachable ();
6845 : 27 : orig_name = gimple_get_lhs (user);
6846 : : }
6847 : :
6848 : 21467 : scalar_result = scalar_results[k];
6849 : 58577 : FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6850 : : {
6851 : 111374 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6852 : 37132 : SET_USE (use_p, scalar_result);
6853 : 37110 : update_stmt (use_stmt);
6854 : 21467 : }
6855 : : }
6856 : :
6857 : 21480 : phis.truncate (0);
6858 : : }
6859 : 21180 : }
6860 : :
6861 : : /* Return a vector of type VECTYPE that is equal to the vector select
6862 : : operation "MASK ? VEC : IDENTITY". Insert the select statements
6863 : : before GSI. */
6864 : :
6865 : : static tree
6866 : 0 : merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6867 : : tree vec, tree identity)
6868 : : {
6869 : 0 : tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6870 : 0 : gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6871 : : mask, vec, identity);
6872 : 0 : gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6873 : 0 : return cond;
6874 : : }
6875 : :
6876 : : /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6877 : : order, starting with LHS. Insert the extraction statements before GSI and
6878 : : associate the new scalar SSA names with variable SCALAR_DEST.
6879 : : If MASK is nonzero, mask the input and then operate on it unconditionally.
6880 : : Return the SSA name for the result. */
6881 : :
6882 : : static tree
6883 : 995 : vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6884 : : tree_code code, tree lhs, tree vector_rhs,
6885 : : tree mask)
6886 : : {
6887 : 995 : tree vectype = TREE_TYPE (vector_rhs);
6888 : 995 : tree scalar_type = TREE_TYPE (vectype);
6889 : 995 : tree bitsize = TYPE_SIZE (scalar_type);
6890 : 995 : unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6891 : 995 : unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6892 : :
6893 : : /* Re-create a VEC_COND_EXPR to mask the input here in order to be able
6894 : : to perform an unconditional element-wise reduction of it. */
6895 : 995 : if (mask)
6896 : : {
6897 : 7 : tree masked_vector_rhs = make_temp_ssa_name (vectype, NULL,
6898 : : "masked_vector_rhs");
6899 : 7 : tree neutral_op = neutral_op_for_reduction (scalar_type, code, NULL_TREE,
6900 : : false);
6901 : 7 : tree vector_identity = build_vector_from_val (vectype, neutral_op);
6902 : 7 : gassign *select = gimple_build_assign (masked_vector_rhs, VEC_COND_EXPR,
6903 : : mask, vector_rhs, vector_identity);
6904 : 7 : gsi_insert_before (gsi, select, GSI_SAME_STMT);
6905 : 7 : vector_rhs = masked_vector_rhs;
6906 : : }
6907 : :
6908 : 995 : for (unsigned HOST_WIDE_INT bit_offset = 0;
6909 : 4259 : bit_offset < vec_size_in_bits;
6910 : 3264 : bit_offset += element_bitsize)
6911 : : {
6912 : 3264 : tree bitpos = bitsize_int (bit_offset);
6913 : 3264 : tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6914 : : bitsize, bitpos);
6915 : :
6916 : 3264 : gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6917 : 3264 : rhs = make_ssa_name (scalar_dest, stmt);
6918 : 3264 : gimple_assign_set_lhs (stmt, rhs);
6919 : 3264 : gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6920 : : /* Fold the vector extract, combining it with a previous reversal
6921 : : as seen in PR90579. */
6922 : 3264 : auto gsi2 = gsi_for_stmt (stmt);
6923 : 3264 : if (fold_stmt (&gsi2, follow_all_ssa_edges))
6924 : 356 : update_stmt (gsi_stmt (gsi2));
6925 : :
6926 : 3264 : stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6927 : 3264 : tree new_name = make_ssa_name (scalar_dest, stmt);
6928 : 3264 : gimple_assign_set_lhs (stmt, new_name);
6929 : 3264 : gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6930 : 3264 : lhs = new_name;
6931 : : }
6932 : 995 : return lhs;
6933 : : }
6934 : :
6935 : : /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6936 : : type of the vector input. */
6937 : :
6938 : : static internal_fn
6939 : 842 : get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6940 : : {
6941 : 842 : internal_fn mask_reduc_fn;
6942 : 842 : internal_fn mask_len_reduc_fn;
6943 : :
6944 : 842 : switch (reduc_fn)
6945 : : {
6946 : 0 : case IFN_FOLD_LEFT_PLUS:
6947 : 0 : mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6948 : 0 : mask_len_reduc_fn = IFN_MASK_LEN_FOLD_LEFT_PLUS;
6949 : 0 : break;
6950 : :
6951 : : default:
6952 : : return IFN_LAST;
6953 : : }
6954 : :
6955 : 0 : if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6956 : : OPTIMIZE_FOR_SPEED))
6957 : : return mask_reduc_fn;
6958 : 0 : if (direct_internal_fn_supported_p (mask_len_reduc_fn, vectype_in,
6959 : : OPTIMIZE_FOR_SPEED))
6960 : : return mask_len_reduc_fn;
6961 : : return IFN_LAST;
6962 : : }
6963 : :
6964 : : /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
6965 : : statement that sets the live-out value. REDUC_DEF_STMT is the phi
6966 : : statement. CODE is the operation performed by STMT_INFO and OPS are
6967 : : its scalar operands. REDUC_INDEX is the index of the operand in
6968 : : OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6969 : : implements in-order reduction, or IFN_LAST if we should open-code it.
6970 : : VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6971 : : that should be used to control the operation in a fully-masked loop. */
6972 : :
6973 : : static bool
6974 : 834 : vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6975 : : stmt_vec_info stmt_info,
6976 : : gimple_stmt_iterator *gsi,
6977 : : slp_tree slp_node,
6978 : : gimple *reduc_def_stmt,
6979 : : code_helper code, internal_fn reduc_fn,
6980 : : int num_ops, tree vectype_in,
6981 : : int reduc_index, vec_loop_masks *masks,
6982 : : vec_loop_lens *lens)
6983 : : {
6984 : 834 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6985 : 834 : tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6986 : 834 : internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6987 : :
6988 : 834 : gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6989 : :
6990 : 834 : bool is_cond_op = false;
6991 : 834 : if (!code.is_tree_code ())
6992 : : {
6993 : 7 : code = conditional_internal_fn_code (internal_fn (code));
6994 : 7 : gcc_assert (code != ERROR_MARK);
6995 : : is_cond_op = true;
6996 : : }
6997 : :
6998 : 834 : gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
6999 : :
7000 : 834 : gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
7001 : : TYPE_VECTOR_SUBPARTS (vectype_in)));
7002 : :
7003 : : /* The operands either come from a binary operation or an IFN_COND operation.
7004 : : The former is a gimple assign with binary rhs and the latter is a
7005 : : gimple call with four arguments. */
7006 : 834 : gcc_assert (num_ops == 2 || num_ops == 4);
7007 : :
7008 : 834 : int group_size = 1;
7009 : 834 : stmt_vec_info scalar_dest_def_info;
7010 : 834 : auto_vec<tree> vec_oprnds0, vec_opmask;
7011 : 834 : vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[(is_cond_op ? 2 : 0)
7012 : 834 : + (1 - reduc_index)],
7013 : : &vec_oprnds0);
7014 : 834 : group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7015 : 834 : scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
7016 : : /* For an IFN_COND_OP we also need the vector mask operand. */
7017 : 834 : if (is_cond_op)
7018 : 7 : vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[0], &vec_opmask);
7019 : :
7020 : 834 : gimple *sdef = vect_orig_stmt (scalar_dest_def_info)->stmt;
7021 : 834 : tree scalar_dest = gimple_get_lhs (sdef);
7022 : 834 : tree scalar_type = TREE_TYPE (scalar_dest);
7023 : 834 : tree reduc_var = gimple_phi_result (reduc_def_stmt);
7024 : :
7025 : 834 : int vec_num = vec_oprnds0.length ();
7026 : 834 : tree vec_elem_type = TREE_TYPE (vectype_out);
7027 : 834 : gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
7028 : :
7029 : 834 : tree vector_identity = NULL_TREE;
7030 : 834 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7031 : : {
7032 : 0 : vector_identity = build_zero_cst (vectype_out);
7033 : 0 : if (!HONOR_SIGNED_ZEROS (vectype_out))
7034 : : ;
7035 : : else
7036 : : {
7037 : 0 : gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));
7038 : 0 : vector_identity = const_unop (NEGATE_EXPR, vectype_out,
7039 : : vector_identity);
7040 : : }
7041 : : }
7042 : :
7043 : 834 : tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
7044 : 834 : int i;
7045 : 834 : tree def0;
7046 : 1829 : FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7047 : : {
7048 : 995 : gimple *new_stmt;
7049 : 995 : tree mask = NULL_TREE;
7050 : 995 : tree len = NULL_TREE;
7051 : 995 : tree bias = NULL_TREE;
7052 : 995 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7053 : : {
7054 : 0 : tree loop_mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
7055 : : vec_num, vectype_in, i);
7056 : 0 : if (is_cond_op)
7057 : 0 : mask = prepare_vec_mask (loop_vinfo, TREE_TYPE (loop_mask),
7058 : 0 : loop_mask, vec_opmask[i], gsi);
7059 : : else
7060 : : mask = loop_mask;
7061 : : }
7062 : 995 : else if (is_cond_op)
7063 : 7 : mask = vec_opmask[i];
7064 : 995 : if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
7065 : : {
7066 : 0 : len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
7067 : : i, 1);
7068 : 0 : signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
7069 : 0 : bias = build_int_cst (intQI_type_node, biasval);
7070 : 0 : if (!is_cond_op)
7071 : 0 : mask = build_minus_one_cst (truth_type_for (vectype_in));
7072 : : }
7073 : :
7074 : : /* Handle MINUS by adding the negative. */
7075 : 995 : if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
7076 : : {
7077 : 0 : tree negated = make_ssa_name (vectype_out);
7078 : 0 : new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
7079 : 0 : gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
7080 : 0 : def0 = negated;
7081 : : }
7082 : :
7083 : 0 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
7084 : 995 : && mask && mask_reduc_fn == IFN_LAST)
7085 : 0 : def0 = merge_with_identity (gsi, mask, vectype_out, def0,
7086 : : vector_identity);
7087 : :
7088 : : /* On the first iteration the input is simply the scalar phi
7089 : : result, and for subsequent iterations it is the output of
7090 : : the preceding operation. */
7091 : 995 : if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
7092 : : {
7093 : 0 : if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
7094 : 0 : new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
7095 : : def0, mask, len, bias);
7096 : 0 : else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
7097 : 0 : new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
7098 : : def0, mask);
7099 : : else
7100 : 0 : new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
7101 : : def0);
7102 : : /* For chained SLP reductions the output of the previous reduction
7103 : : operation serves as the input of the next. For the final statement
7104 : : the output cannot be a temporary - we reuse the original
7105 : : scalar destination of the last statement. */
7106 : 0 : if (i != vec_num - 1)
7107 : : {
7108 : 0 : gimple_set_lhs (new_stmt, scalar_dest_var);
7109 : 0 : reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
7110 : 0 : gimple_set_lhs (new_stmt, reduc_var);
7111 : : }
7112 : : }
7113 : : else
7114 : : {
7115 : 995 : reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
7116 : : tree_code (code), reduc_var, def0,
7117 : : mask);
7118 : 995 : new_stmt = SSA_NAME_DEF_STMT (reduc_var);
7119 : : /* Remove the statement, so that we can use the same code paths
7120 : : as for statements that we've just created. */
7121 : 995 : gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
7122 : 995 : gsi_remove (&tmp_gsi, true);
7123 : : }
7124 : :
7125 : 995 : if (i == vec_num - 1)
7126 : : {
7127 : 834 : gimple_set_lhs (new_stmt, scalar_dest);
7128 : 834 : vect_finish_replace_stmt (loop_vinfo,
7129 : : scalar_dest_def_info,
7130 : : new_stmt);
7131 : : }
7132 : : else
7133 : 161 : vect_finish_stmt_generation (loop_vinfo,
7134 : : scalar_dest_def_info,
7135 : : new_stmt, gsi);
7136 : :
7137 : 995 : slp_node->push_vec_def (new_stmt);
7138 : : }
7139 : :
7140 : 834 : return true;
7141 : 834 : }
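 : : /* Illustrative sketch (editor's note, not from the GCC sources): with two
 : : vector defs V0 and V1 and the scalar PHI result r0, the loop above chains
 : : the copies serially, e.g. when IFN_FOLD_LEFT_PLUS is available:
 : : r1 = .FOLD_LEFT_PLUS (r0, V0);
 : : r2 = .FOLD_LEFT_PLUS (r1, V1);
 : : so the scalar result of one copy feeds the next and the original scalar
 : : additions are never reassociated. */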
7142 : :
7143 : : /* Function is_nonwrapping_integer_induction.
7144 : :
7145 : : Check if the induction described by STMT_VINFO (which is part of loop
7146 : : LOOP) increments and cannot cause overflow. */
7147 : :
7148 : : static bool
7149 : 379 : is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
7150 : : {
7151 : 379 : gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
7152 : 379 : tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
7153 : 379 : tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
7154 : 379 : tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
7155 : 379 : widest_int ni, max_loop_value, lhs_max;
7156 : 379 : wi::overflow_type overflow = wi::OVF_NONE;
7157 : :
7158 : : /* Make sure the loop is integer based. */
7159 : 379 : if (TREE_CODE (base) != INTEGER_CST
7160 : 114 : || TREE_CODE (step) != INTEGER_CST)
7161 : : return false;
7162 : :
7163 : : /* Check that the induction's maximum value will not wrap. */
7164 : :
7165 : 114 : if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
7166 : : return true;
7167 : :
7168 : 8 : if (! max_stmt_executions (loop, &ni))
7169 : : return false;
7170 : :
7171 : 8 : max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
7172 : 8 : &overflow);
7173 : 8 : if (overflow)
7174 : : return false;
7175 : :
7176 : 8 : max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
7177 : 16 : TYPE_SIGN (lhs_type), &overflow);
7178 : 8 : if (overflow)
7179 : : return false;
7180 : :
7181 : 8 : return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
7182 : 8 : <= TYPE_PRECISION (lhs_type));
7183 : 379 : }
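 : : /* Illustrative note (editor's addition, not from the GCC sources): with
 : : constant BASE and STEP and at most NI latch iterations, the checks above
 : : amount to verifying that BASE + STEP * NI, computed in infinite
 : : precision, still fits in the precision of the induction's type; types
 : : with undefined overflow pass trivially. */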
7184 : :
7185 : : /* Check if masking can be supported by inserting a conditional expression.
7186 : : CODE is the code for the operation. COND_FN is the conditional internal
7187 : : function, if it exists. VECTYPE_IN is the type of the vector input. */
7188 : : static bool
7189 : 2305 : use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
7190 : : tree vectype_in)
7191 : : {
7192 : 2305 : if (cond_fn != IFN_LAST
7193 : 2305 : && direct_internal_fn_supported_p (cond_fn, vectype_in,
7194 : : OPTIMIZE_FOR_SPEED))
7195 : : return false;
7196 : :
7197 : 2259 : if (code.is_tree_code ())
7198 : 1984 : switch (tree_code (code))
7199 : : {
7200 : : case DOT_PROD_EXPR:
7201 : : case SAD_EXPR:
7202 : : return true;
7203 : :
7204 : : default:
7205 : : break;
7206 : : }
7207 : : return false;
7208 : : }
7209 : :
7210 : : /* Insert a conditional expression to enable masked vectorization. CODE is the
7211 : : code for the operation. VOP is the array of operands. MASK is the loop
7212 : : mask. GSI is a statement iterator used to place the new conditional
7213 : : expression. */
7214 : : static void
7215 : 4 : build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
7216 : : gimple_stmt_iterator *gsi)
7217 : : {
7218 : 4 : switch (tree_code (code))
7219 : : {
7220 : 4 : case DOT_PROD_EXPR:
7221 : 4 : {
7222 : 4 : tree vectype = TREE_TYPE (vop[1]);
7223 : 4 : tree zero = build_zero_cst (vectype);
7224 : 4 : tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7225 : 4 : gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7226 : : mask, vop[1], zero);
7227 : 4 : gsi_insert_before (gsi, select, GSI_SAME_STMT);
7228 : 4 : vop[1] = masked_op1;
7229 : 4 : break;
7230 : : }
7231 : :
7232 : 0 : case SAD_EXPR:
7233 : 0 : {
7234 : 0 : tree vectype = TREE_TYPE (vop[1]);
7235 : 0 : tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7236 : 0 : gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7237 : : mask, vop[1], vop[0]);
7238 : 0 : gsi_insert_before (gsi, select, GSI_SAME_STMT);
7239 : 0 : vop[1] = masked_op1;
7240 : 0 : break;
7241 : : }
7242 : :
7243 : 0 : default:
7244 : 0 : gcc_unreachable ();
7245 : : }
7246 : 4 : }
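 : : /* Illustrative note (editor's addition, not from the GCC sources): the
 : : selects above work because the substituted values are no-ops for the
 : : accumulation: for DOT_PROD masked-off lanes contribute vop[0] * 0 == 0,
 : : and for SAD they contribute |vop[0] - vop[0]| == 0. */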
7247 : :
7248 : : /* Given an operation with CODE on a loop reduction path whose reduction PHI
7249 : : is specified by REDUC_INFO, the operation produces a scalar result of TYPE
7250 : : and its input vectype is VECTYPE_IN. The vectype of the vectorized result
7251 : : may be different from VECTYPE_IN, either in base type or in number of
7252 : : lanes, as is the case for lane-reducing operations. This function checks
7253 : : whether, and how, partial vectorization of the operation is possible in
7254 : : the context of LOOP_VINFO. */
7255 : :
7256 : : static void
7257 : 8 : vect_reduction_update_partial_vector_usage (loop_vec_info loop_vinfo,
7258 : : stmt_vec_info reduc_info,
7259 : : slp_tree slp_node,
7260 : : code_helper code, tree type,
7261 : : tree vectype_in)
7262 : : {
7263 : 8 : enum vect_reduction_type reduc_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7264 : 8 : internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
7265 : 8 : internal_fn cond_fn = get_conditional_internal_fn (code, type);
7266 : :
7267 : 8 : if (reduc_type != FOLD_LEFT_REDUCTION
7268 : 8 : && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
7269 : 12 : && (cond_fn == IFN_LAST
7270 : 4 : || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7271 : : OPTIMIZE_FOR_SPEED)))
7272 : : {
7273 : 0 : if (dump_enabled_p ())
7274 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7275 : : "can't operate on partial vectors because"
7276 : : " no conditional operation is available.\n");
7277 : 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7278 : : }
7279 : 8 : else if (reduc_type == FOLD_LEFT_REDUCTION
7280 : 8 : && reduc_fn == IFN_LAST
7281 : 8 : && !expand_vec_cond_expr_p (vectype_in, truth_type_for (vectype_in)))
7282 : : {
7283 : 0 : if (dump_enabled_p ())
7284 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7285 : : "can't operate on partial vectors because"
7286 : : " no conditional operation is available.\n");
7287 : 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7288 : : }
7289 : 8 : else if (reduc_type == FOLD_LEFT_REDUCTION
7290 : 0 : && internal_fn_mask_index (reduc_fn) == -1
7291 : 0 : && FLOAT_TYPE_P (vectype_in)
7292 : 8 : && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
7293 : : {
7294 : 0 : if (dump_enabled_p ())
7295 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7296 : : "can't operate on partial vectors because"
7297 : : " signed zeros cannot be preserved.\n");
7298 : 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7299 : : }
7300 : : else
7301 : : {
7302 : 8 : internal_fn mask_reduc_fn
7303 : 8 : = get_masked_reduction_fn (reduc_fn, vectype_in);
7304 : 8 : vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7305 : 8 : vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
7306 : 8 : unsigned nvectors = vect_get_num_copies (loop_vinfo, slp_node,
7307 : : vectype_in);
7308 : :
7309 : 8 : if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
7310 : 0 : vect_record_loop_len (loop_vinfo, lens, nvectors, vectype_in, 1);
7311 : : else
7312 : 8 : vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype_in, NULL);
7313 : : }
7314 : 8 : }
7315 : :
7316 : : /* Check if STMT_INFO is a lane-reducing operation that can be vectorized in
7317 : : the context of LOOP_VINFO; the vector cost is recorded in COST_VEC, and
7318 : : the analysis is for SLP if SLP_NODE is not NULL.
7319 : :
7320 : : The loop reduction path that a lane-reducing operation lies in may also
7321 : : contain normal operations, or other lane-reducing operations with
7322 : : different input type sizes, for example:
7323 : :
7324 : : int sum = 0;
7325 : : for (i)
7326 : : {
7327 : : ...
7328 : : sum += d0[i] * d1[i]; // dot-prod <vector(16) char>
7329 : : sum += w[i]; // widen-sum <vector(16) char>
7330 : : sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
7331 : : sum += n[i]; // normal <vector(4) int>
7332 : : ...
7333 : : }
7334 : :
7335 : : The vectorization factor is essentially determined by the operation whose
7336 : : input vectype has the most lanes ("vector(16) char" in the example), while
7337 : : we need to choose the input vectype with the fewest lanes ("vector(4) int"
7338 : : in the example) to determine the effective number of vector reduction PHIs. */
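 : :
 : : /* Illustrative numbers (editor's note, not from the GCC sources): in
 : : the example above the vectorization factor of 16 follows from
 : : "vector(16) char", while the reduction PHI uses "vector(4) int", so
 : : 16 / 4 = 4 vector accumulators (reduction PHI copies) are needed. */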
7339 : :
7340 : : bool
7341 : 456329 : vectorizable_lane_reducing (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
7342 : : slp_tree slp_node, stmt_vector_for_cost *cost_vec)
7343 : : {
7344 : 456329 : gimple *stmt = stmt_info->stmt;
7345 : :
7346 : 456329 : if (!lane_reducing_stmt_p (stmt))
7347 : : return false;
7348 : :
7349 : 412 : tree type = TREE_TYPE (gimple_assign_lhs (stmt));
7350 : :
7351 : 412 : if (!INTEGRAL_TYPE_P (type))
7352 : : return false;
7353 : :
7354 : : /* Do not try to vectorize bit-precision reductions. */
7355 : 412 : if (!type_has_mode_precision_p (type))
7356 : : return false;
7357 : :
7358 : 412 : stmt_vec_info reduc_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
7359 : :
7360 : : /* TODO: Support lane-reducing operation that does not directly participate
7361 : : in loop reduction. */
7362 : 412 : if (!reduc_info || STMT_VINFO_REDUC_IDX (stmt_info) < 0)
7363 : : return false;
7364 : :
7365 : : /* A lane-reducing pattern inside any inner loop of LOOP_VINFO is not
7366 : : recognized. */
7367 : 412 : gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_reduction_def);
7368 : 412 : gcc_assert (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION);
7369 : :
7370 : 1648 : for (int i = 0; i < (int) gimple_num_ops (stmt) - 1; i++)
7371 : : {
7372 : 1236 : stmt_vec_info def_stmt_info;
7373 : 1236 : slp_tree slp_op;
7374 : 1236 : tree op;
7375 : 1236 : tree vectype;
7376 : 1236 : enum vect_def_type dt;
7377 : :
7378 : 1236 : if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_node, i, &op,
7379 : : &slp_op, &dt, &vectype, &def_stmt_info))
7380 : : {
7381 : 0 : if (dump_enabled_p ())
7382 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7383 : : "use not simple.\n");
7384 : 0 : return false;
7385 : : }
7386 : :
7387 : 1236 : if (!vectype)
7388 : : {
7389 : 18 : vectype = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op),
7390 : : slp_op);
7391 : 18 : if (!vectype)
7392 : : return false;
7393 : : }
7394 : :
7395 : 1236 : if (!vect_maybe_update_slp_op_vectype (slp_op, vectype))
7396 : : {
7397 : 0 : if (dump_enabled_p ())
7398 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7399 : : "incompatible vector types for invariants\n");
7400 : 0 : return false;
7401 : : }
7402 : :
7403 : 1236 : if (i == STMT_VINFO_REDUC_IDX (stmt_info))
7404 : 412 : continue;
7405 : :
7406 : : /* There should be at most one cycle def in the stmt. */
7407 : 824 : if (VECTORIZABLE_CYCLE_DEF (dt))
7408 : : return false;
7409 : : }
7410 : :
7411 : 412 : tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (stmt_info);
7412 : :
7413 : 412 : gcc_assert (vectype_in);
7414 : :
7415 : : /* Compute number of effective vector statements for costing. */
7416 : 412 : unsigned int ncopies_for_cost = vect_get_num_copies (loop_vinfo, slp_node,
7417 : : vectype_in);
7418 : 412 : gcc_assert (ncopies_for_cost >= 1);
7419 : :
7420 : 412 : if (vect_is_emulated_mixed_dot_prod (stmt_info))
7421 : : {
7422 : : /* We need two extra invariants: one that contains the minimum signed
7423 : : value and one that contains half of its negative. */
7424 : 6 : int prologue_stmts = 2;
7425 : 6 : unsigned cost = record_stmt_cost (cost_vec, prologue_stmts,
7426 : : scalar_to_vec, stmt_info, 0,
7427 : : vect_prologue);
7428 : 6 : if (dump_enabled_p ())
7429 : 0 : dump_printf (MSG_NOTE, "vectorizable_lane_reducing: "
7430 : : "extra prologue_cost = %d .\n", cost);
7431 : :
7432 : : /* Three dot-products and a subtraction. */
7433 : 6 : ncopies_for_cost *= 4;
7434 : : }
7435 : :
7436 : 412 : record_stmt_cost (cost_vec, (int) ncopies_for_cost, vector_stmt, stmt_info,
7437 : : 0, vect_body);
7438 : :
7439 : 412 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7440 : : {
7441 : 4 : enum tree_code code = gimple_assign_rhs_code (stmt);
7442 : 4 : vect_reduction_update_partial_vector_usage (loop_vinfo, reduc_info,
7443 : 4 : slp_node, code, type,
7444 : : vectype_in);
7445 : : }
7446 : :
7447 : : /* Transform via vect_transform_reduction. */
7448 : 412 : STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7449 : 412 : return true;
7450 : : }
7451 : :
7452 : : /* Function vectorizable_reduction.
7453 : :
7454 : : Check if STMT_INFO performs a reduction operation that can be vectorized.
7455 : : If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
7456 : : stmt to replace it, put it in VEC_STMT, and insert it at GSI.
7457 : : Return true if STMT_INFO is vectorizable in this way.
7458 : :
7459 : : This function also handles reduction idioms (patterns) that have been
7460 : : recognized in advance during vect_pattern_recog. In this case, STMT_INFO
7461 : : may be of this form:
7462 : : X = pattern_expr (arg0, arg1, ..., X)
7463 : : and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
7464 : : sequence that had been detected and replaced by the pattern-stmt
7465 : : (STMT_INFO).
7466 : :
7467 : : This function also handles reduction of condition expressions, for example:
7468 : : for (int i = 0; i < N; i++)
7469 : : if (a[i] < value)
7470 : : last = a[i];
7471 : : This is handled by vectorising the loop and creating an additional vector
7472 : : containing the loop indexes for which "a[i] < value" was true. In the
7473 : : function epilogue this is reduced to a single max value and then used to
7474 : : index into the vector of results.
7475 : :
7476 : : In some cases of reduction patterns, the type of the reduction variable X is
7477 : : different than the type of the other arguments of STMT_INFO.
7478 : : In such cases, the vectype that is used when transforming STMT_INFO into
7479 : : a vector stmt is different than the vectype that is used to determine the
7480 : : vectorization factor, because it consists of a different number of elements
7481 : : than the actual number of elements that are being operated upon in parallel.
7482 : :
7483 : : For example, consider an accumulation of shorts into an int accumulator.
7484 : : On some targets it's possible to vectorize this pattern operating on 8
7485 : : shorts at a time (hence, the vectype for purposes of determining the
7486 : : vectorization factor should be V8HI); on the other hand, the vectype that
7487 : : is used to create the vector form is actually V4SI (the type of the result).
7488 : :
7489 : : Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
7490 : : indicates what is the actual level of parallelism (V8HI in the example), so
7491 : : that the right vectorization factor would be derived. This vectype
7492 : : corresponds to the type of arguments to the reduction stmt, and should *NOT*
7493 : : be used to create the vectorized stmt. The right vectype for the vectorized
7494 : : stmt is obtained from the type of the result X:
7495 : : get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7496 : :
7497 : : This means that, contrary to "regular" reductions (or "regular" stmts in
7498 : : general), the following equation:
7499 : : STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7500 : : does *NOT* necessarily hold for reduction patterns. */
7501 : :
7502 : : bool
7503 : 455917 : vectorizable_reduction (loop_vec_info loop_vinfo,
7504 : : stmt_vec_info stmt_info, slp_tree slp_node,
7505 : : slp_instance slp_node_instance,
7506 : : stmt_vector_for_cost *cost_vec)
7507 : : {
7508 : 455917 : tree vectype_in = NULL_TREE;
7509 : 455917 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7510 : 455917 : enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
7511 : 455917 : stmt_vec_info cond_stmt_vinfo = NULL;
7512 : 455917 : int i;
7513 : 455917 : int ncopies;
7514 : 455917 : bool single_defuse_cycle = false;
7515 : 455917 : bool nested_cycle = false;
7516 : 455917 : bool double_reduc = false;
7517 : 455917 : tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
7518 : 455917 : tree cond_reduc_val = NULL_TREE;
7519 : :
7520 : : /* Make sure it was already recognized as a reduction computation. */
7521 : 455917 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
7522 : : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
7523 : 455917 : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
7524 : : return false;
7525 : :
7526 : : /* The stmt we store reduction analysis meta on. */
7527 : 63181 : stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7528 : 63181 : reduc_info->is_reduc_info = true;
7529 : :
7530 : 63181 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7531 : : {
7532 : 1793 : if (is_a <gphi *> (stmt_info->stmt))
7533 : : {
7534 : : /* We eventually need to set a vector type on invariant
7535 : : arguments. */
7536 : : unsigned j;
7537 : : slp_tree child;
7538 : 5379 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7539 : 3586 : if (!vect_maybe_update_slp_op_vectype
7540 : 3586 : (child, SLP_TREE_VECTYPE (slp_node)))
7541 : : {
7542 : 0 : if (dump_enabled_p ())
7543 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7544 : : "incompatible vector types for "
7545 : : "invariants\n");
7546 : 0 : return false;
7547 : : }
7548 : : /* Analysis for double-reduction is done on the outer
7549 : : loop PHI; nested cycles have no further restrictions. */
7550 : 1793 : STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
7551 : : }
7552 : : else
7553 : 0 : STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7554 : 1793 : return true;
7555 : : }
7556 : :
7557 : 61388 : stmt_vec_info orig_stmt_of_analysis = stmt_info;
7558 : 61388 : stmt_vec_info phi_info = stmt_info;
7559 : 61388 : if (!is_a <gphi *> (stmt_info->stmt))
7560 : : {
7561 : 7294 : STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7562 : 7294 : return true;
7563 : : }
7564 : 54094 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7565 : : {
7566 : 382 : if (gimple_bb (stmt_info->stmt) != loop->header)
7567 : : {
7568 : : /* For SLP we arrive here for both the inner loop LC PHI and
7569 : : the outer loop PHI. The latter is what we want to analyze
7570 : : the reduction with. The LC PHI is handled by
7571 : : vectorizable_lc_phi. */
7572 : 109 : return gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) == 2;
7573 : : }
7574 : 273 : use_operand_p use_p;
7575 : 273 : gimple *use_stmt;
7576 : 273 : bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
7577 : : &use_p, &use_stmt);
7578 : 273 : gcc_assert (res);
7579 : 273 : phi_info = loop_vinfo->lookup_stmt (use_stmt);
7580 : : }
7581 : :
7582 : 53985 : slp_node_instance->reduc_phis = slp_node;
7583 : : /* ??? We're leaving slp_node to point to the PHIs; we only
7584 : : need it to get at the number of vector stmts, which wasn't
7585 : : yet initialized for the instance root. */
7586 : :
7587 : : /* PHIs should not participate in patterns. */
7588 : 53985 : gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7589 : 53985 : gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7590 : :
7591 : : /* Verify following REDUC_IDX from the latch def leads us back to the PHI
7592 : : and compute the reduction chain length. Discover the real
7593 : : reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
7594 : 53985 : tree reduc_def
7595 : 53985 : = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
7596 : : loop_latch_edge
7597 : : (gimple_bb (reduc_def_phi)->loop_father));
7598 : 53985 : unsigned reduc_chain_length = 0;
7599 : 53985 : bool only_slp_reduc_chain = true;
7600 : 53985 : stmt_info = NULL;
7601 : 53985 : slp_tree slp_for_stmt_info = slp_node_instance->root;
7602 : : /* For double-reductions we start SLP analysis at the inner loop LC PHI
7603 : : which is the def of the outer loop live stmt. */
7604 : 53985 : if (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def)
7605 : 273 : slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
7606 : 118473 : while (reduc_def != PHI_RESULT (reduc_def_phi))
7607 : : {
7608 : 64512 : stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
7609 : 64512 : stmt_vec_info vdef = vect_stmt_to_vectorize (def);
7610 : 64512 : int reduc_idx = STMT_VINFO_REDUC_IDX (vdef);
7611 : :
7612 : 64512 : if (reduc_idx == -1)
7613 : : {
7614 : 0 : if (dump_enabled_p ())
7615 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7616 : : "reduction chain broken by patterns.\n");
7617 : 24 : return false;
7618 : : }
7619 : 64512 : if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
7620 : 62757 : only_slp_reduc_chain = false;
7621 : : /* For epilogue generation, live members of the chain need
7622 : : to point back to the PHI via their original stmt for
7623 : : info_for_reduction to work. For SLP we need to look at
7624 : : all lanes here - even though we will only vectorize from
7625 : : the SLP node with live lane zero, the other live lanes also
7626 : : need to be identified as part of a reduction so that code
7627 : : generation for them can be skipped. */
7628 : 64512 : if (slp_for_stmt_info)
7629 : : {
7630 : 275689 : for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
7631 : 82153 : if (STMT_VINFO_LIVE_P (s))
7632 : 63874 : STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
7633 : : }
7634 : 0 : else if (STMT_VINFO_LIVE_P (vdef))
7635 : 0 : STMT_VINFO_REDUC_DEF (def) = phi_info;
7636 : 64512 : gimple_match_op op;
7637 : 64512 : if (!gimple_extract_op (vdef->stmt, &op))
7638 : : {
7639 : 0 : if (dump_enabled_p ())
7640 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7641 : : "reduction chain includes unsupported"
7642 : : " statement type.\n");
7643 : 0 : return false;
7644 : : }
7645 : 64512 : if (CONVERT_EXPR_CODE_P (op.code))
7646 : : {
7647 : 4322 : if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
7648 : : {
7649 : 24 : if (dump_enabled_p ())
7650 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7651 : : "conversion in the reduction chain.\n");
7652 : 24 : return false;
7653 : : }
7654 : : }
7655 : : else
7656 : : {
7657 : : /* First non-conversion stmt. */
7658 : 60190 : if (!stmt_info)
7659 : 53985 : stmt_info = vdef;
7660 : :
7661 : 60190 : if (lane_reducing_op_p (op.code))
7662 : : {
7663 : 624 : enum vect_def_type dt;
7664 : 624 : tree vectype_op;
7665 : :
7666 : : /* The last operand of lane-reducing operation is for
7667 : : reduction. */
7668 : 624 : gcc_assert (reduc_idx > 0 && reduc_idx == (int) op.num_ops - 1);
7669 : :
7670 : 624 : if (!vect_is_simple_use (op.ops[0], loop_vinfo, &dt, &vectype_op))
7671 : 0 : return false;
7672 : :
7673 : 624 : tree type_op = TREE_TYPE (op.ops[0]);
7674 : :
7675 : 624 : if (!vectype_op)
7676 : : {
7677 : 11 : vectype_op = get_vectype_for_scalar_type (loop_vinfo,
7678 : : type_op);
7679 : 11 : if (!vectype_op)
7680 : : return false;
7681 : : }
7682 : :
7683 : : /* For lane-reducing operation vectorizable analysis needs the
7684 : : reduction PHI information. */
7685 : 624 : STMT_VINFO_REDUC_DEF (def) = phi_info;
7686 : :
7687 : : /* Each lane-reducing operation has its own input vectype, while
7688 : : reduction PHI will record the input vectype with the least
7689 : : lanes. */
7690 : 624 : STMT_VINFO_REDUC_VECTYPE_IN (vdef) = vectype_op;
7691 : :
7692 : : /* To accommodate lane-reducing operations of mixed input
7693 : : vectypes, choose input vectype with the least lanes for the
7694 : : reduction PHI statement, which would result in the most
7695 : : ncopies for vectorized reduction results. */
7696 : 624 : if (!vectype_in
7697 : 624 : || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7698 : 470 : < GET_MODE_SIZE (SCALAR_TYPE_MODE (type_op))))
7699 : 389 : vectype_in = vectype_op;
7700 : : }
7701 : : else
7702 : 59566 : vectype_in = STMT_VINFO_VECTYPE (phi_info);
7703 : : }
7704 : :
7705 : 64488 : reduc_def = op.ops[reduc_idx];
7706 : 64488 : reduc_chain_length++;
7707 : 64488 : if (!stmt_info)
7708 : 2155 : slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
7709 : : }
7710 : : /* PHIs should not participate in patterns. */
7711 : 53961 : gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7712 : :
7713 : 53961 : if (nested_in_vect_loop_p (loop, stmt_info))
7714 : : {
7715 : 53961 : loop = loop->inner;
7716 : 53961 : nested_cycle = true;
7717 : : }
7718 : :
7719 : : /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
7720 : : element. */
7721 : 53961 : if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7722 : : {
7723 : 323 : gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
7724 : : stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7725 : : }
7726 : 53961 : if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7727 : 323 : gcc_assert (REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
7728 : :
7729 : : /* 1. Is vectorizable reduction? */
7730 : : /* Not supportable if the reduction variable is used in the loop, unless
7731 : : it's a reduction chain. */
7732 : 53961 : if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
7733 : 53961 : && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7734 : : return false;
7735 : :
7736 : : /* Reductions that are not used even in an enclosing outer-loop
7737 : : are expected to be "live" (used out of the loop). */
7738 : 53961 : if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
7739 : 0 : && !STMT_VINFO_LIVE_P (stmt_info))
7740 : : return false;
7741 : :
7742 : : /* 2. Has this been recognized as a reduction pattern?
7743 : :
7744 : : Check if STMT represents a pattern that has been recognized
7745 : : in earlier analysis stages. For stmts that represent a pattern,
7746 : : the STMT_VINFO_RELATED_STMT field records the last stmt in
7747 : : the original sequence that constitutes the pattern. */
7748 : :
7749 : 53961 : stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7750 : 53961 : if (orig_stmt_info)
7751 : : {
7752 : 3042 : gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
7753 : 3042 : gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
7754 : : }
7755 : :
7756 : : /* 3. Check the operands of the operation. The first operands are defined
7757 : : inside the loop body. The last operand is the reduction variable,
7758 : : which is defined by the loop-header-phi. */
7759 : :
7760 : 53961 : tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7761 : 53961 : STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
7762 : 53961 : STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
7763 : :
7764 : 53961 : gimple_match_op op;
7765 : 53961 : if (!gimple_extract_op (stmt_info->stmt, &op))
7766 : 0 : gcc_unreachable ();
7767 : 53961 : bool lane_reducing = lane_reducing_op_p (op.code);
7768 : :
7769 : 53961 : if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7770 : 17452 : && !SCALAR_FLOAT_TYPE_P (op.type))
7771 : : return false;
7772 : :
7773 : : /* Do not try to vectorize bit-precision reductions. */
7774 : 53961 : if (!type_has_mode_precision_p (op.type))
7775 : : return false;
7776 : :
7777 : : /* Lane-reducing ops can also never be used in an SLP reduction group
7778 : : since we'll mix lanes belonging to different reductions. But it's
7779 : : OK to use them in a reduction chain or when the reduction group
7780 : : has just one element. */
7781 : 52410 : if (lane_reducing
7782 : 389 : && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7783 : 52775 : && SLP_TREE_LANES (slp_node) > 1)
7784 : : {
7785 : 0 : if (dump_enabled_p ())
7786 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7787 : : "lane-reducing reduction in reduction group.\n");
7788 : 0 : return false;
7789 : : }
7790 : :
7791 : : /* All uses but the last are expected to be defined in the loop.
7792 : : The last use is the reduction variable. In case of nested cycle this
7793 : : assumption is not true: we use reduc_index to record the index of the
7794 : : reduction variable. */
7795 : 52410 : slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7796 : 52410 : tree *vectype_op = XALLOCAVEC (tree, op.num_ops);
7797 : : /* We need to skip an extra operand for COND_EXPRs with embedded
7798 : : comparison. */
7799 : 52410 : unsigned opno_adjust = 0;
7800 : 52410 : if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
7801 : 52410 : opno_adjust = 1;
7802 : 164859 : for (i = 0; i < (int) op.num_ops; i++)
7803 : : {
7804 : : /* The condition of COND_EXPR is checked in vectorizable_condition(). */
7805 : 112503 : if (i == 0 && op.code == COND_EXPR)
7806 : 56407 : continue;
7807 : :
7808 : 111767 : stmt_vec_info def_stmt_info;
7809 : 111767 : enum vect_def_type dt;
7810 : 111767 : if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
7811 : 111767 : i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
7812 : 111767 : &vectype_op[i], &def_stmt_info))
7813 : : {
7814 : 0 : if (dump_enabled_p ())
7815 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7816 : : "use not simple.\n");
7817 : 54 : return false;
7818 : : }
7819 : :
7820 : : /* Skip reduction operands, and for an IFN_COND_OP we might hit the
7821 : : reduction operand twice (once as definition, once as else). */
7822 : 111767 : if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
7823 : 55671 : continue;
7824 : :
7825 : : /* There should be only one cycle def in the stmt, the one
7826 : : leading to reduc_def. */
7827 : 56096 : if (VECTORIZABLE_CYCLE_DEF (dt))
7828 : : return false;
7829 : :
7830 : 56042 : if (!vectype_op[i])
7831 : 3883 : vectype_op[i]
7832 : 3883 : = get_vectype_for_scalar_type (loop_vinfo,
7833 : 3883 : TREE_TYPE (op.ops[i]), slp_op[i]);
7834 : :
7835 : : /* Record how the non-reduction-def value of COND_EXPR is defined.
7836 : : ??? For a chain of multiple CONDs we'd have to match them all up. */
7837 : 56042 : if (op.code == COND_EXPR && reduc_chain_length == 1)
7838 : : {
7839 : 701 : if (dt == vect_constant_def)
7840 : : {
7841 : 51 : cond_reduc_dt = dt;
7842 : 51 : cond_reduc_val = op.ops[i];
7843 : : }
7844 : 650 : else if (dt == vect_induction_def
7845 : 379 : && def_stmt_info
7846 : 1029 : && is_nonwrapping_integer_induction (def_stmt_info, loop))
7847 : : {
7848 : 114 : cond_reduc_dt = dt;
7849 : 114 : cond_stmt_vinfo = def_stmt_info;
7850 : : }
7851 : : }
7852 : : }
7853 : :
7854 : 52356 : enum vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (phi_info);
7855 : 52356 : STMT_VINFO_REDUC_TYPE (reduc_info) = reduction_type;
7856 : : /* If we have a condition reduction, see if we can simplify it further. */
7857 : 52356 : if (reduction_type == COND_REDUCTION)
7858 : : {
7859 : 706 : if (SLP_TREE_LANES (slp_node) != 1)
7860 : : return false;
7861 : :
7862 : : /* When the condition uses the reduction value in the condition, fail. */
7863 : 706 : if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
7864 : : {
7865 : 0 : if (dump_enabled_p ())
7866 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7867 : : "condition depends on previous iteration\n");
7868 : 0 : return false;
7869 : : }
7870 : :
7871 : 706 : if (reduc_chain_length == 1
7872 : 706 : && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in,
7873 : : OPTIMIZE_FOR_SPEED)
7874 : 671 : || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
7875 : : vectype_in,
7876 : : OPTIMIZE_FOR_SPEED)))
7877 : : {
7878 : 0 : if (dump_enabled_p ())
7879 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7880 : : "optimizing condition reduction with"
7881 : : " FOLD_EXTRACT_LAST.\n");
7882 : 0 : STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7883 : : }
7884 : 706 : else if (cond_reduc_dt == vect_induction_def)
7885 : : {
7886 : 101 : tree base
7887 : : = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7888 : 101 : tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7889 : :
7890 : 101 : gcc_assert (TREE_CODE (base) == INTEGER_CST
7891 : : && TREE_CODE (step) == INTEGER_CST);
7892 : 101 : cond_reduc_val = NULL_TREE;
7893 : 101 : enum tree_code cond_reduc_op_code = ERROR_MARK;
7894 : 101 : tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7895 : 101 : if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7896 : : ;
7897 : : /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
7898 : : above base; punt if base is the minimum value of the type for
7899 : : MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
7900 : 89 : else if (tree_int_cst_sgn (step) == -1)
7901 : : {
7902 : 20 : cond_reduc_op_code = MIN_EXPR;
7903 : 20 : if (tree_int_cst_sgn (base) == -1)
7904 : 0 : cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7905 : 20 : else if (tree_int_cst_lt (base,
7906 : 20 : TYPE_MAX_VALUE (TREE_TYPE (base))))
7907 : 20 : cond_reduc_val
7908 : 20 : = int_const_binop (PLUS_EXPR, base, integer_one_node);
7909 : : }
7910 : : else
7911 : : {
7912 : 69 : cond_reduc_op_code = MAX_EXPR;
7913 : 69 : if (tree_int_cst_sgn (base) == 1)
7914 : 0 : cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7915 : 69 : else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7916 : : base))
7917 : 69 : cond_reduc_val
7918 : 69 : = int_const_binop (MINUS_EXPR, base, integer_one_node);
7919 : : }
7920 : 89 : if (cond_reduc_val)
7921 : : {
7922 : 89 : if (dump_enabled_p ())
7923 : 64 : dump_printf_loc (MSG_NOTE, vect_location,
7924 : : "condition expression based on "
7925 : : "integer induction.\n");
7926 : 89 : STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
7927 : 89 : STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
7928 : 89 : = cond_reduc_val;
7929 : 89 : STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7930 : : }
7931 : : }
7932 : 605 : else if (cond_reduc_dt == vect_constant_def)
7933 : : {
7934 : 46 : enum vect_def_type cond_initial_dt;
7935 : 46 : tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7936 : 46 : vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7937 : 46 : if (cond_initial_dt == vect_constant_def
7938 : 65 : && types_compatible_p (TREE_TYPE (cond_initial_val),
7939 : 19 : TREE_TYPE (cond_reduc_val)))
7940 : : {
7941 : 19 : tree e = fold_binary (LE_EXPR, boolean_type_node,
7942 : : cond_initial_val, cond_reduc_val);
7943 : 19 : if (e && (integer_onep (e) || integer_zerop (e)))
7944 : : {
7945 : 19 : if (dump_enabled_p ())
7946 : 14 : dump_printf_loc (MSG_NOTE, vect_location,
7947 : : "condition expression based on "
7948 : : "compile time constant.\n");
7949 : : /* Record reduction code at analysis stage. */
7950 : 19 : STMT_VINFO_REDUC_CODE (reduc_info)
7951 : 19 : = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7952 : 19 : STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
7953 : : }
7954 : : }
7955 : : }
7956 : : }
7957 : :
7958 : 52356 : if (STMT_VINFO_LIVE_P (phi_info))
7959 : : return false;
7960 : :
7961 : 52356 : ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7962 : :
7963 : 52356 : gcc_assert (ncopies >= 1);
7964 : :
7965 : 52356 : poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7966 : :
7967 : 52356 : if (nested_cycle)
7968 : : {
7969 : 241 : gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
7970 : : == vect_double_reduction_def);
7971 : : double_reduc = true;
7972 : : }
7973 : :
7974 : : /* 4.2. Check support for the epilog operation.
7975 : :
7976 : : If STMT represents a reduction pattern, then the type of the
7977 : : reduction variable may be different than the type of the rest
7978 : : of the arguments. For example, consider the case of accumulation
7979 : : of shorts into an int accumulator; The original code:
7980 : : S1: int_a = (int) short_a;
7981 : : orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
7982 : :
7983 : : was replaced with:
7984 : : STMT: int_acc = widen_sum <short_a, int_acc>
7985 : :
7986 : : This means that:
7987 : : 1. The tree-code that is used to create the vector operation in the
7988 : : epilog code (that reduces the partial results) is not the
7989 : : tree-code of STMT, but is rather the tree-code of the original
7990 : : stmt from the pattern that STMT is replacing. I.e, in the example
7991 : : above we want to use 'widen_sum' in the loop, but 'plus' in the
7992 : : epilog.
7993 : : 2. The type (mode) we use to check available target support
7994 : : for the vector operation to be created in the *epilog*, is
7995 : : determined by the type of the reduction variable (in the example
7996 : : above we'd check this: optab_handler (plus_optab, vect_int_mode])).
7997 : : However the type (mode) we use to check available target support
7998 : : for the vector operation to be created *inside the loop*, is
7999 : : determined by the type of the other arguments to STMT (in the
8000 : : example we'd check this: optab_handler (widen_sum_optab,
8001 : : vect_short_mode)).
8002 : :
8003 : : This is contrary to "regular" reductions, in which the types of all
8004 : : the arguments are the same as the type of the reduction variable.
8005 : : For "regular" reductions we can therefore use the same vector type
8006 : : (and also the same tree-code) when generating the epilog code and
8007 : : when generating the code inside the loop. */
8008 : :
8009 : 52356 : code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
8010 : :
8011 : : /* If-conversion might already have created a conditional operation like
8012 : : IFN_COND_ADD. Use the internal code for the following checks. */
8013 : 52356 : if (orig_code.is_internal_fn ())
8014 : : {
8015 : 2643 : tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
8016 : 2643 : orig_code = new_code != ERROR_MARK ? new_code : orig_code;
8017 : : }
8018 : :
8019 : 52356 : STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
8020 : :
8021 : 52356 : reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
8022 : 52356 : if (reduction_type == TREE_CODE_REDUCTION)
8023 : : {
8024 : : /* Check whether it's ok to change the order of the computation.
8025 : : Generally, when vectorizing a reduction we change the order of the
8026 : : computation. This may change the behavior of the program in some
8027 : : cases, so we need to check that this is ok. One exception is when
8028 : : vectorizing an outer-loop: the inner-loop is executed sequentially,
8029 : : and therefore vectorizing reductions in the inner-loop during
8030 : : outer-loop vectorization is safe. Likewise when we are vectorizing
8031 : : a series of reductions using SLP and the VF is one, the reductions
8032 : : are performed in scalar order. */
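 : : /* Illustrative note (editor's addition, not from the GCC sources): with
 : : VF == 2 the sum a[0] + a[1] + a[2] + a[3] is effectively evaluated as
 : : (a[0] + a[2]) + (a[1] + a[3]), which is why floating-point reductions
 : : need either permission to reassociate or the in-order (FOLD_LEFT)
 : : handling selected below. */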
8033 : 49762 : if (!REDUC_GROUP_FIRST_ELEMENT (stmt_info)
8034 : 49762 : && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
8035 : : ;
8036 : 49654 : else if (needs_fold_left_reduction_p (op.type, orig_code))
8037 : : {
8038 : : /* When vectorizing a reduction chain w/o SLP the reduction PHI
8039 : : is not directly used in stmt. */
8040 : 3422 : if (!only_slp_reduc_chain
8041 : 3422 : && reduc_chain_length != 1)
8042 : : {
8043 : 53 : if (dump_enabled_p ())
8044 : 6 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8045 : : "in-order reduction chain without SLP.\n");
8046 : 53 : return false;
8047 : : }
8048 : 3369 : STMT_VINFO_REDUC_TYPE (reduc_info)
8049 : 3369 : = reduction_type = FOLD_LEFT_REDUCTION;
8050 : : }
8051 : 46232 : else if (!commutative_binary_op_p (orig_code, op.type)
8052 : 46232 : || !associative_binary_op_p (orig_code, op.type))
8053 : : {
8054 : 138 : if (dump_enabled_p ())
8055 : 12 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8056 : : "reduction: not commutative/associative\n");
8057 : 138 : return false;
8058 : : }
8059 : : }
8060 : :
8061 : 3369 : if ((reduction_type == COND_REDUCTION
8062 : : || reduction_type == INTEGER_INDUC_COND_REDUCTION
8063 : : || reduction_type == CONST_COND_REDUCTION
8064 : 48796 : || reduction_type == EXTRACT_LAST_REDUCTION)
8065 : : && 1
8066 : 724 : && ncopies > 1)
8067 : : {
8068 : 292 : if (dump_enabled_p ())
8069 : 84 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8070 : : "multiple types in condition reduction.\n");
8071 : 292 : return false;
8072 : : }
8073 : :
8074 : 51873 : internal_fn reduc_fn = IFN_LAST;
8075 : 51873 : if (reduction_type == TREE_CODE_REDUCTION
8076 : 51873 : || reduction_type == FOLD_LEFT_REDUCTION
8077 : : || reduction_type == INTEGER_INDUC_COND_REDUCTION
8078 : 432 : || reduction_type == CONST_COND_REDUCTION)
8079 : : {
8080 : 46320 : if (reduction_type == FOLD_LEFT_REDUCTION
8081 : 56002 : ? fold_left_reduction_fn (orig_code, &reduc_fn)
8082 : 46320 : : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
8083 : : {
8084 : 50881 : if (reduc_fn != IFN_LAST
8085 : 50881 : && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
8086 : : OPTIMIZE_FOR_SPEED))
8087 : : {
8088 : 11143 : if (dump_enabled_p ())
8089 : 802 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8090 : : "reduc op not supported by target.\n");
8091 : :
8092 : 11143 : reduc_fn = IFN_LAST;
8093 : : }
8094 : : }
8095 : : else
8096 : : {
8097 : 678 : if (!nested_cycle || double_reduc)
8098 : : {
8099 : 678 : if (dump_enabled_p ())
8100 : 48 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8101 : : "no reduc code for scalar code.\n");
8102 : :
8103 : 678 : return false;
8104 : : }
8105 : : }
8106 : : }
8107 : 314 : else if (reduction_type == COND_REDUCTION)
8108 : : {
8109 : 314 : int scalar_precision
8110 : 314 : = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
8111 : 314 : cr_index_scalar_type = make_unsigned_type (scalar_precision);
8112 : 314 : cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
8113 : : vectype_out);
8114 : :
8115 : 314 : if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
8116 : : OPTIMIZE_FOR_SPEED))
8117 : 9 : reduc_fn = IFN_REDUC_MAX;
8118 : : }
8119 : 51195 : STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
8120 : :
8121 : 51195 : if (reduction_type != EXTRACT_LAST_REDUCTION
8122 : : && (!nested_cycle || double_reduc)
8123 : : && reduc_fn == IFN_LAST
8124 : : && !nunits_out.is_constant ())
8125 : : {
8126 : : if (dump_enabled_p ())
8127 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8128 : : "missing target support for reduction on"
8129 : : " variable-length vectors.\n");
8130 : : return false;
8131 : : }
8132 : :
8133 : : /* For SLP reductions, see if there is a neutral value we can use. */
8134 : 51195 : tree neutral_op = NULL_TREE;
8135 : 51195 : tree initial_value = NULL_TREE;
8136 : 51195 : if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
8137 : 323 : initial_value = vect_phi_initial_value (reduc_def_phi);
8138 : 51195 : neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8139 : : orig_code, initial_value);
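neutral_op_for_reduction returns the identity element of the reduction operation; lanes initialized with it do not affect the final result. A hypothetical helper sketching the usual choices for integer codes (illustration only; the authoritative mapping is neutral_op_for_reduction itself):

    #include <stdio.h>
    #include <limits.h>

    /* Hypothetical helper, for illustration only: identity elements for a
       few integer reduction codes.  Combining the identity with any value
       x yields x, so it can safely fill extra lanes of the initial vector.  */
    static int
    neutral_int (char op)
    {
      switch (op)
        {
        case '+': return 0;        /* x + 0 == x  */
        case '*': return 1;        /* x * 1 == x  */
        case '&': return -1;       /* x & ~0 == x */
        case '|': return 0;        /* x | 0 == x  */
        case '^': return 0;        /* x ^ 0 == x  */
        case '<': return INT_MAX;  /* MIN: min (x, INT_MAX) == x */
        case '>': return INT_MIN;  /* MAX: max (x, INT_MIN) == x */
        default:  return 0;
        }
    }

    int main (void)
    {
      printf ("plus: %d  and: %d  min: %d\n",
              neutral_int ('+'), neutral_int ('&'), neutral_int ('<'));
      return 0;
    }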
8140 : :
8141 : 51195 : if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
8142 : : {
8143 : : /* We can't support in-order reductions of code such as this:
8144 : :
8145 : : for (int i = 0; i < n1; ++i)
8146 : : for (int j = 0; j < n2; ++j)
8147 : : l += a[j];
8148 : :
8149 : : since GCC effectively transforms the loop when vectorizing:
8150 : :
8151 : : for (int i = 0; i < n1 / VF; ++i)
8152 : : for (int j = 0; j < n2; ++j)
8153 : : for (int k = 0; k < VF; ++k)
8154 : : l += a[j];
8155 : :
8156 : : which is a reassociation of the original operation. */
8157 : 56 : if (dump_enabled_p ())
8158 : 20 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8159 : : "in-order double reduction not supported.\n");
8160 : :
8161 : 56 : return false;
8162 : : }
8163 : :
8164 : 51139 : if (reduction_type == FOLD_LEFT_REDUCTION
8165 : 4505 : && SLP_TREE_LANES (slp_node) > 1
8166 : 51255 : && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8167 : : {
8168 : : /* We cannot use in-order reductions in this case because there is
8169 : : an implicit reassociation of the operations involved. */
8170 : 43 : if (dump_enabled_p ())
8171 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8172 : : "in-order unchained SLP reductions not supported.\n");
8173 : 43 : return false;
8174 : : }
8175 : :
8176 : : /* For double reductions, and for SLP reductions with a neutral value,
8177 : : we construct a variable-length initial vector by loading a vector
8178 : : full of the neutral value and then shift-and-inserting the start
8179 : : values into the low-numbered elements. */
8180 : 51096 : if ((double_reduc || neutral_op)
8181 : : && !nunits_out.is_constant ()
8182 : : && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
8183 : : vectype_out, OPTIMIZE_FOR_SPEED))
8184 : : {
8185 : : if (dump_enabled_p ())
8186 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8187 : : "reduction on variable-length vectors requires"
8188 : : " target support for a vector-shift-and-insert"
8189 : : " operation.\n");
8190 : : return false;
8191 : : }
8192 : :
8193 : : /* Check extra constraints for variable-length unchained SLP reductions. */
8194 : 51096 : if (!REDUC_GROUP_FIRST_ELEMENT (stmt_info)
8195 : : && !nunits_out.is_constant ())
8196 : : {
8197 : : /* We checked above that we could build the initial vector when
8198 : : there's a neutral element value. Check here for the case in
8199 : : which each SLP statement has its own initial value and in which
8200 : : that value needs to be repeated for every instance of the
8201 : : statement within the initial vector. */
8202 : : unsigned int group_size = SLP_TREE_LANES (slp_node);
8203 : : if (!neutral_op
8204 : : && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
8205 : : TREE_TYPE (vectype_out)))
8206 : : {
8207 : : if (dump_enabled_p ())
8208 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8209 : : "unsupported form of SLP reduction for"
8210 : : " variable-length vectors: cannot build"
8211 : : " initial vector.\n");
8212 : : return false;
8213 : : }
8214 : : /* The epilogue code relies on the number of elements being a multiple
8215 : : of the group size. The duplicate-and-interleave approach to setting
8216 : : up the initial vector does too. */
8217 : : if (!multiple_p (nunits_out, group_size))
8218 : : {
8219 : : if (dump_enabled_p ())
8220 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8221 : : "unsupported form of SLP reduction for"
8222 : : " variable-length vectors: the vector size"
8223 : : " is not a multiple of the number of results.\n");
8224 : : return false;
8225 : : }
8226 : : }
8227 : :
8228 : 51096 : if (reduction_type == COND_REDUCTION)
8229 : : {
8230 : 314 : widest_int ni;
8231 : :
8232 : 314 : if (! max_loop_iterations (loop, &ni))
8233 : : {
8234 : 0 : if (dump_enabled_p ())
8235 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
8236 : : "loop count not known, cannot create cond "
8237 : : "reduction.\n");
8238 : 0 : return false;
8239 : : }
8240 : : /* Convert backedges to iterations. */
8241 : 314 : ni += 1;
8242 : :
8243 : : /* The additional index will have the same type as the condition. Check
8244 : : that the loop iteration count fits into this type less one (because
8245 : : the zero slot is reserved for when there are no matches). */
8246 : 314 : tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
8247 : 314 : if (wi::geu_p (ni, wi::to_widest (max_index)))
8248 : : {
8249 : 90 : if (dump_enabled_p ())
8250 : 54 : dump_printf_loc (MSG_NOTE, vect_location,
8251 : : "loop size is greater than data size.\n");
8252 : 90 : return false;
8253 : : }
8254 : 314 : }
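A hedged model of the bound enforced above: the condition-reduction index vector uses an unsigned type of the condition's precision, index 0 is reserved for "no lane matched", and iteration counts that reach the type's maximum are rejected. Illustration only, not GCC code:

    #include <stdio.h>
    #include <stdint.h>

    int main (void)
    {
      /* Suppose the condition type has 16-bit precision, so the index
         vector uses an unsigned 16-bit type.  */
      uint64_t max_index = UINT16_MAX;       /* TYPE_MAX_VALUE            */
      uint64_t backedges = 70000;            /* max_loop_iterations bound */
      uint64_t iterations = backedges + 1;   /* convert to iterations     */

      /* Mirrors the wi::geu_p test above.  */
      if (iterations >= max_index)
        printf ("loop size is greater than data size -- reject\n");
      else
        printf ("condition reduction index fits -- ok\n");
      return 0;
    }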
8255 : :
8256 : : /* In case the vectorization factor (VF) is bigger than the number
8257 : : of elements that we can fit in a vectype (nunits), we have to generate
8258 : : more than one vector stmt - i.e - we need to "unroll" the
8259 : : vector stmt by a factor VF/nunits. For more details see documentation
8260 : : in vectorizable_operation. */
8261 : :
8262 : : /* If the reduction is used in an outer loop we need to generate
8263 : : VF intermediate results, like so (e.g. for ncopies=2):
8264 : : r0 = phi (init, r0)
8265 : : r1 = phi (init, r1)
8266 : : r0 = x0 + r0;
8267 : : r1 = x1 + r1;
8268 : : (i.e. we generate VF results in 2 registers).
8269 : : In this case we have a separate def-use cycle for each copy, and therefore
8270 : : for each copy we get the vector def for the reduction variable from the
8271 : : respective phi node created for this copy.
8272 : :
8273 : : Otherwise (the reduction is unused in the loop nest), we can combine
8274 : : together intermediate results, like so (e.g. for ncopies=2):
8275 : : r = phi (init, r)
8276 : : r = x0 + r;
8277 : : r = x1 + r;
8278 : : (i.e. we generate VF/2 results in a single register).
8279 : : In this case for each copy we get the vector def for the reduction variable
8280 : : from the vectorized reduction operation generated in the previous iteration.
8281 : :
8282 : : This only works when we see both the reduction PHI and its only consumer
8283 : : in vectorizable_reduction and there are no intermediate stmts
8284 : : participating. When unrolling we want each unrolled iteration to have its
8285 : : own reduction accumulator since one of the main goals of unrolling a
8286 : : reduction is to reduce the aggregate loop-carried latency. */
8287 : 51006 : if (ncopies > 1
8288 : 6818 : && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
8289 : 6596 : && SLP_TREE_LANES (slp_node) == 1
8290 : 6513 : && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
8291 : 6476 : && reduc_chain_length == 1
8292 : 57293 : && loop_vinfo->suggested_unroll_factor == 1)
8293 : 51006 : single_defuse_cycle = true;
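The two strategies described in the comment above correspond, roughly, to the following scalar shapes; a hedged C illustration (not the generated GIMPLE), with two scalar accumulators standing in for two vector copies:

    #include <stdio.h>

    #define N 16

    int main (void)
    {
      int x[N];
      for (int i = 0; i < N; i++)
        x[i] = i + 1;

      /* Separate accumulators: each copy has its own def-use cycle, so the
         loop-carried dependence chain is shorter; the partial sums are
         combined in the epilogue.  */
      int r0 = 0, r1 = 0;
      for (int i = 0; i < N; i += 2)
        {
          r0 += x[i];
          r1 += x[i + 1];
        }
      int separate = r0 + r1;

      /* Single def-use cycle: every copy feeds the same accumulator, so all
         additions form one chain through the loop body.  */
      int single = 0;
      for (int i = 0; i < N; i += 2)
        {
          single += x[i];
          single += x[i + 1];
        }

      printf ("separate = %d, single cycle = %d\n", separate, single);
      return 0;
    }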
8294 : :
8295 : 51006 : if (single_defuse_cycle && !lane_reducing)
8296 : : {
8297 : 5924 : gcc_assert (op.code != COND_EXPR);
8298 : :
8299 : : /* 4. check support for the operation in the loop
8300 : :
8301 : : This isn't necessary for the lane reduction codes, since they
8302 : : can only be produced by pattern matching, and it's up to the
8303 : : pattern matcher to test for support. The main reason for
8304 : : specifically skipping this step is to avoid rechecking whether
8305 : : mixed-sign dot-products can be implemented using signed
8306 : : dot-products. */
8307 : 5924 : machine_mode vec_mode = TYPE_MODE (vectype_in);
8308 : 5924 : if (!directly_supported_p (op.code, vectype_in, optab_vector))
8309 : : {
8310 : 2432 : if (dump_enabled_p ())
8311 : 18 : dump_printf (MSG_NOTE, "op not supported by target.\n");
8312 : 4928 : if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
8313 : 2432 : || !vect_can_vectorize_without_simd_p (op.code))
8314 : : single_defuse_cycle = false;
8315 : : else
8316 : 1281 : if (dump_enabled_p ())
8317 : 8 : dump_printf (MSG_NOTE, "proceeding using word mode.\n");
8318 : : }
8319 : :
8320 : 5924 : if (vect_emulated_vector_p (vectype_in)
8321 : 5924 : && !vect_can_vectorize_without_simd_p (op.code))
8322 : : {
8323 : 0 : if (dump_enabled_p ())
8324 : 0 : dump_printf (MSG_NOTE, "using word mode not possible.\n");
8325 : 0 : return false;
8326 : : }
8327 : : }
8328 : 51006 : if (dump_enabled_p () && single_defuse_cycle)
8329 : 812 : dump_printf_loc (MSG_NOTE, vect_location,
8330 : : "using single def-use cycle for reduction by reducing "
8331 : : "multiple vectors to one in the loop body\n");
8332 : 51006 : STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
8333 : :
8334 : : /* For a lane-reducing operation, the processing below related to the
8335 : : single def-use cycle is done in its own vectorizable_* function. Note
8336 : : also that such an operation must not be involved in a fold-left
8337 : : reduction. */
8338 : 51006 : single_defuse_cycle &= !lane_reducing;
8339 : :
8340 : 51006 : if (single_defuse_cycle || reduction_type == FOLD_LEFT_REDUCTION)
8341 : 28822 : for (i = 0; i < (int) op.num_ops; i++)
8342 : 19654 : if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
8343 : : {
8344 : 0 : if (dump_enabled_p ())
8345 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8346 : : "incompatible vector types for invariants\n");
8347 : 0 : return false;
8348 : : }
8349 : :
8350 : 51006 : vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
8351 : : reduction_type, ncopies, cost_vec);
8352 : : /* Cost the reduction op inside the loop if transformed via
8353 : : vect_transform_reduction for a non-lane-reducing operation. Otherwise
8354 : : this is costed by the separate vectorizable_* routines. */
8355 : 51006 : if (single_defuse_cycle)
8356 : 4773 : record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body);
8357 : :
8358 : 51006 : if (dump_enabled_p ()
8359 : 51006 : && reduction_type == FOLD_LEFT_REDUCTION)
8360 : 240 : dump_printf_loc (MSG_NOTE, vect_location,
8361 : : "using an in-order (fold-left) reduction.\n");
8362 : 51006 : STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
8363 : :
8364 : : /* All but single defuse-cycle optimized and fold-left reductions go
8365 : : through their own vectorizable_* routines. */
8366 : 51006 : if (!single_defuse_cycle && reduction_type != FOLD_LEFT_REDUCTION)
8367 : : {
8368 : 41838 : stmt_vec_info tem
8369 : 41838 : = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
8370 : 41838 : if (REDUC_GROUP_FIRST_ELEMENT (tem))
8371 : : {
8372 : 250 : gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
8373 : : tem = REDUC_GROUP_FIRST_ELEMENT (tem);
8374 : : }
8375 : 41838 : STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
8376 : 41838 : STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
8377 : : }
8378 : 9168 : else if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8379 : 4 : vect_reduction_update_partial_vector_usage (loop_vinfo, reduc_info,
8380 : : slp_node, op.code, op.type,
8381 : : vectype_in);
8382 : : return true;
8383 : : }
8384 : :
8385 : : /* STMT_INFO is a dot-product reduction whose multiplication operands
8386 : : have different signs. Emit a sequence to emulate the operation
8387 : : using a series of signed DOT_PROD_EXPRs and return the last
8388 : : statement generated. VEC_DEST is the result of the vector operation
8389 : : and VOP lists its inputs. */
8390 : :
8391 : : static gassign *
8392 : 2 : vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8393 : : gimple_stmt_iterator *gsi, tree vec_dest,
8394 : : tree vop[3])
8395 : : {
8396 : 2 : tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
8397 : 2 : tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
8398 : 2 : tree narrow_elttype = TREE_TYPE (narrow_vectype);
8399 : 2 : gimple *new_stmt;
8400 : :
8401 : : /* Make VOP[0] the unsigned operand VOP[1] the signed operand. */
8402 : 2 : if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
8403 : 0 : std::swap (vop[0], vop[1]);
8404 : :
8405 : : /* Convert all inputs to signed types. */
8406 : 8 : for (int i = 0; i < 3; ++i)
8407 : 6 : if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
8408 : : {
8409 : 2 : tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
8410 : 2 : new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
8411 : 2 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8412 : 2 : vop[i] = tmp;
8413 : : }
8414 : :
8415 : : /* In the comments below we assume 8-bit inputs for simplicity,
8416 : : but the approach works for any full integer type. */
8417 : :
8418 : : /* Create a vector of -128. */
8419 : 2 : tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
8420 : 2 : tree min_narrow = build_vector_from_val (narrow_vectype,
8421 : : min_narrow_elttype);
8422 : :
8423 : : /* Create a vector of 64. */
8424 : 2 : auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
8425 : 2 : tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
8426 : 2 : half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
8427 : :
8428 : : /* Emit: SUB_RES = VOP[0] - 128. */
8429 : 2 : tree sub_res = make_ssa_name (narrow_vectype);
8430 : 2 : new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
8431 : 2 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8432 : :
8433 : : /* Emit:
8434 : :
8435 : : STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
8436 : : STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
8437 : : STAGE3 = DOT_PROD_EXPR <SUB_RES, -128, STAGE2>;
8438 : :
8439 : : on the basis that x * y == (x - 128) * y + 64 * y + 64 * y
8440 : : Doing the two 64 * y steps first allows more time to compute x. */
8441 : 2 : tree stage1 = make_ssa_name (wide_vectype);
8442 : 2 : new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
8443 : : vop[1], half_narrow, vop[2]);
8444 : 2 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8445 : :
8446 : 2 : tree stage2 = make_ssa_name (wide_vectype);
8447 : 2 : new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
8448 : : vop[1], half_narrow, stage1);
8449 : 2 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8450 : :
8451 : 2 : tree stage3 = make_ssa_name (wide_vectype);
8452 : 2 : new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
8453 : : sub_res, vop[1], stage2);
8454 : 2 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8455 : :
8456 : : /* Convert STAGE3 to the reduction type. */
8457 : 2 : return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
8458 : 2 : }
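A quick, self-contained check of the identity the emulation relies on: with x the unsigned 8-bit operand and y the signed one, x*y == (x - 128)*y + 64*y + 64*y, and x - 128 always fits in a signed 8-bit value. This is illustration only, not part of the pass:

    #include <stdio.h>

    int main (void)
    {
      int mismatches = 0;

      /* x is the unsigned 8-bit multiplicand, y the signed one.  Rewriting
         x*y as (x - 128)*y + 64*y + 64*y keeps every multiplication to a
         signed 8-bit first operand, since x - 128 lies in -128..127.  */
      for (int x = 0; x <= 255; x++)
        for (int y = -128; y <= 127; y++)
          {
            signed char x_shifted = (signed char) (x - 128);
            int emulated = x_shifted * y + 64 * y + 64 * y;
            if (emulated != x * y)
              mismatches++;
          }

      if (mismatches)
        printf ("mismatches: %d\n", mismatches);
      else
        printf ("identity holds for all 8-bit inputs\n");
      return 0;
    }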
8459 : :
8460 : : /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
8461 : : value. */
8462 : :
8463 : : bool
8464 : 2297 : vect_transform_reduction (loop_vec_info loop_vinfo,
8465 : : stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
8466 : : slp_tree slp_node)
8467 : : {
8468 : 2297 : tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8469 : 2297 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8470 : 2297 : unsigned vec_num;
8471 : :
8472 : 2297 : stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8473 : 2297 : gcc_assert (reduc_info->is_reduc_info);
8474 : :
8475 : 2297 : if (nested_in_vect_loop_p (loop, stmt_info))
8476 : : {
8477 : 0 : loop = loop->inner;
8478 : 0 : gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
8479 : : }
8480 : :
8481 : 2297 : gimple_match_op op;
8482 : 2297 : if (!gimple_extract_op (stmt_info->stmt, &op))
8483 : 0 : gcc_unreachable ();
8484 : :
8485 : : /* All uses but the last are expected to be defined in the loop.
8486 : : The last use is the reduction variable. In case of a nested cycle this
8487 : : assumption is not true: we use reduc_index to record the index of the
8488 : : reduction variable. */
8489 : 2297 : stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
8490 : 2297 : gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
8491 : 2297 : int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
8492 : 2297 : tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (stmt_info);
8493 : :
8494 : 2297 : if (!vectype_in)
8495 : 2060 : vectype_in = STMT_VINFO_VECTYPE (stmt_info);
8496 : :
8497 : 2297 : vec_num = vect_get_num_copies (loop_vinfo, slp_node, vectype_in);
8498 : :
8499 : 2297 : code_helper code = canonicalize_code (op.code, op.type);
8500 : 2297 : internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
8501 : :
8502 : 2297 : vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8503 : 2297 : vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8504 : 2297 : bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
8505 : :
8506 : : /* Transform. */
8507 : 2297 : tree new_temp = NULL_TREE;
8508 : 16079 : auto_vec<tree> vec_oprnds[3];
8509 : :
8510 : 2297 : if (dump_enabled_p ())
8511 : 685 : dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
8512 : :
8513 : : /* A binary COND_OP reduction must use the same value for the reduction
8514 : : definition and the else operand. */
8515 : 2572 : bool cond_fn_p = code.is_internal_fn ()
8516 : 275 : && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
8517 : 275 : if (cond_fn_p)
8518 : : {
8519 : 275 : gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
8520 : : || code == IFN_COND_MUL || code == IFN_COND_AND
8521 : : || code == IFN_COND_IOR || code == IFN_COND_XOR
8522 : : || code == IFN_COND_MIN || code == IFN_COND_MAX);
8523 : 275 : gcc_assert (op.num_ops == 4
8524 : : && (op.ops[reduc_index]
8525 : : == op.ops[internal_fn_else_index ((internal_fn) code)]));
8526 : : }
8527 : :
8528 : 2297 : bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8529 : :
8530 : 2297 : vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
8531 : 2297 : if (reduction_type == FOLD_LEFT_REDUCTION)
8532 : : {
8533 : 834 : internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
8534 : 834 : gcc_assert (code.is_tree_code () || cond_fn_p);
8535 : 834 : return vectorize_fold_left_reduction
8536 : 834 : (loop_vinfo, stmt_info, gsi, slp_node, reduc_def_phi,
8537 : 834 : code, reduc_fn, op.num_ops, vectype_in,
8538 : 834 : reduc_index, masks, lens);
8539 : : }
8540 : :
8541 : 1463 : bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
8542 : 1463 : bool lane_reducing = lane_reducing_op_p (code);
8543 : 1226 : gcc_assert (single_defuse_cycle || lane_reducing);
8544 : :
8545 : 1463 : if (lane_reducing)
8546 : : {
8547 : : /* The last operand of a lane-reducing op is the reduction operand. */
8548 : 237 : gcc_assert (reduc_index == (int) op.num_ops - 1);
8549 : : }
8550 : :
8551 : : /* Create the destination vector */
8552 : 1463 : tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
8553 : 1463 : tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
8554 : :
8555 : : /* Get NCOPIES vector definitions for all operands except the reduction
8556 : : definition. */
8557 : 1463 : if (!cond_fn_p)
8558 : : {
8559 : 1195 : gcc_assert (reduc_index >= 0 && reduc_index <= 2);
8560 : 1973 : vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, 1,
8561 : 1195 : single_defuse_cycle && reduc_index == 0
8562 : : ? NULL_TREE : op.ops[0], &vec_oprnds[0],
8563 : 1195 : single_defuse_cycle && reduc_index == 1
8564 : : ? NULL_TREE : op.ops[1], &vec_oprnds[1],
8565 : 1195 : op.num_ops == 3
8566 : 237 : && !(single_defuse_cycle && reduc_index == 2)
8567 : : ? op.ops[2] : NULL_TREE, &vec_oprnds[2]);
8568 : : }
8569 : : else
8570 : : {
8571 : : /* For a conditional operation pass the truth type as mask
8572 : : vectype. */
8573 : 268 : gcc_assert (single_defuse_cycle
8574 : : && (reduc_index == 1 || reduc_index == 2));
8575 : 268 : vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, 1, op.ops[0],
8576 : : truth_type_for (vectype_in), &vec_oprnds[0],
8577 : : reduc_index == 1 ? NULL_TREE : op.ops[1],
8578 : : NULL_TREE, &vec_oprnds[1],
8579 : : reduc_index == 2 ? NULL_TREE : op.ops[2],
8580 : : NULL_TREE, &vec_oprnds[2]);
8581 : : }
8582 : :
8583 : : /* For single def-use cycles get one copy of the vectorized reduction
8584 : : definition. */
8585 : 1463 : if (single_defuse_cycle)
8586 : : {
8587 : 1415 : vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, 1,
8588 : : reduc_index == 0 ? op.ops[0] : NULL_TREE,
8589 : : &vec_oprnds[0],
8590 : : reduc_index == 1 ? op.ops[1] : NULL_TREE,
8591 : : &vec_oprnds[1],
8592 : : reduc_index == 2 ? op.ops[2] : NULL_TREE,
8593 : : &vec_oprnds[2]);
8594 : : }
8595 : 48 : else if (lane_reducing)
8596 : : {
8597 : : /* For a normal reduction, consistency between the vectorized def/use is
8598 : : naturally ensured when mapping from the scalar statement. But if a
8599 : : lane-reducing op is involved in the reduction, things become somewhat
8600 : : complicated in that the op's result and its accumulation operand are
8601 : : limited to fewer lanes than the other operands, which causes a def/use
8602 : : mismatch on the adjacent statements around the op unless some specific
8603 : : adjustment is made. One approach is to refit the lane-reducing op by
8604 : : introducing new trivial pass-through copies that fill the possible
8605 : : def/use gap, so that it behaves like a normal op. Vector reduction
8606 : : PHIs are always generated to the full extent, whether or not a
8607 : : lane-reducing op exists. If some copies or PHIs turn out to be
8608 : : superfluous, they are cleaned up by passes after vectorization. An
8609 : : example for single-lane SLP, with lane-reducing ops of mixed input
8610 : : vectypes in a reduction chain, is given below. The same handling
8611 : : applies to multiple-lane SLP as well.
8612 : :
8613 : : int sum = 1;
8614 : : for (i)
8615 : : {
8616 : : sum += d0[i] * d1[i]; // dot-prod <vector(16) char>
8617 : : sum += w[i]; // widen-sum <vector(16) char>
8618 : : sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
8619 : : sum += n[i]; // normal <vector(4) int>
8620 : : }
8621 : :
8622 : : The vector size is 128-bit and the vectorization factor is 16. Reduction
8623 : : statements would be transformed as:
8624 : :
8625 : : vector<4> int sum_v0 = { 0, 0, 0, 1 };
8626 : : vector<4> int sum_v1 = { 0, 0, 0, 0 };
8627 : : vector<4> int sum_v2 = { 0, 0, 0, 0 };
8628 : : vector<4> int sum_v3 = { 0, 0, 0, 0 };
8629 : :
8630 : : for (i / 16)
8631 : : {
8632 : : sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
8633 : : sum_v1 = sum_v1; // copy
8634 : : sum_v2 = sum_v2; // copy
8635 : : sum_v3 = sum_v3; // copy
8636 : :
8637 : : sum_v0 = sum_v0; // copy
8638 : : sum_v1 = WIDEN_SUM (w_v1[i: 0 ~ 15], sum_v1);
8639 : : sum_v2 = sum_v2; // copy
8640 : : sum_v3 = sum_v3; // copy
8641 : :
8642 : : sum_v0 = sum_v0; // copy
8643 : : sum_v1 = SAD (s0_v1[i: 0 ~ 7 ], s1_v1[i: 0 ~ 7 ], sum_v1);
8644 : : sum_v2 = SAD (s0_v2[i: 8 ~ 15], s1_v2[i: 8 ~ 15], sum_v2);
8645 : : sum_v3 = sum_v3; // copy
8646 : :
8647 : : sum_v0 += n_v0[i: 0 ~ 3 ];
8648 : : sum_v1 += n_v1[i: 4 ~ 7 ];
8649 : : sum_v2 += n_v2[i: 8 ~ 11];
8650 : : sum_v3 += n_v3[i: 12 ~ 15];
8651 : : }
8652 : :
8653 : : Moreover, to obtain higher instruction parallelism in the final
8654 : : vectorized loop, the effective vector lane-reducing ops are
8655 : : distributed evenly among all def-use cycles. In the above example,
8656 : : DOT_PROD, WIDEN_SUM and the SADs are generated into disparate cycles,
8657 : : so the instruction dependencies among them can be eliminated. */
8658 : 48 : unsigned effec_ncopies = vec_oprnds[0].length ();
8659 : 48 : unsigned total_ncopies = vec_oprnds[reduc_index].length ();
8660 : :
8661 : 48 : gcc_assert (effec_ncopies <= total_ncopies);
8662 : :
8663 : 48 : if (effec_ncopies < total_ncopies)
8664 : : {
8665 : 144 : for (unsigned i = 0; i < op.num_ops - 1; i++)
8666 : : {
8667 : 192 : gcc_assert (vec_oprnds[i].length () == effec_ncopies);
8668 : 96 : vec_oprnds[i].safe_grow_cleared (total_ncopies);
8669 : : }
8670 : : }
8671 : :
8672 : 48 : tree reduc_vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8673 : 48 : gcc_assert (reduc_vectype_in);
8674 : :
8675 : 48 : unsigned effec_reduc_ncopies
8676 : 48 : = vect_get_num_copies (loop_vinfo, slp_node, reduc_vectype_in);
8677 : :
8678 : 48 : gcc_assert (effec_ncopies <= effec_reduc_ncopies);
8679 : :
8680 : 48 : if (effec_ncopies < effec_reduc_ncopies)
8681 : : {
8682 : : /* Find suitable def-use cycles to generate vectorized statements
8683 : : into, and reorder operands based on the selection. */
8684 : 25 : unsigned curr_pos = reduc_info->reduc_result_pos;
8685 : 25 : unsigned next_pos = (curr_pos + effec_ncopies) % effec_reduc_ncopies;
8686 : :
8687 : 25 : gcc_assert (curr_pos < effec_reduc_ncopies);
8688 : 25 : reduc_info->reduc_result_pos = next_pos;
8689 : :
8690 : 25 : if (curr_pos)
8691 : : {
8692 : 14 : unsigned count = effec_reduc_ncopies - effec_ncopies;
8693 : 14 : unsigned start = curr_pos - count;
8694 : :
8695 : 14 : if ((int) start < 0)
8696 : : {
8697 : 11 : count = curr_pos;
8698 : 11 : start = 0;
8699 : : }
8700 : :
8701 : 42 : for (unsigned i = 0; i < op.num_ops - 1; i++)
8702 : : {
8703 : 68 : for (unsigned j = effec_ncopies; j > start; j--)
8704 : : {
8705 : 40 : unsigned k = j - 1;
8706 : 40 : std::swap (vec_oprnds[i][k], vec_oprnds[i][k + count]);
8707 : 40 : gcc_assert (!vec_oprnds[i][k]);
8708 : : }
8709 : : }
8710 : : }
8711 : : }
8712 : : }
8713 : :
8714 : 1463 : bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (stmt_info);
8715 : 2433 : unsigned num = vec_oprnds[reduc_index == 0 ? 1 : 0].length ();
8716 : 1463 : unsigned mask_index = 0;
8717 : :
8718 : 6001 : for (unsigned i = 0; i < num; ++i)
8719 : : {
8720 : 4538 : gimple *new_stmt;
8721 : 4538 : tree vop[3] = { vec_oprnds[0][i], vec_oprnds[1][i], NULL_TREE };
8722 : 4538 : if (!vop[0] || !vop[1])
8723 : : {
8724 : 168 : tree reduc_vop = vec_oprnds[reduc_index][i];
8725 : :
8726 : : /* If we could not generate an effective vector statement for the
8727 : : current portion of the reduction operand, insert a trivial copy to
8728 : : simply hand the operand over to the dependent statements. */
8729 : 168 : gcc_assert (reduc_vop);
8730 : :
8731 : 168 : if (TREE_CODE (reduc_vop) == SSA_NAME
8732 : 168 : && !SSA_NAME_IS_DEFAULT_DEF (reduc_vop))
8733 : 168 : new_stmt = SSA_NAME_DEF_STMT (reduc_vop);
8734 : : else
8735 : : {
8736 : 0 : new_temp = make_ssa_name (vec_dest);
8737 : 0 : new_stmt = gimple_build_assign (new_temp, reduc_vop);
8738 : 0 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt,
8739 : : gsi);
8740 : : }
8741 : : }
8742 : 4370 : else if (masked_loop_p && !mask_by_cond_expr)
8743 : : {
8744 : : /* No conditional ifns have been defined for lane-reducing ops
8745 : : yet. */
8746 : 8 : gcc_assert (!lane_reducing);
8747 : :
8748 : : /* Make sure that the reduction accumulator is vop[0]. */
8749 : 8 : if (reduc_index == 1)
8750 : : {
8751 : 8 : gcc_assert (commutative_binary_op_p (code, op.type));
8752 : 8 : std::swap (vop[0], vop[1]);
8753 : : }
8754 : 8 : tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8755 : : vec_num, vectype_in,
8756 : : mask_index++);
8757 : 8 : gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
8758 : : vop[0], vop[1], vop[0]);
8759 : 8 : new_temp = make_ssa_name (vec_dest, call);
8760 : 8 : gimple_call_set_lhs (call, new_temp);
8761 : 8 : gimple_call_set_nothrow (call, true);
8762 : 8 : vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
8763 : 8 : new_stmt = call;
8764 : : }
8765 : : else
8766 : : {
8767 : 4362 : if (op.num_ops >= 3)
8768 : 988 : vop[2] = vec_oprnds[2][i];
8769 : :
8770 : 4362 : if (masked_loop_p && mask_by_cond_expr)
8771 : : {
8772 : 4 : tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8773 : : vec_num, vectype_in,
8774 : : mask_index++);
8775 : 4 : build_vect_cond_expr (code, vop, mask, gsi);
8776 : : }
8777 : :
8778 : 4362 : if (emulated_mixed_dot_prod)
8779 : 2 : new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
8780 : : vec_dest, vop);
8781 : :
8782 : 5074 : else if (code.is_internal_fn () && !cond_fn_p)
8783 : 0 : new_stmt = gimple_build_call_internal (internal_fn (code),
8784 : : op.num_ops,
8785 : : vop[0], vop[1], vop[2]);
8786 : 5074 : else if (code.is_internal_fn () && cond_fn_p)
8787 : 714 : new_stmt = gimple_build_call_internal (internal_fn (code),
8788 : : op.num_ops,
8789 : : vop[0], vop[1], vop[2],
8790 : : vop[reduc_index]);
8791 : : else
8792 : 3646 : new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
8793 : : vop[0], vop[1], vop[2]);
8794 : 4362 : new_temp = make_ssa_name (vec_dest, new_stmt);
8795 : 4362 : gimple_set_lhs (new_stmt, new_temp);
8796 : 4362 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8797 : : }
8798 : :
8799 : 4538 : if (single_defuse_cycle && i < num - 1)
8800 : 2883 : vec_oprnds[reduc_index].safe_push (gimple_get_lhs (new_stmt));
8801 : : else
8802 : 1655 : slp_node->push_vec_def (new_stmt);
8803 : : }
8804 : :
8805 : : return true;
8806 : 9188 : }
8807 : :
8808 : : /* Transform phase of a cycle PHI. */
8809 : :
8810 : : bool
8811 : 22360 : vect_transform_cycle_phi (loop_vec_info loop_vinfo,
8812 : : stmt_vec_info stmt_info,
8813 : : slp_tree slp_node, slp_instance slp_node_instance)
8814 : : {
8815 : 22360 : tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8816 : 22360 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8817 : 22360 : int i;
8818 : 22360 : bool nested_cycle = false;
8819 : 22360 : int vec_num;
8820 : :
8821 : 22459 : if (nested_in_vect_loop_p (loop, stmt_info))
8822 : : {
8823 : : loop = loop->inner;
8824 : : nested_cycle = true;
8825 : : }
8826 : :
8827 : 22360 : stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
8828 : 22360 : reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
8829 : 22360 : stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8830 : 22360 : gcc_assert (reduc_info->is_reduc_info);
8831 : :
8832 : 22360 : if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
8833 : 22360 : || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
8834 : : /* Leave the scalar phi in place. */
8835 : : return true;
8836 : :
8837 : 21526 : vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8838 : :
8839 : : /* Check whether we should use a single PHI node and accumulate
8840 : : vectors to one before the backedge. */
8841 : 21526 : if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
8842 : 1415 : vec_num = 1;
8843 : :
8844 : : /* Create the destination vector */
8845 : 21526 : gphi *phi = as_a <gphi *> (stmt_info->stmt);
8846 : 21526 : tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
8847 : : vectype_out);
8848 : :
8849 : : /* Get the loop-entry arguments. */
8850 : 21526 : tree vec_initial_def = NULL_TREE;
8851 : 21526 : auto_vec<tree> vec_initial_defs;
8852 : 21526 : vec_initial_defs.reserve (vec_num);
8853 : : /* Optimize: for REDUC_MAX, if initial_def is smaller than the base
8854 : : value and we can't use zero for induc_val, use initial_def instead.
8855 : : Similarly for REDUC_MIN when initial_def is larger than the base. */
8856 : 21526 : if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
8857 : : {
8858 : 66 : gcc_assert (SLP_TREE_LANES (slp_node) == 1);
8859 : 66 : tree initial_def = vect_phi_initial_value (phi);
8860 : 66 : reduc_info->reduc_initial_values.safe_push (initial_def);
8861 : 66 : tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
8862 : 66 : if (TREE_CODE (initial_def) == INTEGER_CST
8863 : 64 : && !integer_zerop (induc_val)
8864 : 130 : && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
8865 : 44 : && tree_int_cst_lt (initial_def, induc_val))
8866 : 61 : || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
8867 : 20 : && tree_int_cst_lt (induc_val, initial_def))))
8868 : : {
8869 : 3 : induc_val = initial_def;
8870 : : /* Communicate to epilogue generation that we used
8871 : : the initial_def. */
8872 : 3 : STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
8873 : : }
8874 : 66 : vec_initial_defs.quick_push
8875 : 66 : (build_vector_from_val (vectype_out, induc_val));
8876 : : }
8877 : 21460 : else if (nested_cycle)
8878 : : {
8879 : 418 : unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
8880 : 418 : vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
8881 : : &vec_initial_defs);
8882 : : }
8883 : : else
8884 : : {
8885 : 21042 : gcc_assert (slp_node == slp_node_instance->reduc_phis);
8886 : 21042 : vec<tree> &initial_values = reduc_info->reduc_initial_values;
8887 : 21042 : vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
8888 : :
8889 : 21042 : unsigned int num_phis = stmts.length ();
8890 : 21042 : if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
8891 : 139 : num_phis = 1;
8892 : 21042 : initial_values.reserve (num_phis);
8893 : 42379 : for (unsigned int i = 0; i < num_phis; ++i)
8894 : : {
8895 : 21337 : gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
8896 : 21337 : initial_values.quick_push (vect_phi_initial_value (this_phi));
8897 : : }
8898 : 21042 : if (vec_num == 1)
8899 : 20869 : vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8900 : 21042 : if (!initial_values.is_empty ())
8901 : : {
8902 : 20861 : tree initial_value
8903 : 41555 : = (num_phis == 1 ? initial_values[0] : NULL_TREE);
8904 : 20861 : code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8905 : 20861 : tree neutral_op
8906 : 20861 : = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8907 : : code, initial_value);
8908 : : /* Try to simplify the vector initialization by applying an
8909 : : adjustment after the reduction has been performed. This
8910 : : can also break a critical path but on the other hand
8911 : : requires keeping the initial value live across the loop. */
8912 : 20861 : if (neutral_op
8913 : 20774 : && initial_values.length () == 1
8914 : 20621 : && !reduc_info->reused_accumulator
8915 : 16751 : && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8916 : 37546 : && !operand_equal_p (neutral_op, initial_values[0]))
8917 : : {
8918 : 12032 : STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
8919 : 12032 : = initial_values[0];
8920 : 12032 : initial_values[0] = neutral_op;
8921 : : }
8922 : 41722 : get_initial_defs_for_reduction (loop_vinfo, reduc_info,
8923 : : &vec_initial_defs, vec_num,
8924 : : stmts.length (), neutral_op);
8925 : : }
8926 : : }
8927 : :
8928 : 21526 : if (vec_initial_def)
8929 : : {
8930 : 0 : vec_initial_defs.create (1);
8931 : 0 : vec_initial_defs.quick_push (vec_initial_def);
8932 : : }
8933 : :
8934 : 21526 : if (auto *accumulator = reduc_info->reused_accumulator)
8935 : : {
8936 : 4063 : tree def = accumulator->reduc_input;
8937 : 4063 : if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8938 : : {
8939 : 4061 : unsigned int nreduc;
8940 : 8122 : bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
8941 : 4061 : (TREE_TYPE (def)),
8942 : 4061 : TYPE_VECTOR_SUBPARTS (vectype_out),
8943 : : &nreduc);
8944 : 0 : gcc_assert (res);
8945 : 4061 : gimple_seq stmts = NULL;
8946 : : /* Reduce the single vector to a smaller one. */
8947 : 4061 : if (nreduc != 1)
8948 : : {
8949 : : /* Perform the reduction in the appropriate type. */
8950 : 4061 : tree rvectype = vectype_out;
8951 : 4061 : if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
8952 : 4061 : TREE_TYPE (TREE_TYPE (def))))
8953 : 221 : rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
8954 : : TYPE_VECTOR_SUBPARTS
8955 : 442 : (vectype_out));
8956 : 4061 : def = vect_create_partial_epilog (def, rvectype,
8957 : : STMT_VINFO_REDUC_CODE
8958 : : (reduc_info),
8959 : : &stmts);
8960 : : }
8961 : : /* The epilogue loop might use a different vector mode, like
8962 : : VNx2DI vs. V2DI. */
8963 : 4061 : if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
8964 : : {
8965 : 0 : tree reduc_type = build_vector_type_for_mode
8966 : 0 : (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
8967 : 0 : def = gimple_convert (&stmts, reduc_type, def);
8968 : : }
8969 : : /* Adjust the input so we pick up the partially reduced value
8970 : : for the skip edge in vect_create_epilog_for_reduction. */
8971 : 4061 : accumulator->reduc_input = def;
8972 : : /* And the reduction could be carried out using a different sign. */
8973 : 4061 : if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8974 : 221 : def = gimple_convert (&stmts, vectype_out, def);
8975 : 4061 : edge e;
8976 : 4061 : if ((e = loop_vinfo->main_loop_edge)
8977 : 4061 : || (e = loop_vinfo->skip_this_loop_edge))
8978 : : {
8979 : : /* While we'd like to insert on the edge, this would split
8980 : : blocks and disturb bookkeeping, and we will eventually
8981 : : need this on the skip edge anyway. Rely on sinking to
8982 : : fix up the optimal placement and insert in the pred. */
8983 : 3880 : gimple_stmt_iterator gsi = gsi_last_bb (e->src);
8984 : : /* Insert before a cond that eventually skips the
8985 : : epilogue. */
8986 : 3880 : if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
8987 : 3867 : gsi_prev (&gsi);
8988 : 3880 : gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
8989 : : }
8990 : : else
8991 : 181 : gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
8992 : : stmts);
8993 : : }
8994 : 4063 : if (loop_vinfo->main_loop_edge)
8995 : 3882 : vec_initial_defs[0]
8996 : 3882 : = vect_get_main_loop_result (loop_vinfo, def,
8997 : 3882 : vec_initial_defs[0]);
8998 : : else
8999 : 181 : vec_initial_defs.safe_push (def);
9000 : : }
9001 : :
9002 : : /* Generate the reduction PHIs upfront. */
9003 : 43716 : for (i = 0; i < vec_num; i++)
9004 : : {
9005 : 22190 : tree vec_init_def = vec_initial_defs[i];
9006 : : /* Create the reduction-phi that defines the reduction
9007 : : operand. */
9008 : 22190 : gphi *new_phi = create_phi_node (vec_dest, loop->header);
9009 : 22190 : add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
9010 : : UNKNOWN_LOCATION);
9011 : :
9012 : : /* The loop-latch arg is set in epilogue processing. */
9013 : :
9014 : 22190 : slp_node->push_vec_def (new_phi);
9015 : : }
9016 : :
9017 : 21526 : return true;
9018 : 21526 : }
9019 : :
9020 : : /* Vectorizes LC PHIs. */
9021 : :
9022 : : bool
9023 : 168983 : vectorizable_lc_phi (loop_vec_info loop_vinfo,
9024 : : stmt_vec_info stmt_info,
9025 : : slp_tree slp_node)
9026 : : {
9027 : 168983 : if (!loop_vinfo
9028 : 168983 : || !is_a <gphi *> (stmt_info->stmt)
9029 : 202999 : || gimple_phi_num_args (stmt_info->stmt) != 1)
9030 : : return false;
9031 : :
9032 : 837 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
9033 : 109 : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
9034 : : return false;
9035 : :
9036 : : /* Deal with copies from externs or constants that disguise as
9037 : : loop-closed PHI nodes (PR97886). */
9038 : 837 : if (!vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
9039 : : SLP_TREE_VECTYPE (slp_node)))
9040 : : {
9041 : 0 : if (dump_enabled_p ())
9042 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9043 : : "incompatible vector types for invariants\n");
9044 : 0 : return false;
9045 : : }
9046 : 837 : STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
9047 : 837 : return true;
9048 : : }
9049 : :
9050 : : bool
9051 : 348 : vect_transform_lc_phi (loop_vec_info loop_vinfo,
9052 : : stmt_vec_info stmt_info,
9053 : : slp_tree slp_node)
9054 : : {
9055 : :
9056 : 348 : tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9057 : 348 : tree scalar_dest = gimple_phi_result (stmt_info->stmt);
9058 : 348 : basic_block bb = gimple_bb (stmt_info->stmt);
9059 : 348 : edge e = single_pred_edge (bb);
9060 : 348 : tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
9061 : 348 : auto_vec<tree> vec_oprnds;
9062 : 696 : vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, 1,
9063 : 348 : gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
9064 : 760 : for (unsigned i = 0; i < vec_oprnds.length (); i++)
9065 : : {
9066 : : /* Create the vectorized LC PHI node. */
9067 : 412 : gphi *new_phi = create_phi_node (vec_dest, bb);
9068 : 412 : add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
9069 : 412 : slp_node->push_vec_def (new_phi);
9070 : : }
9071 : :
9072 : 348 : return true;
9073 : 348 : }
9074 : :
9075 : : /* Vectorizes PHIs. */
9076 : :
9077 : : bool
9078 : 166874 : vectorizable_phi (vec_info *,
9079 : : stmt_vec_info stmt_info, gimple **vec_stmt,
9080 : : slp_tree slp_node, stmt_vector_for_cost *cost_vec)
9081 : : {
9082 : 166874 : if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
9083 : : return false;
9084 : :
9085 : 81142 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
9086 : : return false;
9087 : :
9088 : 81142 : tree vectype = SLP_TREE_VECTYPE (slp_node);
9089 : :
9090 : 81142 : if (!vec_stmt) /* transformation not required. */
9091 : : {
9092 : : slp_tree child;
9093 : : unsigned i;
9094 : 211963 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
9095 : 144089 : if (!child)
9096 : : {
9097 : 0 : if (dump_enabled_p ())
9098 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9099 : : "PHI node with unvectorized backedge def\n");
9100 : 0 : return false;
9101 : : }
9102 : 144089 : else if (!vect_maybe_update_slp_op_vectype (child, vectype))
9103 : : {
9104 : 25 : if (dump_enabled_p ())
9105 : 2 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9106 : : "incompatible vector types for invariants\n");
9107 : 25 : return false;
9108 : : }
9109 : 144064 : else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
9110 : 144064 : && !useless_type_conversion_p (vectype,
9111 : : SLP_TREE_VECTYPE (child)))
9112 : : {
9113 : : /* With bools we can have mask and non-mask precision vectors
9114 : : or different non-mask precisions. While pattern recognition is
9115 : : supposed to guarantee consistency here, bugs in it can cause
9116 : : mismatches (PR103489 and PR103800 for example).
9117 : : Deal with them here instead of ICEing later. */
9118 : 16 : if (dump_enabled_p ())
9119 : 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9120 : : "incompatible vector type setup from "
9121 : : "bool pattern detection\n");
9122 : 16 : return false;
9123 : : }
9124 : :
9125 : : /* For single-argument PHIs assume coalescing which means zero cost
9126 : : for the scalar and the vector PHIs. This avoids artificially
9127 : : favoring the vector path (but may pessimize it in some cases). */
9128 : 67874 : if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
9129 : 56865 : record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
9130 : : vector_stmt, stmt_info, vectype, 0, vect_body);
9131 : 67874 : STMT_VINFO_TYPE (stmt_info) = phi_info_type;
9132 : 67874 : return true;
9133 : : }
9134 : :
9135 : 13227 : tree scalar_dest = gimple_phi_result (stmt_info->stmt);
9136 : 13227 : basic_block bb = gimple_bb (stmt_info->stmt);
9137 : 13227 : tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
9138 : 13227 : auto_vec<gphi *> new_phis;
9139 : 44830 : for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
9140 : : {
9141 : 31603 : slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
9142 : :
9143 : : /* Skip not yet vectorized defs. */
9144 : 31985 : if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
9145 : 31603 : && SLP_TREE_VEC_DEFS (child).is_empty ())
9146 : 382 : continue;
9147 : :
9148 : 31221 : auto_vec<tree> vec_oprnds;
9149 : 31221 : vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
9150 : 31221 : if (!new_phis.exists ())
9151 : : {
9152 : 13227 : new_phis.create (vec_oprnds.length ());
9153 : 28036 : for (unsigned j = 0; j < vec_oprnds.length (); j++)
9154 : : {
9155 : : /* Create the vectorized LC PHI node. */
9156 : 14809 : new_phis.quick_push (create_phi_node (vec_dest, bb));
9157 : 14809 : slp_node->push_vec_def (new_phis[j]);
9158 : : }
9159 : : }
9160 : 31221 : edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
9161 : 67577 : for (unsigned j = 0; j < vec_oprnds.length (); j++)
9162 : 36356 : add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
9163 : 31221 : }
9164 : : /* We should have at least one already vectorized child. */
9165 : 13227 : gcc_assert (new_phis.exists ());
9166 : :
9167 : 13227 : return true;
9168 : 13227 : }
9169 : :
9170 : : /* Vectorizes first order recurrences. An overview of the transformation
9171 : : is described below. Suppose we have the following loop.
9172 : :
9173 : : int t = 0;
9174 : : for (int i = 0; i < n; ++i)
9175 : : {
9176 : : b[i] = a[i] - t;
9177 : : t = a[i];
9178 : : }
9179 : :
9180 : : There is a first-order recurrence on 'a'. For this loop, the scalar IR
9181 : : looks (simplified) like:
9182 : :
9183 : : scalar.preheader:
9184 : : init = 0;
9185 : :
9186 : : scalar.body:
9187 : : i = PHI <0(scalar.preheader), i+1(scalar.body)>
9188 : : _2 = PHI <init(scalar.preheader), _1(scalar.body)>
9189 : : _1 = a[i]
9190 : : b[i] = _1 - _2
9191 : : if (i < n) goto scalar.body
9192 : :
9193 : : In this example, _2 is a recurrence because its value depends on the
9194 : : previous iteration. We vectorize this as (VF = 4)
9195 : :
9196 : : vector.preheader:
9197 : : vect_init = vect_cst(..., ..., ..., 0)
9198 : :
9199 : : vector.body
9200 : : i = PHI <0(vector.preheader), i+4(vector.body)>
9201 : : vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
9202 : : vect_2 = a[i, i+1, i+2, i+3];
9203 : : vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
9204 : : b[i, i+1, i+2, i+3] = vect_2 - vect_3
9205 : : if (..) goto vector.body
9206 : :
9207 : : In this function, vectorizable_recurr, we code generate both the
9208 : : vector PHI node and the permute since those together compute the
9209 : : vectorized value of the scalar PHI. We do not yet have the
9210 : : backedge value to fill in there nor into the vec_perm. Those
9211 : : are filled in vect_schedule_scc.
9212 : :
9213 : : TODO: Since the scalar loop does not have a use of the recurrence
9214 : : outside of the loop, the natural way to implement peeling via
9215 : : vectorizing the live value doesn't work. For now peeling of loops
9216 : : with a recurrence is not implemented. For SLP the supported cases
9217 : : are restricted to those requiring a single vector recurrence PHI. */
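For VF = 4 the permute indices { 3, 4, 5, 6 } select from the concatenation of the previous vector and the newly loaded one, so each lane receives the value from the previous iteration. A hedged scalar model of that permute (illustration only, not GCC internals):

    #include <stdio.h>

    #define VF 4

    int main (void)
    {
      int a[2 * VF] = { 10, 11, 12, 13, 14, 15, 16, 17 };
      int prev[VF] = { 0, 0, 0, 0 };       /* vect_init: t starts at 0 */

      for (int i = 0; i < 2 * VF; i += VF)
        {
          int cur[VF], concat[2 * VF], shifted[VF];

          for (int j = 0; j < VF; j++)
            cur[j] = a[i + j];             /* vect_2 = a[i .. i+3]     */

          /* vect_3 = vec_perm (prev, cur, { 3, 4, 5, 6 }): index k picks
             element k of the concatenation { prev[0..3], cur[0..3] }.  */
          for (int j = 0; j < VF; j++)
            {
              concat[j] = prev[j];
              concat[VF + j] = cur[j];
            }
          for (int j = 0; j < VF; j++)
            shifted[j] = concat[VF - 1 + j];

          for (int j = 0; j < VF; j++)
            printf ("b[%d] = %d - %d\n", i + j, cur[j], shifted[j]);

          for (int j = 0; j < VF; j++)
            prev[j] = cur[j];              /* backedge value           */
        }
      return 0;
    }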
9218 : :
9219 : : bool
9220 : 168178 : vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9221 : : gimple **vec_stmt, slp_tree slp_node,
9222 : : stmt_vector_for_cost *cost_vec)
9223 : : {
9224 : 168178 : if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
9225 : : return false;
9226 : :
9227 : 33211 : gphi *phi = as_a<gphi *> (stmt_info->stmt);
9228 : :
9229 : : /* So far we only support first-order recurrence auto-vectorization. */
9230 : 33211 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
9231 : : return false;
9232 : :
9233 : 262 : tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9234 : 262 : unsigned ncopies;
9235 : 262 : if (slp_node)
9236 : 262 : ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9237 : : else
9238 : 0 : ncopies = vect_get_num_copies (loop_vinfo, vectype);
9239 : 262 : poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9240 : 262 : unsigned dist = slp_node ? SLP_TREE_LANES (slp_node) : 1;
9241 : : /* We need to be able to make progress with a single vector. */
9242 : 262 : if (maybe_gt (dist * 2, nunits))
9243 : : {
9244 : 0 : if (dump_enabled_p ())
9245 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9246 : : "first order recurrence exceeds half of "
9247 : : "a vector\n");
9248 : 0 : return false;
9249 : : }
9250 : :
9251 : : /* First-order recurrence autovectorization needs to handle permutation
9252 : : with indices = [nunits-1, nunits, nunits+1, ...]. */
9253 : 262 : vec_perm_builder sel (nunits, 1, 3);
9254 : 1048 : for (int i = 0; i < 3; ++i)
9255 : 786 : sel.quick_push (nunits - dist + i);
9256 : 262 : vec_perm_indices indices (sel, 2, nunits);
9257 : :
9258 : 262 : if (!vec_stmt) /* transformation not required. */
9259 : : {
9260 : 230 : if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
9261 : : indices))
9262 : : return false;
9263 : :
9264 : 206 : if (slp_node)
9265 : : {
9266 : : /* We eventually need to set a vector type on invariant
9267 : : arguments. */
9268 : : unsigned j;
9269 : : slp_tree child;
9270 : 618 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9271 : 412 : if (!vect_maybe_update_slp_op_vectype
9272 : 412 : (child, SLP_TREE_VECTYPE (slp_node)))
9273 : : {
9274 : 0 : if (dump_enabled_p ())
9275 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9276 : : "incompatible vector types for "
9277 : : "invariants\n");
9278 : 0 : return false;
9279 : : }
9280 : : }
9281 : :
9282 : : /* Verify we have set up compatible types. */
9283 : 206 : edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
9284 : 206 : tree latch_vectype = NULL_TREE;
9285 : 206 : if (slp_node)
9286 : : {
9287 : 206 : slp_tree latch_def = SLP_TREE_CHILDREN (slp_node)[le->dest_idx];
9288 : 206 : latch_vectype = SLP_TREE_VECTYPE (latch_def);
9289 : : }
9290 : : else
9291 : : {
9292 : 0 : tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, le);
9293 : 0 : if (TREE_CODE (latch_def) == SSA_NAME)
9294 : : {
9295 : 0 : stmt_vec_info latch_def_info = loop_vinfo->lookup_def (latch_def);
9296 : 0 : latch_def_info = vect_stmt_to_vectorize (latch_def_info);
9297 : 0 : latch_vectype = STMT_VINFO_VECTYPE (latch_def_info);
9298 : : }
9299 : : }
9300 : 206 : if (!types_compatible_p (latch_vectype, vectype))
9301 : : return false;
9302 : :
9303 : : /* The recurrence costs the initialization vector and one permute
9304 : : for each copy. With SLP the prologue value is explicitly
9305 : : represented and costed separately. */
9306 : 198 : unsigned prologue_cost = 0;
9307 : 198 : if (!slp_node)
9308 : 0 : prologue_cost = record_stmt_cost (cost_vec, 1, scalar_to_vec,
9309 : : stmt_info, 0, vect_prologue);
9310 : 198 : unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9311 : : stmt_info, 0, vect_body);
9312 : 198 : if (dump_enabled_p ())
9313 : 42 : dump_printf_loc (MSG_NOTE, vect_location,
9314 : : "vectorizable_recurr: inside_cost = %d, "
9315 : : "prologue_cost = %d .\n", inside_cost,
9316 : : prologue_cost);
9317 : :
9318 : 198 : STMT_VINFO_TYPE (stmt_info) = recurr_info_type;
9319 : 198 : return true;
9320 : : }
9321 : :
9322 : 32 : edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
9323 : 32 : basic_block bb = gimple_bb (phi);
9324 : 32 : tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
9325 : 32 : if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader)))
9326 : : {
9327 : 2 : gimple_seq stmts = NULL;
9328 : 2 : preheader = gimple_convert (&stmts, TREE_TYPE (vectype), preheader);
9329 : 2 : gsi_insert_seq_on_edge_immediate (pe, stmts);
9330 : : }
9331 : 32 : tree vec_init = build_vector_from_val (vectype, preheader);
9332 : 32 : vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
9333 : :
9334 : : /* Create the vectorized first-order PHI node. */
9335 : 32 : tree vec_dest = vect_get_new_vect_var (vectype,
9336 : : vect_simple_var, "vec_recur_");
9337 : 32 : gphi *new_phi = create_phi_node (vec_dest, bb);
9338 : 32 : add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
9339 : :
9340 : : /* Insert the shuffles for the first-order recurrence autovectorization:
9341 : : result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>. */
9342 : 32 : tree perm = vect_gen_perm_mask_checked (vectype, indices);
9343 : :
9344 : : /* Insert the required permute after the latch definition. The
9345 : : second and later operands are tentative and will be updated when we have
9346 : : vectorized the latch definition. */
9347 : 32 : edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
9348 : 32 : gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
9349 : 32 : gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
9350 : 32 : gsi_next (&gsi2);
9351 : :
9352 : 84 : for (unsigned i = 0; i < ncopies; ++i)
9353 : : {
9354 : 52 : vec_dest = make_ssa_name (vectype);
9355 : 52 : gassign *vperm
9356 : 84 : = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
9357 : 32 : i == 0 ? gimple_phi_result (new_phi) : NULL,
9358 : : NULL, perm);
9359 : 52 : vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
9360 : :
9361 : 52 : if (slp_node)
9362 : 52 : slp_node->push_vec_def (vperm);
9363 : : else
9364 : 0 : STMT_VINFO_VEC_STMTS (stmt_info).safe_push (vperm);
9365 : : }
9366 : :
9367 : 32 : if (!slp_node)
9368 : 0 : *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
9369 : : return true;
9370 : 262 : }
9371 : :
9372 : : /* Return true if VECTYPE represents a vector that requires lowering
9373 : : by the vector lowering pass. */
9374 : :
9375 : : bool
9376 : 660871 : vect_emulated_vector_p (tree vectype)
9377 : : {
9378 : 1321742 : return (!VECTOR_MODE_P (TYPE_MODE (vectype))
9379 : 663488 : && (!VECTOR_BOOLEAN_TYPE_P (vectype)
9380 : 2599 : || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
9381 : : }
9382 : :
9383 : : /* Return true if we can emulate CODE on an integer mode representation
9384 : : of a vector. */
9385 : :
9386 : : bool
9387 : 21567 : vect_can_vectorize_without_simd_p (tree_code code)
9388 : : {
9389 : 21567 : switch (code)
9390 : : {
9391 : : case PLUS_EXPR:
9392 : : case MINUS_EXPR:
9393 : : case NEGATE_EXPR:
9394 : : case BIT_AND_EXPR:
9395 : : case BIT_IOR_EXPR:
9396 : : case BIT_XOR_EXPR:
9397 : : case BIT_NOT_EXPR:
9398 : : return true;
9399 : :
9400 : 12927 : default:
9401 : 12927 : return false;
9402 : : }
9403 : : }
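The bitwise codes in the list above are the clearest case of vectorizing without SIMD: one word-sized integer operation already processes all packed elements because AND/IOR/XOR/NOT never carry across bit positions. A hedged sketch of that idea (illustration only; GCC's actual lowering lives in the generic vector-lowering code):

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    int main (void)
    {
      uint8_t a[4] = { 0x0f, 0xf0, 0xaa, 0x55 };
      uint8_t b[4] = { 0xff, 0x0f, 0x0f, 0xf0 };
      uint8_t c[4];

      /* Emulate a V4QI bitwise AND with one 32-bit word operation: packing
         four bytes into a word keeps every lane independent because AND
         never carries across bit positions.  */
      uint32_t wa, wb, wc;
      memcpy (&wa, a, 4);
      memcpy (&wb, b, 4);
      wc = wa & wb;
      memcpy (c, &wc, 4);

      for (int i = 0; i < 4; i++)
        printf ("c[%d] = 0x%02x (expected 0x%02x)\n", i, c[i], a[i] & b[i]);
      return 0;
    }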
9404 : :
9405 : : /* Likewise, but taking a code_helper. */
9406 : :
9407 : : bool
9408 : 1565 : vect_can_vectorize_without_simd_p (code_helper code)
9409 : : {
9410 : 1565 : return (code.is_tree_code ()
9411 : 1565 : && vect_can_vectorize_without_simd_p (tree_code (code)));
9412 : : }
9413 : :
9414 : : /* Create the vector initial value for a vectorized IV. */
9415 : : static tree
9416 : 835 : vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9417 : : tree step_expr, poly_uint64 nunits,
9418 : : tree vectype,
9419 : : enum vect_induction_op_type induction_type)
9420 : : {
9421 : 835 : unsigned HOST_WIDE_INT const_nunits;
9422 : 835 : tree vec_shift, vec_init, new_name;
9423 : 835 : unsigned i;
9424 : 835 : tree itype = TREE_TYPE (vectype);
9425 : :
9426 : : /* iv_loop is the loop to be vectorized. Create:
9427 : : vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr). */
9428 : 835 : new_name = gimple_convert (stmts, itype, init_expr);
9429 : 835 : switch (induction_type)
9430 : : {
9431 : 18 : case vect_step_op_shr:
9432 : 18 : case vect_step_op_shl:
9433 : : /* Build the initial value from shift_expr. */
9434 : 18 : vec_init = gimple_build_vector_from_val (stmts,
9435 : : vectype,
9436 : : new_name);
9437 : 18 : vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
9438 : : build_zero_cst (itype), step_expr);
9439 : 18 : vec_init = gimple_build (stmts,
9440 : : (induction_type == vect_step_op_shr
9441 : : ? RSHIFT_EXPR : LSHIFT_EXPR),
9442 : : vectype, vec_init, vec_shift);
9443 : 18 : break;
9444 : :
9445 : 741 : case vect_step_op_neg:
9446 : 741 : {
9447 : 741 : vec_init = gimple_build_vector_from_val (stmts,
9448 : : vectype,
9449 : : new_name);
9450 : 741 : tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
9451 : : vectype, vec_init);
9452 : : /* The encoding has 2 interleaved stepped patterns. */
9453 : 741 : vec_perm_builder sel (nunits, 2, 3);
9454 : 741 : sel.quick_grow (6);
9455 : 3705 : for (i = 0; i < 3; i++)
9456 : : {
9457 : 2223 : sel[2 * i] = i;
9458 : 2223 : sel[2 * i + 1] = i + nunits;
9459 : : }
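: : /* For example, with nunits == 4 the selector is { 0, 4, 1, 5 },
: : interleaving vec_init and vec_neg into [ X, -X, X, -X ]. */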
9460 : 741 : vec_perm_indices indices (sel, 2, nunits);
9461 : : /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
9462 : : fail when vec_init is a constant vector. In that situation the
9463 : : vec_perm is not really needed. */
9464 : 741 : tree perm_mask_even
9465 : 741 : = vect_gen_perm_mask_any (vectype, indices);
9466 : 741 : vec_init = gimple_build (stmts, VEC_PERM_EXPR,
9467 : : vectype,
9468 : : vec_init, vec_neg,
9469 : : perm_mask_even);
9470 : 741 : }
9471 : 741 : break;
9472 : :
9473 : 76 : case vect_step_op_mul:
9474 : 76 : {
9475 : : /* Use an unsigned multiplication to avoid undefined behavior from
: : signed integer overflow. */
9476 : 76 : gcc_assert (nunits.is_constant (&const_nunits));
9477 : 76 : tree utype = unsigned_type_for (itype);
9478 : 76 : tree uvectype = build_vector_type (utype,
9479 : 76 : TYPE_VECTOR_SUBPARTS (vectype));
9480 : 76 : new_name = gimple_convert (stmts, utype, new_name);
9481 : 76 : vec_init = gimple_build_vector_from_val (stmts,
9482 : : uvectype,
9483 : : new_name);
9484 : 76 : tree_vector_builder elts (uvectype, const_nunits, 1);
9485 : 76 : tree elt_step = build_one_cst (utype);
9486 : :
9487 : 76 : elts.quick_push (elt_step);
9488 : 660 : for (i = 1; i < const_nunits; i++)
9489 : : {
9490 : : /* Create: elt_step_i = elt_step_{i-1} * step_expr, i.e. pow (step_expr, i). */
9491 : 508 : elt_step = gimple_build (stmts, MULT_EXPR,
9492 : : utype, elt_step, step_expr);
9493 : 508 : elts.quick_push (elt_step);
9494 : : }
9495 : : /* Create a vector from [new_name_0, new_name_1, ...,
9496 : : new_name_nunits-1]. */
9497 : 76 : tree vec_mul = gimple_build_vector (stmts, &elts);
9498 : 76 : vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
9499 : : vec_init, vec_mul);
9500 : 76 : vec_init = gimple_convert (stmts, vectype, vec_init);
9501 : 76 : }
9502 : 76 : break;
9503 : :
9504 : 0 : default:
9505 : 0 : gcc_unreachable ();
9506 : : }
9507 : :
9508 : 835 : return vec_init;
9509 : : }
9510 : :
9511 : : /* Peel INIT_EXPR by SKIP_NITERS iterations for INDUCTION_TYPE. */
9512 : : tree
9513 : 84 : vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9514 : : tree skip_niters, tree step_expr,
9515 : : enum vect_induction_op_type induction_type)
9516 : : {
9517 : 84 : gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
9518 : 84 : tree type = TREE_TYPE (init_expr);
9519 : 84 : unsigned prec = TYPE_PRECISION (type);
9520 : 84 : switch (induction_type)
9521 : : {
9522 : 0 : case vect_step_op_neg:
9523 : 0 : if (TREE_INT_CST_LOW (skip_niters) % 2)
9524 : 0 : init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
9525 : : /* else no change. */
9526 : : break;
9527 : :
9528 : 12 : case vect_step_op_shr:
9529 : 12 : case vect_step_op_shl:
9530 : 12 : skip_niters = gimple_convert (stmts, type, skip_niters);
9531 : 12 : step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
9532 : : /* When the total shift amount >= precision we need to avoid undefined
9533 : : behavior. The original loop has none, and by the shift semantics init_expr
9534 : : becomes 0 for lshr and shl, and init_expr >> (prec - 1) for ashr. */
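: : /* For example, a 32-bit IV shifted right by 8 each iteration and peeled
: : by 5 iterations needs a shift of 40 >= 32, so the peeled initial value
: : collapses to 0 (or to init_expr >> 31 for an arithmetic right shift). */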
9535 : 12 : if (!tree_fits_uhwi_p (step_expr)
9536 : 12 : || tree_to_uhwi (step_expr) >= prec)
9537 : : {
9538 : 6 : if (induction_type == vect_step_op_shl
9539 : 6 : || TYPE_UNSIGNED (type))
9540 : 4 : init_expr = build_zero_cst (type);
9541 : : else
9542 : 2 : init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
9543 : : init_expr,
9544 : 4 : wide_int_to_tree (type, prec - 1));
9545 : : }
9546 : : else
9547 : 8 : init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
9548 : : ? RSHIFT_EXPR : LSHIFT_EXPR),
9549 : : type, init_expr, step_expr);
9550 : : break;
9551 : :
9552 : 72 : case vect_step_op_mul:
9553 : 72 : {
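: : /* Advancing a mult IV by skip_niters iterations multiplies the initial
: : value by pow (step_expr, skip_niters); compute that power modulo
: : 2^prec in the unsigned type, e.g. skipping 3 iterations of x *= 5
: : scales the start value by 125. */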
9554 : 72 : tree utype = unsigned_type_for (type);
9555 : 72 : init_expr = gimple_convert (stmts, utype, init_expr);
9556 : 72 : wide_int skipn = wi::to_wide (skip_niters);
9557 : 72 : wide_int begin = wi::to_wide (step_expr);
9558 : 72 : auto_mpz base, exp, mod, res;
9559 : 72 : wi::to_mpz (begin, base, TYPE_SIGN (type));
9560 : 72 : wi::to_mpz (skipn, exp, UNSIGNED);
9561 : 72 : mpz_ui_pow_ui (mod, 2, TYPE_PRECISION (type));
9562 : 72 : mpz_powm (res, base, exp, mod);
9563 : 72 : begin = wi::from_mpz (utype, res, true);
9564 : 72 : tree mult_expr = wide_int_to_tree (utype, begin);
9565 : 72 : init_expr = gimple_build (stmts, MULT_EXPR, utype,
9566 : : init_expr, mult_expr);
9567 : 72 : init_expr = gimple_convert (stmts, type, init_expr);
9568 : 72 : }
9569 : 72 : break;
9570 : :
9571 : 0 : default:
9572 : 0 : gcc_unreachable ();
9573 : : }
9574 : :
9575 : 84 : return init_expr;
9576 : : }
9577 : :
9578 : : /* Create vector step for vectorized iv. */
9579 : : static tree
9580 : 1072 : vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
9581 : : poly_uint64 vf,
9582 : : enum vect_induction_op_type induction_type)
9583 : : {
9584 : 1072 : tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
9585 : 1072 : tree new_name = NULL;
9586 : : /* Step should be pow (step, vf) for mult induction. */
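: : /* For example, with VF == 4 a mult IV advances by pow (S, 4) per vector
: : iteration, a shr/shl IV by a shift amount of 4*S, and a neg IV needs
: : no step at all. */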
9587 : 1072 : if (induction_type == vect_step_op_mul)
9588 : : {
9589 : 76 : gcc_assert (vf.is_constant ());
9590 : 76 : wide_int begin = wi::to_wide (step_expr);
9591 : :
9592 : 584 : for (unsigned i = 0; i != vf.to_constant () - 1; i++)
9593 : 508 : begin = wi::mul (begin, wi::to_wide (step_expr));
9594 : :
9595 : 76 : new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
9596 : 76 : }
9597 : 996 : else if (induction_type == vect_step_op_neg)
9598 : : /* Do nothing. */
9599 : : ;
9600 : : else
9601 : 18 : new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
9602 : : expr, step_expr);
9603 : 1072 : return new_name;
9604 : : }
9605 : :
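: : /* Broadcast the scalar step NEW_NAME into a vector of VECTYPE usable
: : inside the loop, or return NULL_TREE when no vector step is needed
: : (neg induction). */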
9606 : : static tree
9607 : 1072 : vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
9608 : : stmt_vec_info stmt_info,
9609 : : tree new_name, tree vectype,
9610 : : enum vect_induction_op_type induction_type)
9611 : : {
9612 : : /* No step is needed for neg induction. */
9613 : 1072 : if (induction_type == vect_step_op_neg)
9614 : : return NULL;
9615 : :
9616 : 94 : tree t = unshare_expr (new_name);
9617 : 94 : gcc_assert (CONSTANT_CLASS_P (new_name)
9618 : : || TREE_CODE (new_name) == SSA_NAME);
9619 : 94 : tree new_vec = build_vector_from_val (vectype, t);
9620 : 94 : tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
9621 : : new_vec, vectype, NULL);
9622 : 94 : return vec_step;
9623 : : }
9624 : :
9625 : : /* Advance the vectorized iv INDUC_DEF by VEC_STEP according to
: : INDUCTION_TYPE, appending the update statements to STMTS. */
9626 : : static tree
9627 : 1254 : vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
9628 : : tree induc_def, tree vec_step,
9629 : : enum vect_induction_op_type induction_type)
9630 : : {
9631 : 1254 : tree vec_def = induc_def;
9632 : 1254 : switch (induction_type)
9633 : : {
9634 : 76 : case vect_step_op_mul:
9635 : 76 : {
9636 : : /* Use an unsigned multiplication to avoid undefined behavior from
: : signed integer overflow. */
9637 : 76 : tree uvectype
9638 : 76 : = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)),
9639 : 76 : TYPE_VECTOR_SUBPARTS (vectype));
9640 : 76 : vec_def = gimple_convert (stmts, uvectype, vec_def);
9641 : 76 : vec_step = gimple_convert (stmts, uvectype, vec_step);
9642 : 76 : vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
9643 : : vec_def, vec_step);
9644 : 76 : vec_def = gimple_convert (stmts, vectype, vec_def);
9645 : : }
9646 : 76 : break;
9647 : :
9648 : 12 : case vect_step_op_shr:
9649 : 12 : vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
9650 : : vec_def, vec_step);
9651 : 12 : break;
9652 : :
9653 : 6 : case vect_step_op_shl:
9654 : 6 : vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
9655 : : vec_def, vec_step);
9656 : 6 : break;
9657 : : case vect_step_op_neg:
9658 : : vec_def = induc_def;
9659 : : /* Do nothing. */
9660 : : break;
9661 : 0 : default:
9662 : 0 : gcc_unreachable ();
9663 : : }
9664 : :
9665 : 1254 : return vec_def;
9666 : :
9667 : : }
9668 : :
9669 : : /* Function vectorizable_nonlinear_induction
9670 : :
9671 : : Check if STMT_INFO performs a nonlinear induction computation that can be
9672 : : vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
9673 : : a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
9674 : : basic block.
9675 : : Return true if STMT_INFO is vectorizable in this way. */
9676 : :
9677 : : static bool
9678 : 10052 : vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
9679 : : stmt_vec_info stmt_info,
9680 : : gimple **vec_stmt, slp_tree slp_node,
9681 : : stmt_vector_for_cost *cost_vec)
9682 : : {
9683 : 10052 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9684 : 10052 : unsigned ncopies;
9685 : 10052 : bool nested_in_vect_loop = false;
9686 : 10052 : class loop *iv_loop;
9687 : 10052 : tree vec_def;
9688 : 10052 : edge pe = loop_preheader_edge (loop);
9689 : 10052 : basic_block new_bb;
9690 : 10052 : tree vec_init, vec_step;
9691 : 10052 : tree new_name;
9692 : 10052 : gimple *new_stmt;
9693 : 10052 : gphi *induction_phi;
9694 : 10052 : tree induc_def, vec_dest;
9695 : 10052 : tree init_expr, step_expr;
9696 : 10052 : tree niters_skip;
9697 : 10052 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9698 : 10052 : unsigned i;
9699 : 10052 : gimple_stmt_iterator si;
9700 : :
9701 : 10052 : gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9702 : :
9703 : 10052 : tree vectype = SLP_TREE_VECTYPE (slp_node);
9704 : 10052 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9705 : 10052 : enum vect_induction_op_type induction_type
9706 : : = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9707 : :
9708 : 10052 : gcc_assert (induction_type > vect_step_op_add);
9709 : :
9710 : 10052 : ncopies = vect_get_num_copies (loop_vinfo, slp_node, vectype);
9711 : 10052 : gcc_assert (ncopies >= 1);
9712 : :
9713 : : /* FORNOW. Only handle nonlinear induction in the same loop. */
9714 : 10052 : if (nested_in_vect_loop_p (loop, stmt_info))
9715 : : {
9716 : 0 : if (dump_enabled_p ())
9717 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9718 : : "nonlinear induction in nested loop.\n");
9719 : 0 : return false;
9720 : : }
9721 : :
9722 : 10052 : iv_loop = loop;
9723 : 10052 : gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9724 : :
9725 : : /* TODO: Support multi-lane SLP for nonlinear iv. There should be separate
9726 : : vector iv update for each iv and a permutation to generate wanted
9727 : : vector iv. */
9728 : 10052 : if (SLP_TREE_LANES (slp_node) > 1)
9729 : : {
9730 : 0 : if (dump_enabled_p ())
9731 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9732 : : "SLP induction not supported for nonlinear"
9733 : : " induction.\n");
9734 : 0 : return false;
9735 : : }
9736 : :
9737 : 10052 : if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
9738 : : {
9739 : 0 : if (dump_enabled_p ())
9740 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9741 : : "floating point nonlinear induction vectorization"
9742 : : " not supported.\n");
9743 : 0 : return false;
9744 : : }
9745 : :
9746 : 10052 : step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9747 : 10052 : init_expr = vect_phi_initial_value (phi);
9748 : 10052 : gcc_assert (step_expr != NULL_TREE && init_expr != NULL
9749 : : && TREE_CODE (step_expr) == INTEGER_CST);
9750 : : /* step_expr should have the same type as init_expr,
9751 : : e.g. for uint64 a >> 1 the step is an int but a vector<uint64> shift is used. */
9752 : 10052 : step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
9753 : :
9754 : 10052 : if (TREE_CODE (init_expr) == INTEGER_CST)
9755 : 2498 : init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
9756 : 7554 : else if (!tree_nop_conversion_p (TREE_TYPE (vectype), TREE_TYPE (init_expr)))
9757 : : {
9758 : : /* INIT_EXPR could be a bit_field, bail out for such case. */
9759 : 4 : if (dump_enabled_p ())
9760 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9761 : : "nonlinear induction vectorization failed:"
9762 : : " component type of vectype is not a nop conversion"
9763 : : " from type of init_expr.\n");
9764 : 4 : return false;
9765 : : }
9766 : :
9767 : 10048 : switch (induction_type)
9768 : : {
9769 : 2214 : case vect_step_op_neg:
9770 : 2214 : if (maybe_eq (TYPE_VECTOR_SUBPARTS (vectype), 1u))
9771 : : return false;
9772 : 2214 : if (TREE_CODE (init_expr) != INTEGER_CST
9773 : 199 : && TREE_CODE (init_expr) != REAL_CST)
9774 : : {
9775 : : /* Check for backend support of NEGATE_EXPR and vec_perm. */
9776 : 199 : if (!directly_supported_p (NEGATE_EXPR, vectype))
9777 : 0 : return false;
9778 : :
9779 : : /* The encoding has 2 interleaved stepped patterns. */
9780 : 199 : vec_perm_builder sel (nunits, 2, 3);
9781 : 199 : machine_mode mode = TYPE_MODE (vectype);
9782 : 199 : sel.quick_grow (6);
9783 : 995 : for (i = 0; i < 3; i++)
9784 : : {
9785 : 597 : sel[i * 2] = i;
9786 : 597 : sel[i * 2 + 1] = i + nunits;
9787 : : }
9788 : 199 : vec_perm_indices indices (sel, 2, nunits);
9789 : 199 : if (!can_vec_perm_const_p (mode, mode, indices))
9790 : 0 : return false;
9791 : 199 : }
9792 : : break;
9793 : :
9794 : 732 : case vect_step_op_mul:
9795 : 732 : {
9796 : : /* Check for backend support of MULT_EXPR. */
9797 : 732 : if (!directly_supported_p (MULT_EXPR, vectype))
9798 : : return false;
9799 : :
9800 : : /* ??? How to construct the vector step for variable-length vectors:
9801 : : [ 1, step, pow (step, 2), pow (step, 3), ... ]. */
9802 : : if (!vf.is_constant ())
9803 : : return false;
9804 : : }
9805 : : break;
9806 : :
9807 : 7004 : case vect_step_op_shr:
9808 : : /* Check for backend support of RSHIFT_EXPR. */
9809 : 7004 : if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
9810 : : return false;
9811 : :
9812 : : /* Don't shift more than the type precision to avoid undefined behavior. */
9813 : 30 : if (!tree_fits_uhwi_p (step_expr)
9814 : 30 : || maybe_ge (nunits * tree_to_uhwi (step_expr),
9815 : : TYPE_PRECISION (TREE_TYPE (init_expr))))
9816 : : return false;
9817 : : break;
9818 : :
9819 : 98 : case vect_step_op_shl:
9820 : : /* Check for backend support of LSHIFT_EXPR. */
9821 : 98 : if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
9822 : : return false;
9823 : :
9824 : : /* Don't shift more than the type precision to avoid undefined behavior. */
9825 : 14 : if (!tree_fits_uhwi_p (step_expr)
9826 : 14 : || maybe_ge (nunits * tree_to_uhwi (step_expr),
9827 : : TYPE_PRECISION (TREE_TYPE (init_expr))))
9828 : : return false;
9829 : :
9830 : : break;
9831 : :
9832 : 0 : default:
9833 : 0 : gcc_unreachable ();
9834 : : }
9835 : :
9836 : 2829 : if (!vec_stmt) /* transformation not required. */
9837 : : {
9838 : 1994 : unsigned inside_cost = 0, prologue_cost = 0;
9839 : : /* loop cost for vec_loop. */
9841 : 1994 : inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9842 : : stmt_info, 0, vect_body);
9843 : :
9844 : : /* loop cost for vec_loop. Neg induction doesn't have any
9845 : : inside_cost. */
9846 : 1994 : if (induction_type == vect_step_op_neg)
9847 : 1473 : inside_cost = 0;
9848 : :
9849 : : /* prologue cost for vec_init and vec_step. */
9850 : 1994 : prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9851 : : stmt_info, 0, vect_prologue);
9852 : :
9853 : 1994 : if (dump_enabled_p ())
9854 : 74 : dump_printf_loc (MSG_NOTE, vect_location,
9855 : : "vect_model_induction_cost: inside_cost = %d, "
9856 : : "prologue_cost = %d. \n", inside_cost,
9857 : : prologue_cost);
9858 : :
9859 : 1994 : STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9860 : 1994 : DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9861 : 1994 : return true;
9862 : : }
9863 : :
9864 : : /* Transform. */
9865 : :
9866 : : /* Compute a vector variable, initialized with the first VF values of
9867 : : the induction variable. E.g., for an IV with initial value X and
9868 : : step S, for a vector of 4 units we want to compute e.g.
9869 : : [X, X*S, X*S^2, X*S^3] for a mult IV or [X, -X, X, -X] for a neg IV. */
9870 : :
9871 : 835 : if (dump_enabled_p ())
9872 : 34 : dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9873 : :
9874 : 835 : pe = loop_preheader_edge (iv_loop);
9875 : : /* Find the first insertion point in the BB. */
9876 : 835 : basic_block bb = gimple_bb (phi);
9877 : 835 : si = gsi_after_labels (bb);
9878 : :
9879 : 835 : gimple_seq stmts = NULL;
9880 : :
9881 : 835 : niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9882 : : /* If we are using the loop mask to "peel" for alignment then we need
9883 : : to adjust the start value here. */
9884 : 835 : if (niters_skip != NULL_TREE)
9885 : 0 : init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
9886 : : step_expr, induction_type);
9887 : :
9888 : 835 : vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
9889 : : step_expr, nunits, vectype,
9890 : : induction_type);
9891 : 835 : if (stmts)
9892 : : {
9893 : 164 : new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9894 : 164 : gcc_assert (!new_bb);
9895 : : }
9896 : :
9897 : 835 : stmts = NULL;
9898 : 835 : new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9899 : : vf, induction_type);
9900 : 835 : if (stmts)
9901 : : {
9902 : 0 : new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9903 : 0 : gcc_assert (!new_bb);
9904 : : }
9905 : :
9906 : 835 : vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9907 : : new_name, vectype,
9908 : : induction_type);
9909 : : /* Create the following def-use cycle:
9910 : : loop prolog:
9911 : : vec_init = ...
9912 : : vec_step = ...
9913 : : loop:
9914 : : vec_iv = PHI <vec_init, vec_loop>
9915 : : ...
9916 : : STMT
9917 : : ...
9918 : : vec_loop = vec_iv OP vec_step;  (OP is *, >> or << depending on the
: : induction type; a neg IV needs no update). */
9919 : :
9920 : : /* Create the induction-phi that defines the induction-operand. */
9921 : 835 : vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9922 : 835 : induction_phi = create_phi_node (vec_dest, iv_loop->header);
9923 : 835 : induc_def = PHI_RESULT (induction_phi);
9924 : :
9925 : : /* Create the iv update inside the loop. */
9926 : 835 : stmts = NULL;
9927 : 835 : vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9928 : : induc_def, vec_step,
9929 : : induction_type);
9930 : :
9931 : 835 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9932 : 835 : new_stmt = SSA_NAME_DEF_STMT (vec_def);
9933 : :
9934 : : /* Set the arguments of the phi node: */
9935 : 835 : add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9936 : 835 : add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9937 : : UNKNOWN_LOCATION);
9938 : :
9939 : 835 : slp_node->push_vec_def (induction_phi);
9940 : :
9941 : : /* In case that vectorization factor (VF) is bigger than the number
9942 : : of elements that we can fit in a vectype (nunits), we have to generate
9943 : : more than one vector stmt - i.e. we need to "unroll" the
9944 : : vector stmt by a factor VF/nunits. For more details see documentation
9945 : : in vectorizable_operation. */
9946 : :
9947 : 835 : if (ncopies > 1)
9948 : : {
9949 : 237 : stmts = NULL;
9950 : : /* FORNOW. This restriction should be relaxed. */
9951 : 237 : gcc_assert (!nested_in_vect_loop);
9952 : :
9953 : 237 : new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9954 : : nunits, induction_type);
9955 : :
9956 : 237 : vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9957 : : new_name, vectype,
9958 : : induction_type);
9959 : 237 : vec_def = induc_def;
9960 : 893 : for (i = 1; i < ncopies; i++)
9961 : : {
9962 : : /* vec_i = vec_prev + vec_step. */
9963 : 419 : stmts = NULL;
9964 : 419 : vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9965 : : vec_def, vec_step,
9966 : : induction_type);
9967 : 419 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9968 : 419 : new_stmt = SSA_NAME_DEF_STMT (vec_def);
9969 : 419 : slp_node->push_vec_def (new_stmt);
9970 : : }
9971 : : }
9972 : :
9973 : 835 : if (dump_enabled_p ())
9974 : 68 : dump_printf_loc (MSG_NOTE, vect_location,
9975 : : "transform induction: created def-use cycle: %G%G",
9976 : 34 : (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9977 : :
9978 : : return true;
9979 : : }
9980 : :
9981 : : /* Function vectorizable_induction
9982 : :
9983 : : Check if STMT_INFO performs an induction computation that can be vectorized.
9984 : : If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9985 : : phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9986 : : Return true if STMT_INFO is vectorizable in this way. */
9987 : :
9988 : : bool
9989 : 411741 : vectorizable_induction (loop_vec_info loop_vinfo,
9990 : : stmt_vec_info stmt_info,
9991 : : gimple **vec_stmt, slp_tree slp_node,
9992 : : stmt_vector_for_cost *cost_vec)
9993 : : {
9994 : 411741 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9995 : 411741 : bool nested_in_vect_loop = false;
9996 : 411741 : class loop *iv_loop;
9997 : 411741 : tree vec_def;
9998 : 411741 : edge pe = loop_preheader_edge (loop);
9999 : 411741 : basic_block new_bb;
10000 : 411741 : tree vec_init = NULL_TREE, vec_step, t;
10001 : 411741 : tree new_name;
10002 : 411741 : gphi *induction_phi;
10003 : 411741 : tree induc_def, vec_dest;
10004 : 411741 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
10005 : 411741 : unsigned i;
10006 : 411741 : tree expr;
10007 : 411741 : tree index_vectype = NULL_TREE;
10008 : 411741 : gimple_stmt_iterator si;
10009 : 411741 : enum vect_induction_op_type induction_type
10010 : : = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
10011 : :
10012 : 438534 : gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
10013 : 156896 : if (!phi)
10014 : : return false;
10015 : :
10016 : 156896 : if (!STMT_VINFO_RELEVANT_P (stmt_info))
10017 : : return false;
10018 : :
10019 : : /* Make sure it was recognized as induction computation. */
10020 : 156896 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
10021 : : return false;
10022 : :
10023 : : /* Handle nonlinear induction in a separate place. */
10024 : 152850 : if (induction_type != vect_step_op_add)
10025 : 10052 : return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
10026 : 10052 : vec_stmt, slp_node, cost_vec);
10027 : :
10028 : 142798 : tree vectype = SLP_TREE_VECTYPE (slp_node);
10029 : 142798 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10030 : :
10031 : : /* FORNOW. These restrictions should be relaxed. */
10032 : 142798 : if (nested_in_vect_loop_p (loop, stmt_info))
10033 : : {
10034 : 666 : imm_use_iterator imm_iter;
10035 : 666 : use_operand_p use_p;
10036 : 666 : gimple *exit_phi;
10037 : 666 : edge latch_e;
10038 : 666 : tree loop_arg;
10039 : :
10040 : 666 : exit_phi = NULL;
10041 : 666 : latch_e = loop_latch_edge (loop->inner);
10042 : 666 : loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
10043 : 1374 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
10044 : : {
10045 : 718 : gimple *use_stmt = USE_STMT (use_p);
10046 : 718 : if (is_gimple_debug (use_stmt))
10047 : 36 : continue;
10048 : :
10049 : 682 : if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
10050 : : {
10051 : : exit_phi = use_stmt;
10052 : : break;
10053 : : }
10054 : : }
10055 : 666 : if (exit_phi)
10056 : : {
10057 : 10 : stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
10058 : 10 : if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
10059 : 6 : && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
10060 : : {
10061 : 4 : if (dump_enabled_p ())
10062 : 4 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10063 : : "inner-loop induction only used outside "
10064 : : "of the outer vectorized loop.\n");
10065 : 4 : return false;
10066 : : }
10067 : : }
10068 : :
10069 : 662 : nested_in_vect_loop = true;
10070 : 662 : iv_loop = loop->inner;
10071 : : }
10072 : : else
10073 : : iv_loop = loop;
10074 : 142794 : gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
10075 : :
10076 : 142794 : if (!nunits.is_constant () && SLP_TREE_LANES (slp_node) != 1)
10077 : : {
10078 : : /* The current SLP code creates the step value element-by-element. */
10079 : : if (dump_enabled_p ())
10080 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10081 : : "SLP induction not supported for variable-length"
10082 : : " vectors.\n");
10083 : : return false;
10084 : : }
10085 : :
10086 : 142794 : if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
10087 : : {
10088 : 12 : if (dump_enabled_p ())
10089 : 12 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10090 : : "floating point induction vectorization disabled\n");
10091 : 12 : return false;
10092 : : }
10093 : :
10094 : 142782 : tree step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
10095 : 142782 : gcc_assert (step_expr != NULL_TREE);
10096 : 285518 : if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
10097 : 285423 : && !type_has_mode_precision_p (TREE_TYPE (step_expr)))
10098 : : {
10099 : 12 : if (dump_enabled_p ())
10100 : 12 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10101 : : "bit-precision induction vectorization not "
10102 : : "supported.\n");
10103 : 12 : return false;
10104 : : }
10105 : 142770 : tree stept = TREE_TYPE (step_expr);
10106 : 142770 : tree step_vectype = get_same_sized_vectype (stept, vectype);
10107 : :
10108 : : /* Check for target support of the vectorized arithmetic used here. */
10109 : 142770 : if (!target_supports_op_p (step_vectype, PLUS_EXPR, optab_default)
10110 : 142770 : || !target_supports_op_p (step_vectype, MINUS_EXPR, optab_default))
10111 : 22719 : return false;
10112 : 120051 : if (!nunits.is_constant ())
10113 : : {
10114 : : if (!target_supports_op_p (step_vectype, MULT_EXPR, optab_default))
10115 : : return false;
10116 : : /* FLOAT_EXPR when computing VEC_INIT for float inductions. */
10117 : : if (SCALAR_FLOAT_TYPE_P (stept))
10118 : : {
10119 : : tree index_type = build_nonstandard_integer_type
10120 : : (GET_MODE_BITSIZE (SCALAR_TYPE_MODE (stept)), 1);
10121 : :
10122 : : index_vectype = build_vector_type (index_type, nunits);
10123 : : if (!can_float_p (TYPE_MODE (step_vectype),
10124 : : TYPE_MODE (index_vectype), 1))
10125 : : return false;
10126 : : }
10127 : : }
10128 : :
10129 : 120051 : if (!vec_stmt) /* transformation not required. */
10130 : : {
10131 : 314907 : unsigned inside_cost = 0, prologue_cost = 0;
10132 : : /* We eventually need to set a vector type on invariant
10133 : : arguments. */
10134 : : unsigned j;
10135 : : slp_tree child;
10136 : 314907 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
10137 : 209938 : if (!vect_maybe_update_slp_op_vectype
10138 : 209938 : (child, SLP_TREE_VECTYPE (slp_node)))
10139 : : {
10140 : 0 : if (dump_enabled_p ())
10141 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10142 : : "incompatible vector types for "
10143 : : "invariants\n");
10144 : 0 : return false;
10145 : : }
10146 : : /* loop cost for vec_loop. */
10147 : 209938 : inside_cost = record_stmt_cost (cost_vec,
10148 : 104969 : SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
10149 : : vector_stmt, stmt_info, 0, vect_body);
10150 : : /* prologue cost for vec_init (if not nested) and step. */
10151 : 104969 : prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
10152 : : scalar_to_vec,
10153 : : stmt_info, 0, vect_prologue);
10154 : 104969 : if (dump_enabled_p ())
10155 : 4527 : dump_printf_loc (MSG_NOTE, vect_location,
10156 : : "vect_model_induction_cost: inside_cost = %d, "
10157 : : "prologue_cost = %d .\n", inside_cost,
10158 : : prologue_cost);
10159 : :
10160 : 104969 : STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
10161 : 104969 : DUMP_VECT_SCOPE ("vectorizable_induction");
10162 : 104969 : return true;
10163 : : }
10164 : :
10165 : : /* Transform. */
10166 : :
10167 : : /* Compute a vector variable, initialized with the first VF values of
10168 : : the induction variable. E.g., for an iv with IV_PHI='X' and
10169 : : evolution S, for a vector of 4 units, we want to compute:
10170 : : [X, X + S, X + 2*S, X + 3*S]. */
10171 : :
10172 : 15082 : if (dump_enabled_p ())
10173 : 2766 : dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
10174 : :
10175 : 15082 : pe = loop_preheader_edge (iv_loop);
10176 : : /* Find the first insertion point in the BB. */
10177 : 15082 : basic_block bb = gimple_bb (phi);
10178 : 15082 : si = gsi_after_labels (bb);
10179 : :
10180 : : /* For SLP induction we have to generate several IVs; for example,
10181 : : with group size 3 we need
10182 : : [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
10183 : : [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
10184 : 15082 : gimple_stmt_iterator incr_si;
10185 : 15082 : bool insert_after;
10186 : 15082 : standard_iv_increment_position (iv_loop, &incr_si, &insert_after);
10187 : :
10188 : : /* The initial values are vectorized, but any lanes > group_size
10189 : : need adjustment. */
10190 : 15082 : slp_tree init_node
10191 : 15082 : = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
10192 : :
10193 : : /* Gather steps. Since we do not vectorize inductions as
10194 : : cycles we have to reconstruct the step from SCEV data. */
10195 : 15082 : unsigned group_size = SLP_TREE_LANES (slp_node);
10196 : 15082 : tree *steps = XALLOCAVEC (tree, group_size);
10197 : 15082 : tree *inits = XALLOCAVEC (tree, group_size);
10198 : 15082 : stmt_vec_info phi_info;
10199 : 31348 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
10200 : : {
10201 : 16266 : steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
10202 : 16266 : if (!init_node)
10203 : 16152 : inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
10204 : : pe->dest_idx);
10205 : : }
10206 : :
10207 : : /* Now generate the IVs. */
10208 : 15082 : unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10209 : 30164 : gcc_assert (multiple_p (nunits * nvects, group_size));
10210 : 15082 : unsigned nivs;
10211 : 15082 : unsigned HOST_WIDE_INT const_nunits;
10212 : 15082 : if (nested_in_vect_loop)
10213 : : nivs = nvects;
10214 : 14982 : else if (nunits.is_constant (&const_nunits))
10215 : : {
10216 : : /* Compute the number of distinct IVs we need. First reduce
10217 : : group_size if it is a multiple of const_nunits so we get
10218 : : one IV for a group_size of 4 but const_nunits 2. */
10219 : 14982 : unsigned group_sizep = group_size;
10220 : 14982 : if (group_sizep % const_nunits == 0)
10221 : 109 : group_sizep = group_sizep / const_nunits;
10222 : 14982 : nivs = least_common_multiple (group_sizep, const_nunits) / const_nunits;
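: : /* For instance, group_size 3 with const_nunits 4 needs
: : lcm (3, 4) / 4 == 3 IVs, while group_size 4 with const_nunits 2
: : reduces to a single IV. */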
10223 : : }
10224 : : else
10225 : : {
10226 : : gcc_assert (SLP_TREE_LANES (slp_node) == 1);
10227 : : nivs = 1;
10228 : : }
10229 : 15082 : gimple_seq init_stmts = NULL;
10230 : 15082 : tree lupdate_mul = NULL_TREE;
10231 : 100 : if (!nested_in_vect_loop)
10232 : : {
10233 : 14982 : if (nunits.is_constant (&const_nunits))
10234 : : {
10235 : : /* The number of iterations covered in one vector iteration. */
10236 : 14982 : unsigned lup_mul = (nvects * const_nunits) / group_size;
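: : /* E.g. two vector stmts of four lanes covering a group of two IVs
: : advance each IV by (2 * 4) / 2 == 4 scalar iterations. */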
10237 : 14982 : lupdate_mul
10238 : 14982 : = build_vector_from_val (step_vectype,
10239 : 14982 : SCALAR_FLOAT_TYPE_P (stept)
10240 : 26 : ? build_real_from_wide (stept, lup_mul,
10241 : : UNSIGNED)
10242 : 29938 : : build_int_cstu (stept, lup_mul));
10243 : : }
10244 : : else
10245 : : {
10246 : : if (SCALAR_FLOAT_TYPE_P (stept))
10247 : : {
10248 : : tree tem = build_int_cst (integer_type_node, vf);
10249 : : lupdate_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept, tem);
10250 : : }
10251 : : else
10252 : : lupdate_mul = build_int_cst (stept, vf);
10253 : : lupdate_mul = gimple_build_vector_from_val (&init_stmts, step_vectype,
10254 : : lupdate_mul);
10255 : : }
10256 : : }
10257 : 15082 : tree peel_mul = NULL_TREE;
10258 : 15082 : if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
10259 : : {
10260 : 0 : if (SCALAR_FLOAT_TYPE_P (stept))
10261 : 0 : peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
10262 : : LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10263 : : else
10264 : 0 : peel_mul = gimple_convert (&init_stmts, stept,
10265 : : LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10266 : 0 : peel_mul = gimple_build_vector_from_val (&init_stmts,
10267 : : step_vectype, peel_mul);
10268 : :
10269 : : /* If early break then we have to create a new PHI which we can use as
10270 : : an offset to adjust the induction reduction in early exits.
10271 : :
10272 : : This is because when peeling for alignment using masking, the first
10273 : : few elements of the vector can be inactive. As such if we find the
10274 : : entry in the first iteration we have to adjust the starting point of
10275 : : the scalar code.
10276 : :
10277 : : We do this by creating a new scalar PHI that keeps track of whether
10278 : : we are the first iteration of the loop (with the additional masking)
10279 : : or whether we have taken a loop iteration already.
10280 : :
10281 : : The generated sequence:
10282 : :
10283 : : pre-header:
10284 : : bb1:
10285 : : i_1 = <number of leading inactive elements>
10286 : :
10287 : : header:
10288 : : bb2:
10289 : : i_2 = PHI <i_1(bb1), 0(latch)>
10290 : : …
10291 : :
10292 : : early-exit:
10293 : : bb3:
10294 : : i_3 = iv_step * i_2 + PHI<vector-iv>
10295 : :
10296 : : The first part of the adjustment to create i_1 and i_2 are done here
10297 : : and the last part creating i_3 is done in
10298 : : vectorizable_live_operations when the induction extraction is
10299 : : materialized. */
10300 : 0 : if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
10301 : 0 : && !LOOP_VINFO_MASK_NITERS_PFA_OFFSET (loop_vinfo))
10302 : : {
10303 : 0 : auto skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
10304 : 0 : tree ty_skip_niters = TREE_TYPE (skip_niters);
10305 : 0 : tree break_lhs_phi = vect_get_new_vect_var (ty_skip_niters,
10306 : : vect_scalar_var,
10307 : : "pfa_iv_offset");
10308 : 0 : gphi *nphi = create_phi_node (break_lhs_phi, bb);
10309 : 0 : add_phi_arg (nphi, skip_niters, pe, UNKNOWN_LOCATION);
10310 : 0 : add_phi_arg (nphi, build_zero_cst (ty_skip_niters),
10311 : : loop_latch_edge (iv_loop), UNKNOWN_LOCATION);
10312 : :
10313 : 0 : LOOP_VINFO_MASK_NITERS_PFA_OFFSET (loop_vinfo) = PHI_RESULT (nphi);
10314 : : }
10315 : : }
10316 : 15082 : tree step_mul = NULL_TREE;
10317 : 15082 : unsigned ivn;
10318 : 15082 : auto_vec<tree> vec_steps;
10319 : 30662 : for (ivn = 0; ivn < nivs; ++ivn)
10320 : : {
10321 : 15580 : gimple_seq stmts = NULL;
10322 : 15580 : bool invariant = true;
10323 : 15580 : if (nunits.is_constant (&const_nunits))
10324 : : {
10325 : 15580 : tree_vector_builder step_elts (step_vectype, const_nunits, 1);
10326 : 15580 : tree_vector_builder init_elts (vectype, const_nunits, 1);
10327 : 15580 : tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
10328 : 97856 : for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
10329 : : {
10330 : : /* The scalar steps of the IVs. */
10331 : 82276 : tree elt = steps[(ivn*const_nunits + eltn) % group_size];
10332 : 82276 : elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
10333 : 82276 : step_elts.quick_push (elt);
10334 : 82276 : if (!init_node)
10335 : : {
10336 : : /* The scalar inits of the IVs if not vectorized. */
10337 : 81598 : elt = inits[(ivn*const_nunits + eltn) % group_size];
10338 : 81598 : if (!useless_type_conversion_p (TREE_TYPE (vectype),
10339 : 81598 : TREE_TYPE (elt)))
10340 : 214 : elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
10341 : 214 : TREE_TYPE (vectype), elt);
10342 : 81598 : init_elts.quick_push (elt);
10343 : : }
10344 : : /* The number of steps to add to the initial values. */
10345 : 82276 : unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
10346 : 164552 : mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
10347 : 164456 : ? build_real_from_wide (stept, mul_elt,
10348 : : UNSIGNED)
10349 : 164456 : : build_int_cstu (stept, mul_elt));
10350 : : }
10351 : 15580 : vec_step = gimple_build_vector (&init_stmts, &step_elts);
10352 : 15580 : step_mul = gimple_build_vector (&init_stmts, &mul_elts);
10353 : 15580 : if (!init_node)
10354 : 15464 : vec_init = gimple_build_vector (&init_stmts, &init_elts);
10355 : 15580 : }
10356 : : else
10357 : : {
10358 : : if (init_node)
10359 : : ;
10360 : : else if (INTEGRAL_TYPE_P (TREE_TYPE (steps[0])))
10361 : : {
10362 : : new_name = gimple_convert (&init_stmts, stept, inits[0]);
10363 : : /* Build the initial value directly as a VEC_SERIES_EXPR. */
10364 : : vec_init = gimple_build (&init_stmts, VEC_SERIES_EXPR,
10365 : : step_vectype, new_name, steps[0]);
10366 : : if (!useless_type_conversion_p (vectype, step_vectype))
10367 : : vec_init = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
10368 : : vectype, vec_init);
10369 : : }
10370 : : else
10371 : : {
10372 : : /* Build:
10373 : : [base, base, base, ...]
10374 : : + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
10375 : : gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (steps[0])));
10376 : : gcc_assert (flag_associative_math);
10377 : : gcc_assert (index_vectype != NULL_TREE);
10378 : :
10379 : : tree index = build_index_vector (index_vectype, 0, 1);
10380 : : new_name = gimple_convert (&init_stmts, TREE_TYPE (steps[0]),
10381 : : inits[0]);
10382 : : tree base_vec = gimple_build_vector_from_val (&init_stmts,
10383 : : step_vectype,
10384 : : new_name);
10385 : : tree step_vec = gimple_build_vector_from_val (&init_stmts,
10386 : : step_vectype,
10387 : : steps[0]);
10388 : : vec_init = gimple_build (&init_stmts, FLOAT_EXPR,
10389 : : step_vectype, index);
10390 : : vec_init = gimple_build (&init_stmts, MULT_EXPR,
10391 : : step_vectype, vec_init, step_vec);
10392 : : vec_init = gimple_build (&init_stmts, PLUS_EXPR,
10393 : : step_vectype, vec_init, base_vec);
10394 : : if (!useless_type_conversion_p (vectype, step_vectype))
10395 : : vec_init = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
10396 : : vectype, vec_init);
10397 : : }
10398 : : /* Generate the vector step:
10399 : : vec_step = [S, S, S, S]. */
10400 : : t = unshare_expr (steps[0]);
10401 : : gcc_assert (CONSTANT_CLASS_P (t)
10402 : : || TREE_CODE (t) == SSA_NAME);
10403 : : vec_step = gimple_build_vector_from_val (&init_stmts,
10404 : : step_vectype, t);
10405 : : }
10406 : 15580 : vec_steps.safe_push (vec_step);
10407 : 15580 : if (peel_mul)
10408 : : {
10409 : 0 : if (!step_mul)
10410 : : step_mul = peel_mul;
10411 : : else
10412 : 0 : step_mul = gimple_build (&init_stmts,
10413 : : MINUS_EXPR, step_vectype,
10414 : : step_mul, peel_mul);
10415 : : }
10416 : :
10417 : : /* Create the induction-phi that defines the induction-operand. */
10418 : 15580 : vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
10419 : : "vec_iv_");
10420 : 15580 : induction_phi = create_phi_node (vec_dest, iv_loop->header);
10421 : 15580 : induc_def = PHI_RESULT (induction_phi);
10422 : :
10423 : : /* Create the iv update inside the loop. */
10424 : 15580 : tree up = vec_step;
10425 : 15580 : if (lupdate_mul)
10426 : : {
10427 : 15464 : if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
10428 : : {
10429 : : /* When we're using the loop_len produced by SELECT_VL, the
10430 : : non-final iterations are not always processing VF
10431 : : elements. So instead of vectorizing the IV update as
10432 : :
10433 : : _21 = vect_vec_iv_.6_22 + { VF, ... };
10434 : :
10435 : : We should generate:
10436 : :
10437 : : _35 = .SELECT_VL (ivtmp_33, VF);
10438 : : vect_cst__22 = [vec_duplicate_expr] _35;
10439 : : _21 = vect_vec_iv_.6_22 + vect_cst__22; */
10440 : 0 : vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
10441 : 0 : tree len = vect_get_loop_len (loop_vinfo, NULL, lens, 1,
10442 : : vectype, 0, 0);
10443 : 0 : if (SCALAR_FLOAT_TYPE_P (stept))
10444 : 0 : expr = gimple_build (&stmts, FLOAT_EXPR, stept, len);
10445 : : else
10446 : 0 : expr = gimple_convert (&stmts, stept, len);
10447 : 0 : lupdate_mul = gimple_build_vector_from_val (&stmts, step_vectype,
10448 : : expr);
10449 : 0 : up = gimple_build (&stmts, MULT_EXPR,
10450 : : step_vectype, vec_step, lupdate_mul);
10451 : : }
10452 : : else
10453 : 15464 : up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10454 : : vec_step, lupdate_mul);
10455 : : }
10456 : 15580 : vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10457 : 15580 : vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, up);
10458 : 15580 : vec_def = gimple_convert (&stmts, vectype, vec_def);
10459 : 15580 : insert_iv_increment (&incr_si, insert_after, stmts);
10460 : 15580 : add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10461 : : UNKNOWN_LOCATION);
10462 : :
10463 : 15580 : if (init_node)
10464 : 116 : vec_init = vect_get_slp_vect_def (init_node, ivn);
10465 : 15580 : if (!nested_in_vect_loop
10466 : 15580 : && step_mul
10467 : 15580 : && !integer_zerop (step_mul))
10468 : : {
10469 : 15054 : gcc_assert (invariant);
10470 : 15054 : vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
10471 : 15054 : up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10472 : : vec_step, step_mul);
10473 : 15054 : vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
10474 : : vec_def, up);
10475 : 15054 : vec_init = gimple_convert (&init_stmts, vectype, vec_def);
10476 : : }
10477 : :
10478 : : /* Set the arguments of the phi node: */
10479 : 15580 : add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10480 : :
10481 : 15580 : slp_node->push_vec_def (induction_phi);
10482 : : }
10483 : 15082 : if (!nested_in_vect_loop)
10484 : : {
10485 : : /* Fill up to the number of vectors we need for the whole group. */
10486 : 14982 : if (nunits.is_constant (&const_nunits))
10487 : 14982 : nivs = least_common_multiple (group_size, const_nunits) / const_nunits;
10488 : : else
10489 : : nivs = 1;
10490 : 14982 : vec_steps.reserve (nivs-ivn);
10491 : 29991 : for (; ivn < nivs; ++ivn)
10492 : : {
10493 : 27 : slp_node->push_vec_def (SLP_TREE_VEC_DEFS (slp_node)[0]);
10494 : 27 : vec_steps.quick_push (vec_steps[0]);
10495 : : }
10496 : : }
10497 : :
10498 : : /* Re-use IVs when we can. We are generating further vector
10499 : : stmts by adding VF' * stride to the IVs generated above. */
10500 : 15082 : if (ivn < nvects)
10501 : : {
10502 : 3420 : if (nunits.is_constant (&const_nunits))
10503 : : {
10504 : 3420 : unsigned vfp = (least_common_multiple (group_size, const_nunits)
10505 : 3420 : / group_size);
10506 : 3420 : lupdate_mul
10507 : 3420 : = build_vector_from_val (step_vectype,
10508 : 3420 : SCALAR_FLOAT_TYPE_P (stept)
10509 : 7 : ? build_real_from_wide (stept,
10510 : 7 : vfp, UNSIGNED)
10511 : 6833 : : build_int_cstu (stept, vfp));
10512 : : }
10513 : : else
10514 : : {
10515 : : if (SCALAR_FLOAT_TYPE_P (stept))
10516 : : {
10517 : : tree tem = build_int_cst (integer_type_node, nunits);
10518 : : lupdate_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept, tem);
10519 : : }
10520 : : else
10521 : : lupdate_mul = build_int_cst (stept, nunits);
10522 : : lupdate_mul = gimple_build_vector_from_val (&init_stmts, step_vectype,
10523 : : lupdate_mul);
10524 : : }
10525 : 11086 : for (; ivn < nvects; ++ivn)
10526 : : {
10527 : 7666 : gimple *iv
10528 : 7666 : = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node)[ivn - nivs]);
10529 : 7666 : tree def = gimple_get_lhs (iv);
10530 : 7666 : if (ivn < 2*nivs)
10531 : 3466 : vec_steps[ivn - nivs]
10532 : 3466 : = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10533 : 3466 : vec_steps[ivn - nivs], lupdate_mul);
10534 : 7666 : gimple_seq stmts = NULL;
10535 : 7666 : def = gimple_convert (&stmts, step_vectype, def);
10536 : 22998 : def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10537 : 7666 : def, vec_steps[ivn % nivs]);
10538 : 7666 : def = gimple_convert (&stmts, vectype, def);
10539 : 7666 : if (gimple_code (iv) == GIMPLE_PHI)
10540 : 3466 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10541 : : else
10542 : : {
10543 : 4200 : gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
10544 : 4200 : gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
10545 : : }
10546 : 7666 : slp_node->push_vec_def (def);
10547 : : }
10548 : : }
10549 : :
10550 : 15082 : new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
10551 : 15082 : gcc_assert (!new_bb);
10552 : :
10553 : 15082 : return true;
10554 : 15082 : }
10555 : :
10556 : : /* Function vectorizable_live_operation_1.
10557 : :
10558 : : Helper function for vectorizable_live_operation. */
10559 : :
10560 : : static tree
10561 : 5106 : vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
10562 : : stmt_vec_info stmt_info, basic_block exit_bb,
10563 : : tree vectype, int ncopies, slp_tree slp_node,
10564 : : tree bitsize, tree bitstart, tree vec_lhs,
10565 : : tree lhs_type, gimple_stmt_iterator *exit_gsi)
10566 : : {
10567 : 5106 : gcc_assert (single_pred_p (exit_bb) || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10568 : :
10569 : 5106 : tree vec_lhs_phi = copy_ssa_name (vec_lhs);
10570 : 5106 : gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
10571 : 10605 : for (unsigned i = 0; i < gimple_phi_num_args (phi); i++)
10572 : 5499 : SET_PHI_ARG_DEF (phi, i, vec_lhs);
10573 : :
10574 : 5106 : gimple_seq stmts = NULL;
10575 : 5106 : tree new_tree;
10576 : :
10577 : : /* If bitstart is 0 then we can use a BIT_FIELD_REF. */
10578 : 5106 : if (integer_zerop (bitstart))
10579 : : {
10580 : 2646 : tree scalar_res = gimple_build (&stmts, BIT_FIELD_REF, TREE_TYPE (vectype),
10581 : : vec_lhs_phi, bitsize, bitstart);
10582 : :
10583 : : /* Convert the extracted vector element to the scalar type. */
10584 : 2646 : new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10585 : : }
10586 : 2460 : else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10587 : : {
10588 : : /* Emit:
10589 : :
10590 : : SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
10591 : :
10592 : : where VEC_LHS is the vectorized live-out result and MASK is
10593 : : where VEC_LHS is the vectorized live-out result and LEN is
10594 : : the loop length of the final iteration. */
10595 : : && (!slp_node || SLP_TREE_LANES (slp_node) == 1));
10596 : 0 : gimple_seq tem = NULL;
10597 : 0 : gimple_stmt_iterator gsi = gsi_last (tem);
10598 : 0 : tree len = vect_get_loop_len (loop_vinfo, &gsi,
10599 : : &LOOP_VINFO_LENS (loop_vinfo),
10600 : : 1, vectype, 0, 1);
10601 : 0 : gimple_seq_add_seq (&stmts, tem);
10602 : :
10603 : : /* BIAS - 1. */
10604 : 0 : signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
10605 : 0 : tree bias_minus_one
10606 : 0 : = int_const_binop (MINUS_EXPR,
10607 : 0 : build_int_cst (TREE_TYPE (len), biasval),
10608 : 0 : build_one_cst (TREE_TYPE (len)));
10609 : :
10610 : : /* LAST_INDEX = LEN + (BIAS - 1). */
10611 : 0 : tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
10612 : : len, bias_minus_one);
10613 : :
10614 : : /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>. */
10615 : 0 : tree scalar_res
10616 : 0 : = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
10617 : : vec_lhs_phi, last_index);
10618 : :
10619 : : /* Convert the extracted vector element to the scalar type. */
10620 : 0 : new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10621 : : }
10622 : 2460 : else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
10623 : : {
10624 : : /* Emit:
10625 : :
10626 : : SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
10627 : :
10628 : : where VEC_LHS is the vectorized live-out result and MASK is
10629 : : the loop mask for the final iteration. */
10630 : 0 : gcc_assert (!slp_node || SLP_TREE_LANES (slp_node) == 1);
10631 : 0 : tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
10632 : 0 : gimple_seq tem = NULL;
10633 : 0 : gimple_stmt_iterator gsi = gsi_last (tem);
10634 : 0 : tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
10635 : : &LOOP_VINFO_MASKS (loop_vinfo),
10636 : : 1, vectype, 0);
10637 : 0 : tree scalar_res;
10638 : 0 : gimple_seq_add_seq (&stmts, tem);
10639 : :
10640 : 0 : scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
10641 : : mask, vec_lhs_phi);
10642 : :
10643 : : /* Convert the extracted vector element to the scalar type. */
10644 : 0 : new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10645 : : }
10646 : : else
10647 : : {
10648 : 2460 : tree bftype = TREE_TYPE (vectype);
10649 : 2460 : if (VECTOR_BOOLEAN_TYPE_P (vectype))
10650 : 85 : bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10651 : 2460 : new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
10652 : 2460 : new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10653 : : &stmts, true, NULL_TREE);
10654 : : }
10655 : :
10656 : 5106 : *exit_gsi = gsi_after_labels (exit_bb);
10657 : 5106 : if (stmts)
10658 : 5106 : gsi_insert_seq_before (exit_gsi, stmts, GSI_SAME_STMT);
10659 : :
10660 : 5106 : return new_tree;
10661 : : }
10662 : :
10663 : : /* Function vectorizable_live_operation.
10664 : :
10665 : : STMT_INFO computes a value that is used outside the loop. Check if
10666 : : it can be supported. */
10667 : :
10668 : : bool
10669 : 298170 : vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
10670 : : slp_tree slp_node, slp_instance slp_node_instance,
10671 : : int slp_index, bool vec_stmt_p,
10672 : : stmt_vector_for_cost *cost_vec)
10673 : : {
10674 : 298170 : loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
10675 : 298170 : imm_use_iterator imm_iter;
10676 : 298170 : tree lhs, lhs_type, bitsize;
10677 : 298170 : tree vectype = (slp_node
10678 : 298170 : ? SLP_TREE_VECTYPE (slp_node)
10679 : : : STMT_VINFO_VECTYPE (stmt_info));
10680 : 298170 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10681 : 298170 : int ncopies;
10682 : 298170 : gimple *use_stmt;
10683 : 298170 : use_operand_p use_p;
10684 : 298170 : auto_vec<tree> vec_oprnds;
10685 : 298170 : int vec_entry = 0;
10686 : 298170 : poly_uint64 vec_index = 0;
10687 : :
10688 : 298170 : gcc_assert (STMT_VINFO_LIVE_P (stmt_info)
10689 : : || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10690 : :
10691 : : /* If a stmt of a reduction is live, vectorize it via
10692 : : vect_create_epilog_for_reduction. vectorizable_reduction assessed
10693 : : validity so just trigger the transform here. */
10694 : 300011 : if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
10695 : : {
10696 : 58143 : if (!vec_stmt_p)
10697 : : return true;
10698 : : /* For SLP reductions we vectorize the epilogue for all involved stmts
10699 : : together. */
10700 : 22289 : if (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info) && slp_index != 0)
10701 : : return true;
10702 : 21991 : stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
10703 : 21991 : gcc_assert (reduc_info->is_reduc_info);
10704 : 21991 : if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
10705 : 21991 : || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
10706 : : return true;
10707 : :
10708 : 21157 : if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
10709 : 21157 : || !LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
10710 : 21153 : vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
10711 : : slp_node_instance,
10712 : : LOOP_VINFO_IV_EXIT (loop_vinfo));
10713 : :
10714 : : /* For an early break we only have to materialize the reduction on the merge
10715 : : block, but we have to find an alternate exit first. */
10716 : 21157 : if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
10717 : : {
10718 : 23 : slp_tree phis_node = slp_node ? slp_node_instance->reduc_phis : NULL;
10719 : 69 : for (auto exit : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
10720 : 23 : if (exit != LOOP_VINFO_IV_EXIT (loop_vinfo))
10721 : : {
10722 : 23 : vect_create_epilog_for_reduction (loop_vinfo, reduc_info,
10723 : : phis_node, slp_node_instance,
10724 : : exit);
10725 : 23 : break;
10726 : 23 : }
10727 : 23 : if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
10728 : 4 : vect_create_epilog_for_reduction (loop_vinfo, reduc_info,
10729 : : phis_node, slp_node_instance,
10730 : : LOOP_VINFO_IV_EXIT (loop_vinfo));
10731 : : }
10732 : :
10733 : 21157 : return true;
10734 : : }
10735 : :
10736 : : /* If STMT is not relevant and it is a simple assignment and its inputs are
10737 : : invariant then it can remain in place, unvectorized. The original last
10738 : : scalar value that it computes will be used. */
10739 : 240027 : if (!STMT_VINFO_RELEVANT_P (stmt_info))
10740 : : {
10741 : 0 : gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
10742 : 0 : if (dump_enabled_p ())
10743 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
10744 : : "statement is simple and uses invariant. Leaving in "
10745 : : "place.\n");
10746 : 0 : return true;
10747 : : }
10748 : :
10749 : 240027 : if (slp_node)
10750 : : ncopies = 1;
10751 : : else
10752 : 0 : ncopies = vect_get_num_copies (loop_vinfo, vectype);
10753 : :
10754 : 0 : if (slp_node)
10755 : : {
10756 : 240027 : gcc_assert (slp_index >= 0);
10757 : :
10758 : : /* Get the last occurrence of the scalar index from the concatenation of
10759 : : all the slp vectors. Calculate which slp vector it is and the index
10760 : : within. */
10761 : 240027 : int num_scalar = SLP_TREE_LANES (slp_node);
10762 : 240027 : int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10763 : 240027 : poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
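: : /* E.g. with two vectors of four lanes and two scalar lanes, the last
: : occurrence of lane 1 is at position 2*4 - 2 + 1 == 7, i.e. vector 1,
: : lane 3. */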
10764 : :
10765 : : /* Calculate which vector contains the result, and which lane of
10766 : : that vector we need. */
10767 : 240027 : if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
10768 : : {
10769 : : if (dump_enabled_p ())
10770 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10771 : : "Cannot determine which vector holds the"
10772 : : " final result.\n");
10773 : : return false;
10774 : : }
10775 : : }
10776 : :
10777 : 240027 : if (!vec_stmt_p)
10778 : : {
10779 : : /* No transformation required. */
10780 : 196862 : if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
10781 : : {
10782 : 1 : if (slp_node && SLP_TREE_LANES (slp_node) != 1)
10783 : : {
10784 : 0 : if (dump_enabled_p ())
10785 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10786 : : "can't operate on partial vectors "
10787 : : "because an SLP statement is live after "
10788 : : "the loop.\n");
10789 : 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10790 : : }
10791 : 1 : else if (ncopies > 1
10792 : 1 : || (slp_node && SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) > 1))
10793 : : {
10794 : 0 : if (dump_enabled_p ())
10795 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10796 : : "can't operate on partial vectors "
10797 : : "because ncopies is greater than 1.\n");
10798 : 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10799 : : }
10800 : : else
10801 : : {
10802 : 1 : gcc_assert (ncopies == 1
10803 : : && (!slp_node || SLP_TREE_LANES (slp_node) == 1));
10804 : 1 : if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
10805 : : OPTIMIZE_FOR_SPEED))
10806 : 0 : vect_record_loop_mask (loop_vinfo,
10807 : : &LOOP_VINFO_MASKS (loop_vinfo),
10808 : : 1, vectype, NULL);
10809 : 1 : else if (can_vec_extract_var_idx_p (
10810 : 1 : TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
10811 : 0 : vect_record_loop_len (loop_vinfo,
10812 : : &LOOP_VINFO_LENS (loop_vinfo),
10813 : : 1, vectype, 1);
10814 : : else
10815 : : {
10816 : 1 : if (dump_enabled_p ())
10817 : 0 : dump_printf_loc (
10818 : 0 : MSG_MISSED_OPTIMIZATION, vect_location,
10819 : : "can't operate on partial vectors "
10820 : : "because the target doesn't support extract "
10821 : : "last reduction.\n");
10822 : 1 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10823 : : }
10824 : : }
10825 : : }
10826 : : /* ??? Enable for loop costing as well. */
10827 : 1 : if (!loop_vinfo)
10828 : 99762 : record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
10829 : : 0, vect_epilogue);
10830 : 196862 : return true;
10831 : : }
10832 : :
10833 : : /* Use the lhs of the original scalar statement. */
10834 : 43165 : gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
10835 : 43165 : if (dump_enabled_p ())
10836 : 1436 : dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
10837 : : "stmt %G", stmt);
10838 : :
10839 : 43165 : lhs = gimple_get_lhs (stmt);
10840 : 43165 : lhs_type = TREE_TYPE (lhs);
10841 : :
10842 : 43165 : bitsize = vector_element_bits_tree (vectype);
10843 : :
10844 : : /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
10845 : 43165 : tree vec_lhs, vec_lhs0, bitstart;
10846 : 43165 : gimple *vec_stmt, *vec_stmt0;
10847 : 43165 : if (slp_node)
10848 : : {
10849 : 43165 : gcc_assert (!loop_vinfo
10850 : : || ((!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
10851 : : && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10852 : : || SLP_TREE_LANES (slp_node) == 1));
10853 : :
10854 : : /* Get the correct slp vectorized stmt. */
10855 : 43165 : vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
10856 : 43165 : vec_stmt = SSA_NAME_DEF_STMT (vec_lhs);
10857 : :
10858 : :       /* In case we need to vectorize an early break, also get the first stmt.  */
10859 : 43165 : vec_lhs0 = SLP_TREE_VEC_DEFS (slp_node)[0];
10860 : 43165 : vec_stmt0 = SSA_NAME_DEF_STMT (vec_lhs0);
10861 : :
10862 : : /* Get entry to use. */
10863 : 43165 : bitstart = bitsize_int (vec_index);
10864 : 43165 : bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
10865 : : }
10866 : : else
10867 : : {
10868 : : /* For multiple copies, get the last copy. */
10869 : 0 : vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
10870 : 0 : vec_lhs = gimple_get_lhs (vec_stmt);
10871 : :
10872 : :       /* In case we need to vectorize an early break, also get the first stmt.  */
10873 : 0 : vec_stmt0 = STMT_VINFO_VEC_STMTS (stmt_info)[0];
10874 : 0 : vec_lhs0 = gimple_get_lhs (vec_stmt0);
10875 : :
10876 : : /* Get the last lane in the vector. */
10877 : 0 : bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
10878 : : }
10879 : :
10880 : 43165 : if (loop_vinfo)
10881 : : {
10882 : :       /* Ensure the VEC_LHS for lane-extraction stmts satisfies the loop-closed
10883 : :          PHI requirement by inserting one PHI node for it.  It looks like:
10884 : : loop;
10885 : : BB:
10886 : : # lhs' = PHI <lhs>
10887 : : ==>
10888 : : loop;
10889 : : BB:
10890 : : # vec_lhs' = PHI <vec_lhs>
10891 : : new_tree = lane_extract <vec_lhs', ...>;
10892 : : lhs' = new_tree; */
10893 : :
10894 : 5155 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10895 : :       /* Check if we have a loop where the chosen exit is not the main exit;
10896 : :          in these cases, for an early break we restart the iteration that the
10897 : :          vector code was executing.  For the live values we want the value at
10898 : :          the start of the iteration rather than at the end.  */
10899 : 5155 : edge main_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
10900 : 5155 : bool all_exits_as_early_p = LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo);
10901 : 22016 : FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10902 : 16861 : if (!is_gimple_debug (use_stmt)
10903 : 16861 : && !flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
10904 : 5106 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10905 : : {
10906 : 5106 : edge e = gimple_phi_arg_edge (as_a <gphi *> (use_stmt),
10907 : 5106 : phi_arg_index_from_use (use_p));
10908 : 5106 : gcc_assert (loop_exit_edge_p (loop, e));
10909 : 5106 : bool main_exit_edge = e == main_e;
10910 : 5106 : tree tmp_vec_lhs = vec_lhs;
10911 : 5106 : tree tmp_bitstart = bitstart;
10912 : :
10913 : :           /* For an early exit where the exit is not in the BB that leads
10914 : :              to the latch, we are restarting the iteration in the
10915 : :              scalar loop.  So get the first live value.  */
10916 : 12832 : bool early_break_first_element_p
10917 : 5106 : = (all_exits_as_early_p || !main_exit_edge)
10918 : 5106 : && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def;
10919 : 2620 : if (early_break_first_element_p)
10920 : : {
10921 : 2620 : tmp_vec_lhs = vec_lhs0;
10922 : 2620 : tmp_bitstart = build_zero_cst (TREE_TYPE (bitstart));
10923 : : }
10924 : :
10925 : 5106 : gimple_stmt_iterator exit_gsi;
10926 : 5106 : tree new_tree
10927 : 5106 : = vectorizable_live_operation_1 (loop_vinfo, stmt_info,
10928 : : e->dest, vectype, ncopies,
10929 : : slp_node, bitsize,
10930 : : tmp_bitstart, tmp_vec_lhs,
10931 : : lhs_type, &exit_gsi);
10932 : :
10933 : 5106 : auto gsi = gsi_for_stmt (use_stmt);
10934 : 5106 : if (early_break_first_element_p
10935 : 2620 : && LOOP_VINFO_MASK_NITERS_PFA_OFFSET (loop_vinfo))
10936 : : {
10937 : 0 : tree step_expr
10938 : : = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
10939 : 0 : tree break_lhs_phi
10940 : : = LOOP_VINFO_MASK_NITERS_PFA_OFFSET (loop_vinfo);
10941 : 0 : tree ty_skip_niters = TREE_TYPE (break_lhs_phi);
10942 : 0 : gimple_seq iv_stmts = NULL;
10943 : :
10944 : : /* Now create the PHI for the outside loop usage to
10945 : : retrieve the value for the offset counter. */
10946 : 0 : tree rphi_step
10947 : 0 : = gimple_convert (&iv_stmts, ty_skip_niters, step_expr);
10948 : 0 : tree tmp2
10949 : 0 : = gimple_build (&iv_stmts, MULT_EXPR,
10950 : : ty_skip_niters, rphi_step,
10951 : : break_lhs_phi);
10952 : :
10953 : 0 : if (POINTER_TYPE_P (TREE_TYPE (new_tree)))
10954 : : {
10955 : 0 : tmp2 = gimple_convert (&iv_stmts, sizetype, tmp2);
10956 : 0 : tmp2 = gimple_build (&iv_stmts, POINTER_PLUS_EXPR,
10957 : 0 : TREE_TYPE (new_tree), new_tree,
10958 : : tmp2);
10959 : : }
10960 : : else
10961 : : {
10962 : 0 : tmp2 = gimple_convert (&iv_stmts, TREE_TYPE (new_tree),
10963 : : tmp2);
10964 : 0 : tmp2 = gimple_build (&iv_stmts, PLUS_EXPR,
10965 : 0 : TREE_TYPE (new_tree), new_tree,
10966 : : tmp2);
10967 : : }
10968 : :
10969 : 0 : new_tree = tmp2;
10970 : 0 : gsi_insert_seq_before (&exit_gsi, iv_stmts, GSI_SAME_STMT);
10971 : : }
10972 : :
10973 : 5106 : tree lhs_phi = gimple_phi_result (use_stmt);
10974 : 5106 : remove_phi_node (&gsi, false);
10975 : 5106 : gimple *copy = gimple_build_assign (lhs_phi, new_tree);
10976 : 5106 : gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
10977 : 5106 : break;
10978 : 5155 : }
10979 : :
10980 : :       /* There are no further out-of-loop uses of lhs by LC-SSA construction.  */
10981 : 16910 : FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10982 : 11755 : gcc_assert (is_gimple_debug (use_stmt)
10983 : 5155 : || flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)));
10984 : : }
10985 : : else
10986 : : {
10987 : : /* For basic-block vectorization simply insert the lane-extraction. */
10988 : 38010 : tree bftype = TREE_TYPE (vectype);
10989 : 38010 : if (VECTOR_BOOLEAN_TYPE_P (vectype))
10990 : 0 : bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10991 : 38010 : tree new_tree = build3 (BIT_FIELD_REF, bftype,
10992 : : vec_lhs, bitsize, bitstart);
10993 : 38010 : gimple_seq stmts = NULL;
10994 : 38010 : new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10995 : : &stmts, true, NULL_TREE);
10996 : 38010 : if (TREE_CODE (new_tree) == SSA_NAME
10997 : 76020 : && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
10998 : 2 : SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
10999 : 38010 : if (is_a <gphi *> (vec_stmt))
11000 : : {
11001 : 2776 : gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
11002 : 2776 : gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
11003 : : }
11004 : : else
11005 : : {
11006 : 35234 : gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
11007 : 35234 : gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
11008 : : }
11009 : :
11010 : :       /* Replace uses of lhs with the newly computed result.  If the use stmt is
11011 : :          a single-arg PHI, just replace all uses of the PHI result.  This is needed
11012 : :          because the lcssa PHI defining lhs may be before the newly inserted stmt.  */
11013 : 38010 : use_operand_p use_p;
11014 : 38010 : stmt_vec_info use_stmt_info;
11015 : 200834 : FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
11016 : 162824 : if (!is_gimple_debug (use_stmt)
11017 : 162824 : && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
11018 : 111015 : || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
11019 : : {
11020 : : /* ??? This can happen when the live lane ends up being
11021 : : rooted in a vector construction code-generated by an
11022 : : external SLP node (and code-generation for that already
11023 : : happened). See gcc.dg/vect/bb-slp-47.c.
11024 : : Doing this is what would happen if that vector CTOR
11025 : : were not code-generated yet so it is not too bad.
11026 : : ??? In fact we'd likely want to avoid this situation
11027 : : in the first place. */
11028 : 64478 : if (TREE_CODE (new_tree) == SSA_NAME
11029 : 64211 : && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
11030 : 64211 : && gimple_code (use_stmt) != GIMPLE_PHI
11031 : 120505 : && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
11032 : : use_stmt))
11033 : : {
11034 : 267 : if (dump_enabled_p ())
11035 : 20 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11036 : : "Using original scalar computation for "
11037 : :                                  "live lane because use precedes vector "
11038 : : "def\n");
11039 : 267 : continue;
11040 : : }
11041 : : /* ??? It can also happen that we end up pulling a def into
11042 : : a loop where replacing out-of-loop uses would require
11043 : : a new LC SSA PHI node. Retain the original scalar in
11044 : : those cases as well. PR98064. */
11045 : 65537 : if (TREE_CODE (new_tree) == SSA_NAME
11046 : 63944 : && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
11047 : 63944 : && (gimple_bb (use_stmt)->loop_father
11048 : 63944 : != gimple_bb (vec_stmt)->loop_father)
11049 : 71576 : && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
11050 : 7632 : gimple_bb (use_stmt)->loop_father))
11051 : : {
11052 : 1593 : if (dump_enabled_p ())
11053 : 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11054 : : "Using original scalar computation for "
11055 : : "live lane because there is an out-of-loop "
11056 : : "definition for it\n");
11057 : 1593 : continue;
11058 : : }
11059 : 189005 : FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
11060 : 63327 : SET_USE (use_p, new_tree);
11061 : 62351 : update_stmt (use_stmt);
11062 : 38010 : }
11063 : : }
11064 : :
11065 : : return true;
11066 : 298170 : }
11067 : :
11068 : : /* Given loop represented by LOOP_VINFO, return true if computation of
11069 : : LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
11070 : : otherwise. */
11071 : :
11072 : : static bool
11073 : 56760 : loop_niters_no_overflow (loop_vec_info loop_vinfo)
11074 : : {
11075 : : /* Constant case. */
11076 : 56760 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
11077 : : {
11078 : 32889 : tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
11079 : 32889 : tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
11080 : :
11081 : 32889 : gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
11082 : 32889 : gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
11083 : 32889 : if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
11084 : : return true;
11085 : : }
11086 : :
11087 : 23871 : widest_int max;
11088 : 23871 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11089 : : /* Check the upper bound of loop niters. */
11090 : 23871 : if (get_max_loop_iterations (loop, &max))
11091 : : {
11092 : 23871 : tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
11093 : 23871 : signop sgn = TYPE_SIGN (type);
11094 : 23871 : widest_int type_max = widest_int::from (wi::max_value (type), sgn);
11095 : 23871 : if (max < type_max)
11096 : 23683 : return true;
11097 : 23871 : }
11098 : : return false;
11099 : 23871 : }
11100 : :
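      : : /* Editorial illustration (a hypothetical loop, not taken from the code
      : :    above) of the overflow this function guards against:
      : :
      : :      uint32_t i = 0;
      : :      do { ...; } while (i++ != UINT32_MAX);
      : :
      : :    has NITERSM1 == UINT32_MAX in a 32-bit type, so NITERS = NITERSM1 + 1
      : :    wraps to zero; both checks above then fail and the function returns
      : :    false, as intended.  */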
11101 : : /* Return a mask type with half the number of elements as OLD_TYPE,
11102 : : given that it should have mode NEW_MODE. */
11103 : :
11104 : : tree
11105 : 4250 : vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
11106 : : {
11107 : 4250 : poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
11108 : 4250 : return build_truth_vector_type_for_mode (nunits, new_mode);
11109 : : }
11110 : :
11111 : : /* Return a mask type with twice as many elements as OLD_TYPE,
11112 : : given that it should have mode NEW_MODE. */
11113 : :
11114 : : tree
11115 : 2029 : vect_double_mask_nunits (tree old_type, machine_mode new_mode)
11116 : : {
11117 : 2029 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
11118 : 2029 : return build_truth_vector_type_for_mode (nunits, new_mode);
11119 : : }
11120 : :
11121 : : /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
11122 : : contain a sequence of NVECTORS masks that each control a vector of type
11123 : : VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
11124 : : these vector masks with the vector version of SCALAR_MASK. */
11125 : :
11126 : : void
11127 : 124 : vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
11128 : : unsigned int nvectors, tree vectype, tree scalar_mask)
11129 : : {
11130 : 124 : gcc_assert (nvectors != 0);
11131 : :
11132 : 124 : if (scalar_mask)
11133 : : {
11134 : 22 : scalar_cond_masked_key cond (scalar_mask, nvectors);
11135 : 22 : loop_vinfo->scalar_cond_masked_set.add (cond);
11136 : : }
11137 : :
11138 : 124 : masks->mask_set.add (std::make_pair (vectype, nvectors));
11139 : 124 : }
11140 : :
11141 : : /* Given a complete set of masks MASKS, extract mask number INDEX
11142 : : for an rgroup that operates on NVECTORS vectors of type VECTYPE,
11143 : : where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
11144 : :
11145 : : See the comment above vec_loop_masks for more details about the mask
11146 : : arrangement. */
11147 : :
11148 : : tree
11149 : 89 : vect_get_loop_mask (loop_vec_info loop_vinfo,
11150 : : gimple_stmt_iterator *gsi, vec_loop_masks *masks,
11151 : : unsigned int nvectors, tree vectype, unsigned int index)
11152 : : {
11153 : 89 : if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
11154 : : == vect_partial_vectors_while_ult)
11155 : : {
11156 : 0 : rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
11157 : 0 : tree mask_type = rgm->type;
11158 : :
11159 : : /* Populate the rgroup's mask array, if this is the first time we've
11160 : : used it. */
11161 : 0 : if (rgm->controls.is_empty ())
11162 : : {
11163 : 0 : rgm->controls.safe_grow_cleared (nvectors, true);
11164 : 0 : for (unsigned int i = 0; i < nvectors; ++i)
11165 : : {
11166 : 0 : tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
11167 : : /* Provide a dummy definition until the real one is available. */
11168 : 0 : SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
11169 : 0 : rgm->controls[i] = mask;
11170 : : }
11171 : : }
11172 : :
11173 : 0 : tree mask = rgm->controls[index];
11174 : 0 : if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
11175 : 0 : TYPE_VECTOR_SUBPARTS (vectype)))
11176 : : {
11177 : : /* A loop mask for data type X can be reused for data type Y
11178 : : if X has N times more elements than Y and if Y's elements
11179 : : are N times bigger than X's. In this case each sequence
11180 : : of N elements in the loop mask will be all-zero or all-one.
11181 : : We can then view-convert the mask so that each sequence of
11182 : : N elements is replaced by a single element. */
11183 : 0 : gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
11184 : : TYPE_VECTOR_SUBPARTS (vectype)));
11185 : 0 : gimple_seq seq = NULL;
11186 : 0 : mask_type = truth_type_for (vectype);
11187 : 0 : mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
11188 : 0 : if (seq)
11189 : 0 : gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11190 : : }
11191 : 0 : return mask;
11192 : : }
11193 : 89 : else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
11194 : : == vect_partial_vectors_avx512)
11195 : : {
11196 : : /* The number of scalars per iteration and the number of vectors are
11197 : : both compile-time constants. */
11198 : 89 : unsigned int nscalars_per_iter
11199 : 89 : = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11200 : 89 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
11201 : :
11202 : 89 : rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];
11203 : :
11204 : : /* The stored nV is dependent on the mask type produced. */
11205 : 89 : gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11206 : : TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
11207 : : == rgm->factor);
11208 : 89 : nvectors = rgm->factor;
11209 : :
11210 : : /* Populate the rgroup's mask array, if this is the first time we've
11211 : : used it. */
11212 : 89 : if (rgm->controls.is_empty ())
11213 : : {
11214 : 10 : rgm->controls.safe_grow_cleared (nvectors, true);
11215 : 21 : for (unsigned int i = 0; i < nvectors; ++i)
11216 : : {
11217 : 11 : tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
11218 : : /* Provide a dummy definition until the real one is available. */
11219 : 11 : SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
11220 : 11 : rgm->controls[i] = mask;
11221 : : }
11222 : : }
11223 : 89 : if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
11224 : : TYPE_VECTOR_SUBPARTS (vectype)))
11225 : 73 : return rgm->controls[index];
11226 : :
11227 : : /* Split the vector if needed. Since we are dealing with integer mode
11228 : : masks with AVX512 we can operate on the integer representation
11229 : : performing the whole vector shifting. */
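      : :       /* Editorial illustration (hedged example values): with an rgroup mask
      : :          type of 16 elements and a VECTYPE of 8 elements, factor is 2; index 3
      : :          then selects control vector vi = 1 and subpart vpart = 1, so the
      : :          16-bit integer view of that mask is shifted right by 8 and its low
      : :          bits form the mask for this vector.  */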
11230 : 16 : unsigned HOST_WIDE_INT factor;
11231 : 16 : bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
11232 : 16 : TYPE_VECTOR_SUBPARTS (vectype), &factor);
11233 : 0 : gcc_assert (ok);
11234 : 16 : gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
11235 : 16 : tree mask_type = truth_type_for (vectype);
11236 : 16 : gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
11237 : 16 : unsigned vi = index / factor;
11238 : 16 : unsigned vpart = index % factor;
11239 : 16 : tree vec = rgm->controls[vi];
11240 : 16 : gimple_seq seq = NULL;
11241 : 16 : vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
11242 : 16 : lang_hooks.types.type_for_mode
11243 : 16 : (TYPE_MODE (rgm->type), 1), vec);
11244 : : /* For integer mode masks simply shift the right bits into position. */
11245 : 16 : if (vpart != 0)
11246 : 12 : vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
11247 : : build_int_cst (integer_type_node,
11248 : 24 : (TYPE_VECTOR_SUBPARTS (vectype)
11249 : 12 : * vpart)));
11250 : 16 : vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
11251 : 16 : (TYPE_MODE (mask_type), 1), vec);
11252 : 16 : vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
11253 : 16 : if (seq)
11254 : 16 : gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11255 : 16 : return vec;
11256 : : }
11257 : : else
11258 : 0 : gcc_unreachable ();
11259 : : }
11260 : :
11261 : : /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
11262 : : lengths for controlling an operation on VECTYPE. The operation splits
11263 : : each element of VECTYPE into FACTOR separate subelements, measuring the
11264 : : length as a number of these subelements. */
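      : : /* Editorial illustration (a hedged example, not taken from the code below):
      : :    if a V4SI access is implemented as a byte-wise (VnQI) access, FACTOR
      : :    would be 4 and the recorded length would count bytes rather than SImode
      : :    elements.  */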
11265 : :
11266 : : void
11267 : 0 : vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
11268 : : unsigned int nvectors, tree vectype, unsigned int factor)
11269 : : {
11270 : 0 : gcc_assert (nvectors != 0);
11271 : 0 : if (lens->length () < nvectors)
11272 : 0 : lens->safe_grow_cleared (nvectors, true);
11273 : 0 : rgroup_controls *rgl = &(*lens)[nvectors - 1];
11274 : :
11275 : :   /* The number of scalars per iteration and the number of vectors
11276 : :      are both compile-time constants.  */
11277 : 0 : unsigned int nscalars_per_iter
11278 : 0 : = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11279 : 0 : LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
11280 : :
11281 : 0 : if (rgl->max_nscalars_per_iter < nscalars_per_iter)
11282 : : {
11283 : : /* For now, we only support cases in which all loads and stores fall back
11284 : : to VnQI or none do. */
11285 : 0 : gcc_assert (!rgl->max_nscalars_per_iter
11286 : : || (rgl->factor == 1 && factor == 1)
11287 : : || (rgl->max_nscalars_per_iter * rgl->factor
11288 : : == nscalars_per_iter * factor));
11289 : 0 : rgl->max_nscalars_per_iter = nscalars_per_iter;
11290 : 0 : rgl->type = vectype;
11291 : 0 : rgl->factor = factor;
11292 : : }
11293 : 0 : }
11294 : :
11295 : : /* Given a complete set of lengths LENS, extract length number INDEX
11296 : : for an rgroup that operates on NVECTORS vectors of type VECTYPE,
11297 : : where 0 <= INDEX < NVECTORS. Return a value that contains FACTOR
11298 : :    multiplied by the number of elements that should be processed.
11299 : : Insert any set-up statements before GSI. */
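      : : /* Editorial illustration (hedged example values): if the rgroup control was
      : :    recorded for a 16-element type but VECTYPE has 4 elements (both with
      : :    FACTOR 1), the code below divides the stored length by 4 so the result
      : :    counts VECTYPE elements.  */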
11300 : :
11301 : : tree
11302 : 0 : vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
11303 : : vec_loop_lens *lens, unsigned int nvectors, tree vectype,
11304 : : unsigned int index, unsigned int factor)
11305 : : {
11306 : 0 : rgroup_controls *rgl = &(*lens)[nvectors - 1];
11307 : 0 : bool use_bias_adjusted_len =
11308 : 0 : LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
11309 : :
11310 : : /* Populate the rgroup's len array, if this is the first time we've
11311 : : used it. */
11312 : 0 : if (rgl->controls.is_empty ())
11313 : : {
11314 : 0 : rgl->controls.safe_grow_cleared (nvectors, true);
11315 : 0 : for (unsigned int i = 0; i < nvectors; ++i)
11316 : : {
11317 : 0 : tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
11318 : 0 : gcc_assert (len_type != NULL_TREE);
11319 : :
11320 : 0 : tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
11321 : :
11322 : : /* Provide a dummy definition until the real one is available. */
11323 : 0 : SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
11324 : 0 : rgl->controls[i] = len;
11325 : :
11326 : 0 : if (use_bias_adjusted_len)
11327 : : {
11328 : 0 : gcc_assert (i == 0);
11329 : 0 : tree adjusted_len =
11330 : 0 : make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
11331 : 0 : SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
11332 : 0 : rgl->bias_adjusted_ctrl = adjusted_len;
11333 : : }
11334 : : }
11335 : : }
11336 : :
11337 : 0 : if (use_bias_adjusted_len)
11338 : 0 : return rgl->bias_adjusted_ctrl;
11339 : :
11340 : 0 : tree loop_len = rgl->controls[index];
11341 : 0 : if (rgl->factor == 1 && factor == 1)
11342 : : {
11343 : 0 : poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
11344 : 0 : poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
11345 : 0 : if (maybe_ne (nunits1, nunits2))
11346 : : {
11347 : : /* A loop len for data type X can be reused for data type Y
11348 : : if X has N times more elements than Y and if Y's elements
11349 : : are N times bigger than X's. */
11350 : 0 : gcc_assert (multiple_p (nunits1, nunits2));
11351 : 0 : factor = exact_div (nunits1, nunits2).to_constant ();
11352 : 0 : tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
11353 : 0 : gimple_seq seq = NULL;
11354 : 0 : loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
11355 : 0 : build_int_cst (iv_type, factor));
11356 : 0 : if (seq)
11357 : 0 : gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11358 : : }
11359 : : }
11360 : : return loop_len;
11361 : : }
11362 : :
11363 : : /* Generate the tree for the loop len mask and return it.  Given LENS,
11364 : :    NVECTORS, VECTYPE, INDEX and FACTOR, generate the len mask as below:
11365 : :
11366 : : tree len_mask = VCOND_MASK_LEN (compare_mask, ones, zero, len, bias)
11367 : : */
11368 : : tree
11369 : 0 : vect_gen_loop_len_mask (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
11370 : : gimple_stmt_iterator *cond_gsi, vec_loop_lens *lens,
11371 : : unsigned int nvectors, tree vectype, tree stmt,
11372 : : unsigned int index, unsigned int factor)
11373 : : {
11374 : 0 : tree all_one_mask = build_all_ones_cst (vectype);
11375 : 0 : tree all_zero_mask = build_zero_cst (vectype);
11376 : 0 : tree len = vect_get_loop_len (loop_vinfo, gsi, lens, nvectors, vectype, index,
11377 : : factor);
11378 : 0 : tree bias = build_int_cst (intQI_type_node,
11379 : 0 : LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo));
11380 : 0 : tree len_mask = make_temp_ssa_name (TREE_TYPE (stmt), NULL, "vec_len_mask");
11381 : 0 : gcall *call = gimple_build_call_internal (IFN_VCOND_MASK_LEN, 5, stmt,
11382 : : all_one_mask, all_zero_mask, len,
11383 : : bias);
11384 : 0 : gimple_call_set_lhs (call, len_mask);
11385 : 0 : gsi_insert_before (cond_gsi, call, GSI_SAME_STMT);
11386 : :
11387 : 0 : return len_mask;
11388 : : }
11389 : :
11390 : : /* Scale profiling counters by estimation for LOOP which is vectorized
11391 : : by factor VF.
11392 : : If FLAT is true, the loop we started with had unrealistically flat
11393 : : profile. */
11394 : :
11395 : : static void
11396 : 56760 : scale_profile_for_vect_loop (class loop *loop, edge exit_e, unsigned vf, bool flat)
11397 : : {
11398 : : /* For flat profiles do not scale down proportionally by VF and only
11399 : : cap by known iteration count bounds. */
11400 : 56760 : if (flat)
11401 : : {
11402 : 32214 : if (dump_file && (dump_flags & TDF_DETAILS))
11403 : 4776 : fprintf (dump_file,
11404 : : "Vectorized loop profile seems flat; not scaling iteration "
11405 : : "count down by the vectorization factor %i\n", vf);
11406 : 32214 : scale_loop_profile (loop, profile_probability::always (),
11407 : : get_likely_max_loop_iterations_int (loop));
11408 : 32214 : return;
11409 : : }
11410 : :   /* The loop body executes VF times fewer iterations and the exit probability increases VF times.  */
11411 : 24546 : profile_count entry_count = loop_preheader_edge (loop)->count ();
11412 : :
11413 : :   /* If we have an unreliable loop profile, avoid dropping the entry
11414 : :      count below the header count.  This can happen since the loop
11415 : :      has an unrealistically low trip count.  */
11416 : 24546 : while (vf > 1
11417 : 25693 : && loop->header->count > entry_count
11418 : 52236 : && loop->header->count < entry_count * vf)
11419 : : {
11420 : 1997 : if (dump_file && (dump_flags & TDF_DETAILS))
11421 : 152 : fprintf (dump_file,
11422 : : "Vectorization factor %i seems too large for profile "
11423 : :                  "previously believed to be consistent; reducing.\n", vf);
11424 : 1997 : vf /= 2;
11425 : : }
11426 : :
11427 : 24546 : if (entry_count.nonzero_p ())
11428 : 24546 : set_edge_probability_and_rescale_others
11429 : 24546 : (exit_e,
11430 : 24546 : entry_count.probability_in (loop->header->count / vf));
11431 : : /* Avoid producing very large exit probability when we do not have
11432 : : sensible profile. */
11433 : 0 : else if (exit_e->probability < profile_probability::always () / (vf * 2))
11434 : 0 : set_edge_probability_and_rescale_others (exit_e, exit_e->probability * vf);
11435 : 24546 : loop->latch->count = single_pred_edge (loop->latch)->count ();
11436 : :
11437 : 24546 : scale_loop_profile (loop, profile_probability::always () / vf,
11438 : : get_likely_max_loop_iterations_int (loop));
11439 : : }
11440 : :
11441 : : /* Helper function to pass to simplify_replace_tree to enable replacing trees
11442 : :    in the hash_map with their corresponding values.  */
11443 : :
11444 : : static tree
11445 : 7423 : find_in_mapping (tree t, void *context)
11446 : : {
11447 : 7423 : hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
11448 : :
11449 : 7423 : tree *value = mapping->get (t);
11450 : 7423 : return value ? *value : t;
11451 : : }
11452 : :
11453 : : /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
11454 : : original loop that has now been vectorized.
11455 : :
11456 : : The inits of the data_references need to be advanced with the number of
11457 : : iterations of the main loop. This has been computed in vect_do_peeling and
11458 : : is stored in parameter ADVANCE. We first restore the data_references
11459 : :    initial offset with the values recorded in ORIG_DRS_INIT.
11460 : :
11461 : : Since the loop_vec_info of this EPILOGUE was constructed for the original
11462 : : loop, its stmt_vec_infos all point to the original statements. These need
11463 : : to be updated to point to their corresponding copies as well as the SSA_NAMES
11464 : : in their PATTERN_DEF_SEQs and RELATED_STMTs.
11465 : :
11466 : : The data_reference's connections also need to be updated. Their
11467 : : corresponding dr_vec_info need to be reconnected to the EPILOGUE's
11468 : : stmt_vec_infos, their statements need to point to their corresponding copy,
11469 : : if they are gather loads or scatter stores then their reference needs to be
11470 : : updated to point to its corresponding copy. */
11471 : :
11472 : : static void
11473 : 6509 : update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
11474 : : {
11475 : 6509 : loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
11476 : 6509 : auto_vec<gimple *> stmt_worklist;
11477 : 6509 : hash_map<tree,tree> mapping;
11478 : 6509 : gimple *orig_stmt, *new_stmt;
11479 : 6509 : gimple_stmt_iterator epilogue_gsi;
11480 : 6509 : gphi_iterator epilogue_phi_gsi;
11481 : 6509 : stmt_vec_info stmt_vinfo = NULL, related_vinfo;
11482 : 6509 : basic_block *epilogue_bbs = get_loop_body (epilogue);
11483 : 6509 : unsigned i;
11484 : :
11485 : 6509 : free (LOOP_VINFO_BBS (epilogue_vinfo));
11486 : 6509 : LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
11487 : 6509 : LOOP_VINFO_NBBS (epilogue_vinfo) = epilogue->num_nodes;
11488 : :
11489 : : /* The EPILOGUE loop is a copy of the original loop so they share the same
11490 : : gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
11491 : : point to the copied statements. We also create a mapping of all LHS' in
11492 : : the original loop and all the LHS' in the EPILOGUE and create worklists to
11493 : : update teh STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
11494 : 19527 : for (unsigned i = 0; i < epilogue->num_nodes; ++i)
11495 : : {
11496 : 13018 : for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
11497 : 33855 : !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
11498 : : {
11499 : 20837 : new_stmt = epilogue_phi_gsi.phi ();
11500 : :
11501 : 20837 : gcc_assert (gimple_uid (new_stmt) > 0);
11502 : 20837 : stmt_vinfo
11503 : 20837 : = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11504 : :
11505 : 20837 : orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11506 : 20837 : STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11507 : :
11508 : 41674 : mapping.put (gimple_phi_result (orig_stmt),
11509 : 20837 : gimple_phi_result (new_stmt));
11510 : : /* PHI nodes can not have patterns or related statements. */
11511 : 20837 : gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
11512 : : && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
11513 : : }
11514 : :
11515 : 26036 : for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
11516 : 132064 : !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
11517 : : {
11518 : 119046 : new_stmt = gsi_stmt (epilogue_gsi);
11519 : 119046 : if (is_gimple_debug (new_stmt))
11520 : 21632 : continue;
11521 : :
11522 : 97414 : gcc_assert (gimple_uid (new_stmt) > 0);
11523 : 97414 : stmt_vinfo
11524 : 97414 : = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11525 : :
11526 : 97414 : orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11527 : 97414 : STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11528 : :
11529 : 97414 : if (tree old_lhs = gimple_get_lhs (orig_stmt))
11530 : 90838 : mapping.put (old_lhs, gimple_get_lhs (new_stmt));
11531 : :
11532 : 97414 : if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
11533 : : {
11534 : : gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
11535 : : for (gimple_stmt_iterator gsi = gsi_start (seq);
11536 : 3971 : !gsi_end_p (gsi); gsi_next (&gsi))
11537 : 2533 : stmt_worklist.safe_push (gsi_stmt (gsi));
11538 : : }
11539 : :
11540 : 97414 : related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
11541 : 97414 : if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
11542 : : {
11543 : 1754 : gimple *stmt = STMT_VINFO_STMT (related_vinfo);
11544 : 1754 : stmt_worklist.safe_push (stmt);
11545 : : /* Set BB such that the assert in
11546 : : 'get_initial_defs_for_reduction' is able to determine that
11547 : : the BB of the related stmt is inside this loop. */
11548 : 1754 : gimple_set_bb (stmt,
11549 : : gimple_bb (new_stmt));
11550 : 1754 : related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
11551 : 1754 : gcc_assert (related_vinfo == NULL
11552 : : || related_vinfo == stmt_vinfo);
11553 : : }
11554 : : }
11555 : : }
11556 : :
11557 : : /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
11558 : : using the original main loop and thus need to be updated to refer to the
11559 : : cloned variables used in the epilogue. */
11560 : 10796 : for (unsigned i = 0; i < stmt_worklist.length (); ++i)
11561 : : {
11562 : 4287 : gimple *stmt = stmt_worklist[i];
11563 : 4287 : tree *new_op;
11564 : :
11565 : 11092 : for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
11566 : : {
11567 : 6805 : tree op = gimple_op (stmt, j);
11568 : 6805 : if ((new_op = mapping.get(op)))
11569 : 1707 : gimple_set_op (stmt, j, *new_op);
11570 : : else
11571 : : {
11572 : : /* PR92429: The last argument of simplify_replace_tree disables
11573 : : folding when replacing arguments. This is required as
11574 : : otherwise you might end up with different statements than the
11575 : : ones analyzed in vect_loop_analyze, leading to different
11576 : : vectorization. */
11577 : 5098 : op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
11578 : : &find_in_mapping, &mapping, false);
11579 : 5098 : gimple_set_op (stmt, j, op);
11580 : : }
11581 : : }
11582 : : }
11583 : :
11584 : 6509 : struct data_reference *dr;
11585 : 6509 : vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
11586 : 28060 : FOR_EACH_VEC_ELT (datarefs, i, dr)
11587 : : {
11588 : 21551 : orig_stmt = DR_STMT (dr);
11589 : 21551 : gcc_assert (gimple_uid (orig_stmt) > 0);
11590 : 21551 : stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
11591 : : /* Data references for gather loads and scatter stores do not use the
11592 : : updated offset we set using ADVANCE. Instead we have to make sure the
11593 : :          reference in the data references points to the corresponding copy of
11594 : :          the original in the epilogue.  Make sure to update both
11595 : :          gather/scatters recognized by dataref analysis and other
11596 : : refs that get_load_store_type classified as VMAT_GATHER_SCATTER. */
11597 : 21551 : auto vstmt_vinfo = vect_stmt_to_vectorize (stmt_vinfo);
11598 : 21551 : if (STMT_VINFO_MEMORY_ACCESS_TYPE (vstmt_vinfo) == VMAT_GATHER_SCATTER
11599 : 21375 : || STMT_VINFO_STRIDED_P (vstmt_vinfo)
11600 : 19736 : || STMT_VINFO_GATHER_SCATTER_P (vstmt_vinfo))
11601 : : {
11602 : : /* ??? As we copy epilogues from the main loop incremental
11603 : : replacement from an already replaced DR_REF from vectorizing
11604 : : the first epilogue will fail. */
11605 : 2055 : DR_REF (dr)
11606 : 2055 : = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
11607 : : &find_in_mapping, &mapping);
11608 : 2055 : DR_BASE_ADDRESS (dr)
11609 : 2055 : = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
11610 : : &find_in_mapping, &mapping);
11611 : : }
11612 : 21551 : DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
11613 : : }
11614 : :
11615 : : /* Advance data_reference's with the number of iterations of the previous
11616 : : loop and its prologue. */
11617 : 6509 : vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
11618 : :
11619 : : /* Remember the advancement made. */
11620 : 6509 : LOOP_VINFO_DRS_ADVANCED_BY (epilogue_vinfo) = advance;
11621 : :
11622 : 6509 : epilogue_vinfo->shared->datarefs_copy.release ();
11623 : 6509 : epilogue_vinfo->shared->save_datarefs ();
11624 : 6509 : }
11625 : :
11626 : : /* When vectorizing early break statements, instructions that happen before
11627 : :    the early break in the current BB need to be moved to after the early
11628 : :    break.  This function deals with that and assumes that any validity
11629 : :    checks have already been performed.
11630 : :
11631 : :    While moving the instructions, if it encounters a VUSE or VDEF it
11632 : :    corrects the VUSEs as it moves the statements along.  The statements are
11633 : :    inserted into the block recorded in LOOP_VINFO_EARLY_BRK_DEST_BB.  */
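      : : /* Editorial sketch of the transformation (a hypothetical loop, for
      : :    illustration only):
      : :
      : :      loop:
      : :        a[i] = x;               // store located before the early break
      : :        if (p[i] == 0) break;
      : :        ...
      : :
      : :    the store is moved down into LOOP_VINFO_EARLY_BRK_DEST_BB so that it only
      : :    executes once the break condition is known not to trigger.  */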
11634 : :
11635 : : static void
11636 : 1367 : move_early_exit_stmts (loop_vec_info loop_vinfo)
11637 : : {
11638 : 1367 : DUMP_VECT_SCOPE ("move_early_exit_stmts");
11639 : :
11640 : 1367 : if (LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).is_empty ())
11641 : 1163 : return;
11642 : :
11643 : : /* Move all stmts that need moving. */
11644 : 204 : basic_block dest_bb = LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo);
11645 : 204 : gimple_stmt_iterator dest_gsi = gsi_after_labels (dest_bb);
11646 : :
11647 : 204 : tree last_seen_vuse = NULL_TREE;
11648 : 503 : for (gimple *stmt : LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo))
11649 : : {
11650 : : /* We have to update crossed degenerate virtual PHIs. Simply
11651 : : elide them. */
11652 : 299 : if (gphi *vphi = dyn_cast <gphi *> (stmt))
11653 : : {
11654 : 7 : tree vdef = gimple_phi_result (vphi);
11655 : 7 : tree vuse = gimple_phi_arg_def (vphi, 0);
11656 : 7 : imm_use_iterator iter;
11657 : 7 : use_operand_p use_p;
11658 : 7 : gimple *use_stmt;
11659 : 23 : FOR_EACH_IMM_USE_STMT (use_stmt, iter, vdef)
11660 : : {
11661 : 48 : FOR_EACH_IMM_USE_ON_STMT (use_p, iter)
11662 : 16 : SET_USE (use_p, vuse);
11663 : 7 : }
11664 : 7 : auto gsi = gsi_for_stmt (stmt);
11665 : 7 : remove_phi_node (&gsi, true);
11666 : 7 : last_seen_vuse = vuse;
11667 : 7 : continue;
11668 : 7 : }
11669 : :
11670 : : /* Check to see if statement is still required for vect or has been
11671 : : elided. */
11672 : 292 : auto stmt_info = loop_vinfo->lookup_stmt (stmt);
11673 : 292 : if (!stmt_info)
11674 : 0 : continue;
11675 : :
11676 : 292 : if (dump_enabled_p ())
11677 : 147 : dump_printf_loc (MSG_NOTE, vect_location, "moving stmt %G", stmt);
11678 : :
11679 : 292 : gimple_stmt_iterator stmt_gsi = gsi_for_stmt (stmt);
11680 : 292 : gsi_move_before (&stmt_gsi, &dest_gsi, GSI_NEW_STMT);
11681 : 584 : last_seen_vuse = gimple_vuse (stmt);
11682 : : }
11683 : :
11684 : : /* Update all the stmts with their new reaching VUSES. */
11685 : 628 : for (auto p : LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo))
11686 : : {
11687 : 178 : if (dump_enabled_p ())
11688 : 142 : dump_printf_loc (MSG_NOTE, vect_location,
11689 : : "updating vuse to %T for load %G",
11690 : : last_seen_vuse, p);
11691 : 178 : gimple_set_vuse (p, last_seen_vuse);
11692 : 178 : update_stmt (p);
11693 : : }
11694 : :
11695 : : /* And update the LC PHIs on exits. */
11696 : 1026 : for (edge e : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
11697 : 414 : if (!dominated_by_p (CDI_DOMINATORS, e->src, dest_bb))
11698 : 218 : if (gphi *phi = get_virtual_phi (e->dest))
11699 : 422 : SET_PHI_ARG_DEF_ON_EDGE (phi, e, last_seen_vuse);
11700 : : }
11701 : :
11702 : : /* Function vect_transform_loop.
11703 : :
11704 : : The analysis phase has determined that the loop is vectorizable.
11705 : : Vectorize the loop - created vectorized stmts to replace the scalar
11706 : :    Vectorize the loop - create vectorized stmts to replace the scalar
11707 : : Returns scalar epilogue loop if any. */
11708 : :
11709 : : class loop *
11710 : 56760 : vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
11711 : : {
11712 : 56760 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11713 : 56760 : class loop *epilogue = NULL;
11714 : 56760 : basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
11715 : 56760 : int nbbs = loop->num_nodes;
11716 : 56760 : int i;
11717 : 56760 : tree niters_vector = NULL_TREE;
11718 : 56760 : tree step_vector = NULL_TREE;
11719 : 56760 : tree niters_vector_mult_vf = NULL_TREE;
11720 : 56760 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11721 : 56760 : unsigned int lowest_vf = constant_lower_bound (vf);
11722 : 56760 : gimple *stmt;
11723 : 56760 : bool check_profitability = false;
11724 : 56760 : unsigned int th;
11725 : 56760 : bool flat = maybe_flat_loop_profile (loop);
11726 : :
11727 : 56760 : DUMP_VECT_SCOPE ("vec_transform_loop");
11728 : :
11729 : 56760 : loop_vinfo->shared->check_datarefs ();
11730 : :
11731 : :   /* Use the more conservative vectorization threshold.  If the number
11732 : :      of iterations is constant, assume the cost check has been performed
11733 : :      by our caller.  If the threshold makes all loops profitable that
11734 : :      run at least the (estimated) vectorization factor number of times,
11735 : :      checking is pointless, too.  */
11736 : 56760 : th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
11737 : 56760 : if (vect_apply_runtime_profitability_check_p (loop_vinfo))
11738 : : {
11739 : 17152 : if (dump_enabled_p ())
11740 : 158 : dump_printf_loc (MSG_NOTE, vect_location,
11741 : : "Profitability threshold is %d loop iterations.\n",
11742 : : th);
11743 : : check_profitability = true;
11744 : : }
11745 : :
11746 : : /* Make sure there exists a single-predecessor exit bb. Do this before
11747 : : versioning. */
11748 : 56760 : edge e = LOOP_VINFO_IV_EXIT (loop_vinfo);
11749 : 56760 : if (! single_pred_p (e->dest) && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11750 : : {
11751 : 11399 : split_loop_exit_edge (e, true);
11752 : 11399 : if (dump_enabled_p ())
11753 : 1876 : dump_printf (MSG_NOTE, "split exit edge\n");
11754 : : }
11755 : :
11756 : : /* Version the loop first, if required, so the profitability check
11757 : : comes first. */
11758 : :
11759 : 56760 : if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
11760 : : {
11761 : 3578 : class loop *sloop
11762 : 3578 : = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
11763 : 3578 : sloop->force_vectorize = false;
11764 : 3578 : check_profitability = false;
11765 : : }
11766 : :
11767 : : /* Make sure there exists a single-predecessor exit bb also on the
11768 : : scalar loop copy. Do this after versioning but before peeling
11769 : : so CFG structure is fine for both scalar and if-converted loop
11770 : : to make slpeel_duplicate_current_defs_from_edges face matched
11771 : : loop closed PHI nodes on the exit. */
11772 : 56760 : if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
11773 : : {
11774 : 5902 : e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
11775 : 5902 : if (! single_pred_p (e->dest))
11776 : : {
11777 : 5681 : split_loop_exit_edge (e, true);
11778 : 5681 : if (dump_enabled_p ())
11779 : 1051 : dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
11780 : : }
11781 : : }
11782 : :
11783 : 56760 : tree niters = vect_build_loop_niters (loop_vinfo);
11784 : 56760 : LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
11785 : 56760 : tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
11786 : 56760 : bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
11787 : 56760 : tree advance;
11788 : 56760 : drs_init_vec orig_drs_init;
11789 : :
11790 : 56760 : epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
11791 : : &step_vector, &niters_vector_mult_vf, th,
11792 : : check_profitability, niters_no_overflow,
11793 : : &advance);
11794 : 56760 : if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
11795 : 56760 : && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
11796 : : {
11797 : :       /* Ifcvt duplicates the loop preheader, the loop body and produces a basic
11798 : : block after loop exit. We need to scale all that. */
11799 : 85 : basic_block preheader
11800 : 85 : = loop_preheader_edge (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->src;
11801 : 85 : preheader->count
11802 : : = preheader->count.apply_probability
11803 : 85 : (LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11804 : 85 : scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
11805 : : LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11806 : 85 : LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo)->dest->count = preheader->count;
11807 : : }
11808 : :
11809 : 56760 : if (niters_vector == NULL_TREE)
11810 : : {
11811 : 24737 : if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
11812 : 24737 : && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11813 : 50171 : && known_eq (lowest_vf, vf))
11814 : : {
11815 : 24734 : niters_vector
11816 : 24734 : = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
11817 : 24734 : LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
11818 : 24734 : step_vector = build_one_cst (TREE_TYPE (niters));
11819 : : }
11820 : 703 : else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11821 : 1 : vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
11822 : : &step_vector, niters_no_overflow);
11823 : : else
11824 : : /* vect_do_peeling subtracted the number of peeled prologue
11825 : : iterations from LOOP_VINFO_NITERS. */
11826 : 702 : vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
11827 : : &niters_vector, &step_vector,
11828 : : niters_no_overflow);
11829 : : }
11830 : :
11831 : : /* 1) Make sure the loop header has exactly two entries
11832 : : 2) Make sure we have a preheader basic block. */
11833 : :
11834 : 56760 : gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
11835 : :
11836 : 56760 : split_edge (loop_preheader_edge (loop));
11837 : :
11838 : 56760 : if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11839 : : /* This will deal with any possible peeling. */
11840 : 1 : vect_prepare_for_masked_peels (loop_vinfo);
11841 : :
11842 : : /* Handle any code motion that we need to for early-break vectorization after
11843 : : we've done peeling but just before we start vectorizing. */
11844 : 56760 : if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11845 : 1367 : move_early_exit_stmts (loop_vinfo);
11846 : :
11847 : : /* Remove existing clobber stmts and prefetches. */
11848 : 173134 : for (i = 0; i < nbbs; i++)
11849 : : {
11850 : 116374 : basic_block bb = bbs[i];
11851 : 1009271 : for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);)
11852 : : {
11853 : 776523 : stmt = gsi_stmt (si);
11854 : 776523 : if (gimple_clobber_p (stmt)
11855 : 776523 : || gimple_call_builtin_p (stmt, BUILT_IN_PREFETCH))
11856 : : {
11857 : 211 : unlink_stmt_vdef (stmt);
11858 : 211 : gsi_remove (&si, true);
11859 : 211 : release_defs (stmt);
11860 : : }
11861 : : else
11862 : 776312 : gsi_next (&si);
11863 : : }
11864 : : }
11865 : :
11866 : : /* Schedule the SLP instances. */
11867 : 56760 : if (!loop_vinfo->slp_instances.is_empty ())
11868 : : {
11869 : 56760 : DUMP_VECT_SCOPE ("scheduling SLP instances");
11870 : 56760 : vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
11871 : : }
11872 : :
11873 : : /* Generate the loop invariant statements. */
11874 : 56760 : if (!gimple_seq_empty_p (LOOP_VINFO_INV_PATTERN_DEF_SEQ (loop_vinfo)))
11875 : : {
11876 : 74 : if (dump_enabled_p ())
11877 : 30 : dump_printf_loc (MSG_NOTE, vect_location,
11878 : : "------>generating loop invariant statements\n");
11879 : 74 : gimple_stmt_iterator gsi;
11880 : 74 : gsi = gsi_after_labels (loop_preheader_edge (loop)->src);
11881 : 74 : gsi_insert_seq_before (&gsi, LOOP_VINFO_INV_PATTERN_DEF_SEQ (loop_vinfo),
11882 : : GSI_CONTINUE_LINKING);
11883 : : }
11884 : :
11885 : : /* Stub out scalar statements that must not survive vectorization and
11886 : : were not picked as relevant in any SLP instance.
11887 : : Doing this here helps with grouped statements, or statements that
11888 : : are involved in patterns. */
11889 : 173134 : for (i = 0; i < nbbs; i++)
11890 : : {
11891 : 116374 : basic_block bb = bbs[i];
11892 : 116374 : stmt_vec_info stmt_info;
11893 : 232748 : for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
11894 : 1510448 : !gsi_end_p (gsi); gsi_next (&gsi))
11895 : : {
11896 : 1394074 : gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
11897 : 5187 : if (!call || !gimple_call_internal_p (call))
11898 : 1390024 : continue;
11899 : 4050 : internal_fn ifn = gimple_call_internal_fn (call);
11900 : 4050 : if (ifn == IFN_MASK_LOAD)
11901 : : {
11902 : 539 : tree lhs = gimple_get_lhs (call);
11903 : 539 : if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11904 : : {
11905 : 0 : tree zero = build_zero_cst (TREE_TYPE (lhs));
11906 : 0 : gimple *new_stmt = gimple_build_assign (lhs, zero);
11907 : 0 : gsi_replace (&gsi, new_stmt, true);
11908 : : }
11909 : : }
11910 : 3511 : else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
11911 : : {
11912 : 1578 : tree lhs = gimple_get_lhs (call);
11913 : 1578 : if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11914 : : {
11915 : 0 : tree else_arg
11916 : 0 : = gimple_call_arg (call, gimple_call_num_args (call) - 1);
11917 : 0 : gimple *new_stmt = gimple_build_assign (lhs, else_arg);
11918 : 0 : gsi_replace (&gsi, new_stmt, true);
11919 : : }
11920 : : }
11921 : 1933 : else if (ifn == IFN_MASK_CALL
11922 : 4 : && (stmt_info = loop_vinfo->lookup_stmt (call))
11923 : 4 : && !STMT_VINFO_RELEVANT_P (stmt_info)
11924 : 1937 : && !STMT_VINFO_LIVE_P (stmt_info))
11925 : : {
11926 : 4 : gcc_assert (!gimple_call_lhs (stmt_info->stmt));
11927 : 4 : loop_vinfo->remove_stmt (stmt_info);
11928 : : }
11929 : : }
11930 : : }
11931 : :
11932 : :   /* The vectorization factor is always > 1, so if we use an IV increment of 1,
11933 : : a zero NITERS becomes a nonzero NITERS_VECTOR. */
11934 : 56760 : if (integer_onep (step_vector))
11935 : 56750 : niters_no_overflow = true;
11936 : 56760 : vect_set_loop_condition (loop, LOOP_VINFO_IV_EXIT (loop_vinfo), loop_vinfo,
11937 : : niters_vector, step_vector, niters_vector_mult_vf,
11938 : 56760 : !niters_no_overflow);
11939 : :
11940 : 56760 : unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
11941 : :
11942 : : /* True if the final iteration might not handle a full vector's
11943 : : worth of scalar iterations. */
11944 : 113520 : bool final_iter_may_be_partial
11945 : 56760 : = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11946 : 56760 : || LOOP_VINFO_EARLY_BREAKS (loop_vinfo);
11947 : :
11948 : : /* +1 to convert latch counts to loop iteration counts. */
11949 : 56760 : int bias_for_lowest = 1;
11950 : :
11951 : :   /* When we are peeling for gaps we take away one scalar iteration
11952 : :      from the vector loop.  Thus we can adjust the upper bound by one
11953 : :      scalar iteration.  But only when we know the bound applies to the
11954 : :      IV exit test, which might not be true when we have multiple exits.  */
11955 : 56760 : if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11956 : 110461 : bias_for_lowest -= LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
11957 : :
11958 : 56760 : int bias_for_assumed = bias_for_lowest;
11959 : 56760 : int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
11960 : 56760 : if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
11961 : : {
11962 : : /* When the amount of peeling is known at compile time, the first
11963 : : iteration will have exactly alignment_npeels active elements.
11964 : : In the worst case it will have at least one. */
11965 : 1 : int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
11966 : 1 : bias_for_lowest += lowest_vf - min_first_active;
11967 : 1 : bias_for_assumed += assumed_vf - min_first_active;
11968 : : }
11969 : : /* In these calculations the "- 1" converts loop iteration counts
11970 : : back to latch counts. */
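      : :   /* Editorial example (hypothetical numbers): with bias_for_lowest = 1,
      : :      lowest_vf = 4 and a scalar latch bound of 10 (11 iterations), the
      : :      full-vector case below yields floor (11/4) - 1 = 1 and the
      : :      partial-vector case ceil (11/4) - 1 = 2 as the new latch bound.  */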
11971 : 56760 : if (loop->any_upper_bound)
11972 : : {
11973 : 56760 : loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
11974 : 56760 : loop->nb_iterations_upper_bound
11975 : 56760 : = (final_iter_may_be_partial
11976 : 58137 : ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
11977 : 2754 : lowest_vf) - 1
11978 : 55383 : : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
11979 : 110766 : lowest_vf) - 1);
11980 : 56760 : if (main_vinfo
11981 : : /* Both peeling for alignment and peeling for gaps can end up
11982 : : with the scalar epilogue running for more than VF-1 iterations. */
11983 : 6509 : && !main_vinfo->peeling_for_alignment
11984 : 6456 : && !main_vinfo->peeling_for_gaps)
11985 : : {
11986 : 6304 : unsigned int bound;
11987 : 6304 : poly_uint64 main_iters
11988 : 6304 : = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
11989 : : LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
11990 : 6304 : main_iters
11991 : 6304 : = upper_bound (main_iters,
11992 : 6304 : LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
11993 : 12608 : if (can_div_away_from_zero_p (main_iters,
11994 : 6304 : LOOP_VINFO_VECT_FACTOR (loop_vinfo),
11995 : : &bound))
11996 : 6304 : loop->nb_iterations_upper_bound
11997 : 6304 : = wi::umin ((bound_wide_int) (bound - 1),
11998 : 6304 : loop->nb_iterations_upper_bound);
11999 : : }
12000 : : }
12001 : 56760 : if (loop->any_likely_upper_bound)
12002 : 56760 : loop->nb_iterations_likely_upper_bound
12003 : 56760 : = (final_iter_may_be_partial
12004 : 58137 : ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
12005 : 1377 : + bias_for_lowest, lowest_vf) - 1
12006 : 55383 : : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
12007 : 56760 : + bias_for_lowest, lowest_vf) - 1);
12008 : 56760 : if (loop->any_estimate)
12009 : 32565 : loop->nb_iterations_estimate
12010 : 32565 : = (final_iter_may_be_partial
12011 : 33351 : ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
12012 : 1572 : assumed_vf) - 1
12013 : 31779 : : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
12014 : 64344 : assumed_vf) - 1);
12015 : 56760 : scale_profile_for_vect_loop (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
12016 : : assumed_vf, flat);
12017 : :
12018 : 56760 : if (dump_enabled_p ())
12019 : : {
12020 : 10020 : if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
12021 : : {
12022 : 8741 : dump_printf_loc (MSG_NOTE, vect_location,
12023 : : "LOOP VECTORIZED\n");
12024 : 8741 : if (loop->inner)
12025 : 266 : dump_printf_loc (MSG_NOTE, vect_location,
12026 : : "OUTER LOOP VECTORIZED\n");
12027 : 8741 : dump_printf (MSG_NOTE, "\n");
12028 : : }
12029 : : else
12030 : 1279 : dump_printf_loc (MSG_NOTE, vect_location,
12031 : : "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
12032 : 1279 : GET_MODE_NAME (loop_vinfo->vector_mode));
12033 : : }
12034 : :
12035 : : /* Loops vectorized with a variable factor won't benefit from
12036 : : unrolling/peeling. */
12037 : : if (!vf.is_constant ())
12038 : : {
12039 : : loop->unroll = 1;
12040 : : if (dump_enabled_p ())
12041 : : dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
12042 : : " variable-length vectorization factor\n");
12043 : : }
12044 : : /* Free SLP instances here because otherwise stmt reference counting
12045 : : won't work. */
12046 : : slp_instance instance;
12047 : 143569 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
12048 : 86809 : vect_free_slp_instance (instance);
12049 : 56760 : LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
12050 : : /* Clear-up safelen field since its value is invalid after vectorization
12051 : :   /* Clear the safelen field since its value is invalid after vectorization,
12052 : :      as the vectorized loop can have loop-carried dependencies.  */
12053 : :
12054 : 56760 : if (epilogue)
12055 : : {
12056 : : /* Accumulate past advancements made. */
12057 : 6509 : if (LOOP_VINFO_DRS_ADVANCED_BY (loop_vinfo))
12058 : 77 : advance = fold_build2 (PLUS_EXPR, TREE_TYPE (advance),
12059 : : LOOP_VINFO_DRS_ADVANCED_BY (loop_vinfo),
12060 : : advance);
12061 : 6509 : update_epilogue_loop_vinfo (epilogue, advance);
12062 : :
12063 : 6509 : epilogue->simduid = loop->simduid;
12064 : 6509 : epilogue->force_vectorize = loop->force_vectorize;
12065 : 6509 : epilogue->dont_vectorize = false;
12066 : : }
12067 : :
12068 : 56760 : return epilogue;
12069 : 56760 : }
12070 : :
12071 : : /* The code below tries to perform a simple optimization - revert
12072 : :    if-conversion for masked stores, i.e. if the mask of a store is zero,
12073 : :    do not perform it, and also skip the producers of the stored values
12074 : :    if possible.  For example,
12075 : : for (i=0; i<n; i++)
12076 : : if (c[i])
12077 : : {
12078 : : p1[i] += 1;
12079 : : p2[i] = p3[i] +2;
12080 : : }
12081 : : this transformation will produce the following semi-hammock:
12082 : :
12083 : : if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
12084 : : {
12085 : : vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
12086 : : vect__12.22_172 = vect__11.19_170 + vect_cst__171;
12087 : : MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
12088 : : vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
12089 : : vect__19.28_184 = vect__18.25_182 + vect_cst__183;
12090 : : MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
12091 : : }
12092 : : */
12093 : :
12094 : : void
12095 : 444 : optimize_mask_stores (class loop *loop)
12096 : : {
12097 : 444 : basic_block *bbs = get_loop_body (loop);
12098 : 444 : unsigned nbbs = loop->num_nodes;
12099 : 444 : unsigned i;
12100 : 444 : basic_block bb;
12101 : 444 : class loop *bb_loop;
12102 : 444 : gimple_stmt_iterator gsi;
12103 : 444 : gimple *stmt;
12104 : 444 : auto_vec<gimple *> worklist;
12105 : 444 : auto_purge_vect_location sentinel;
12106 : :
12107 : 444 : vect_location = find_loop_location (loop);
12108 : : /* Pick up all masked stores in loop if any. */
12109 : 1776 : for (i = 0; i < nbbs; i++)
12110 : : {
12111 : 888 : bb = bbs[i];
12112 : 14342 : for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
12113 : 12566 : gsi_next (&gsi))
12114 : : {
12115 : 12566 : stmt = gsi_stmt (gsi);
12116 : 12566 : if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
12117 : 468 : worklist.safe_push (stmt);
12118 : : }
12119 : : }
12120 : :
12121 : 444 : free (bbs);
12122 : 444 : if (worklist.is_empty ())
12123 : 68 : return;
12124 : :
12125 : : /* Loop has masked stores. */
12126 : 827 : while (!worklist.is_empty ())
12127 : : {
12128 : 451 : gimple *last, *last_store;
12129 : 451 : edge e, efalse;
12130 : 451 : tree mask;
12131 : 451 : basic_block store_bb, join_bb;
12132 : 451 : gimple_stmt_iterator gsi_to;
12133 : 451 : tree vdef, new_vdef;
12134 : 451 : gphi *phi;
12135 : 451 : tree vectype;
12136 : 451 : tree zero;
12137 : :
12138 : 451 : last = worklist.pop ();
12139 : 451 : mask = gimple_call_arg (last, 2);
12140 : 451 : bb = gimple_bb (last);
12141 : : /* Create then_bb and the if-then structure in the CFG; then_bb belongs
12142 : : to the same loop as if_bb. That loop can differ from LOOP when a
12143 : : two-level loop nest is vectorized and the mask_store belongs to the
12144 : : inner loop. */
12145 : 451 : e = split_block (bb, last);
12146 : 451 : bb_loop = bb->loop_father;
12147 : 451 : gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
12148 : 451 : join_bb = e->dest;
12149 : 451 : store_bb = create_empty_bb (bb);
12150 : 451 : add_bb_to_loop (store_bb, bb_loop);
12151 : 451 : e->flags = EDGE_TRUE_VALUE;
12152 : 451 : efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
12153 : : /* Put STORE_BB on the likely path. */
12154 : 451 : efalse->probability = profile_probability::likely ();
12155 : 451 : e->probability = efalse->probability.invert ();
12156 : 451 : store_bb->count = efalse->count ();
12157 : 451 : make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
12158 : 451 : if (dom_info_available_p (CDI_DOMINATORS))
12159 : 451 : set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
12160 : 451 : if (dump_enabled_p ())
12161 : 165 : dump_printf_loc (MSG_NOTE, vect_location,
12162 : : "Create new block %d to sink mask stores.",
12163 : : store_bb->index);
12164 : : /* Create vector comparison with boolean result. */
12165 : 451 : vectype = TREE_TYPE (mask);
12166 : 451 : zero = build_zero_cst (vectype);
12167 : 451 : stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
12168 : 451 : gsi = gsi_last_bb (bb);
12169 : 451 : gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
12170 : : /* Create new PHI node for vdef of the last masked store:
12171 : : .MEM_2 = VDEF <.MEM_1>
12172 : : will be converted to
12173 : : .MEM.3 = VDEF <.MEM_1>
12174 : : and new PHI node will be created in join bb
12175 : : .MEM_2 = PHI <.MEM_1, .MEM_3>
12176 : : */
12177 : 451 : vdef = gimple_vdef (last);
12178 : 451 : new_vdef = make_ssa_name (gimple_vop (cfun), last);
12179 : 451 : gimple_set_vdef (last, new_vdef);
12180 : 451 : phi = create_phi_node (vdef, join_bb);
12181 : 451 : add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
12182 : :
12183 : : /* Put all masked stores with the same mask into STORE_BB if possible. */
12184 : 485 : while (true)
12185 : : {
12186 : 468 : gimple_stmt_iterator gsi_from;
12187 : 468 : gimple *stmt1 = NULL;
12188 : :
12189 : : /* Move masked store to STORE_BB. */
12190 : 468 : last_store = last;
12191 : 468 : gsi = gsi_for_stmt (last);
12192 : 468 : gsi_from = gsi;
12193 : : /* Shift GSI to the previous stmt for further traversal. */
12194 : 468 : gsi_prev (&gsi);
12195 : 468 : gsi_to = gsi_start_bb (store_bb);
12196 : 468 : gsi_move_before (&gsi_from, &gsi_to);
12197 : : /* Set GSI_TO to the start of the now non-empty block. */
12198 : 468 : gsi_to = gsi_start_bb (store_bb);
12199 : 468 : if (dump_enabled_p ())
12200 : 181 : dump_printf_loc (MSG_NOTE, vect_location,
12201 : : "Move stmt to created bb\n%G", last);
12202 : : /* Move all stored value producers if possible. */
12203 : 3840 : while (!gsi_end_p (gsi))
12204 : : {
12205 : 3839 : tree lhs;
12206 : 3839 : imm_use_iterator imm_iter;
12207 : 3839 : use_operand_p use_p;
12208 : 3839 : bool res;
12209 : :
12210 : : /* Skip debug statements. */
12211 : 3839 : if (is_gimple_debug (gsi_stmt (gsi)))
12212 : : {
12213 : 1 : gsi_prev (&gsi);
12214 : 2527 : continue;
12215 : : }
12216 : 3838 : stmt1 = gsi_stmt (gsi);
12217 : : /* Do not consider statements that write to memory or have a
12218 : : volatile operand. */
12219 : 7628 : if (gimple_vdef (stmt1)
12220 : 7628 : || gimple_has_volatile_ops (stmt1))
12221 : : break;
12222 : 3790 : gsi_from = gsi;
12223 : 3790 : gsi_prev (&gsi);
12224 : 3790 : lhs = gimple_get_lhs (stmt1);
12225 : 3790 : if (!lhs)
12226 : : break;
12227 : :
12228 : : /* LHS of vectorized stmt must be SSA_NAME. */
12229 : 3790 : if (TREE_CODE (lhs) != SSA_NAME)
12230 : : break;
12231 : :
12232 : 3790 : if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12233 : : {
12234 : : /* Remove dead scalar statement. */
12235 : 2694 : if (has_zero_uses (lhs))
12236 : : {
12237 : 2526 : gsi_remove (&gsi_from, true);
12238 : 2526 : release_defs (stmt1);
12239 : 2526 : continue;
12240 : : }
12241 : : }
12242 : :
12243 : : /* Check that LHS does not have uses outside of STORE_BB. */
12244 : 1264 : res = true;
12245 : 2195 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
12246 : : {
12247 : 1350 : gimple *use_stmt;
12248 : 1350 : use_stmt = USE_STMT (use_p);
12249 : 1350 : if (is_gimple_debug (use_stmt))
12250 : 0 : continue;
12251 : 1350 : if (gimple_bb (use_stmt) != store_bb)
12252 : : {
12253 : : res = false;
12254 : : break;
12255 : : }
12256 : : }
12257 : 1264 : if (!res)
12258 : : break;
12259 : :
12260 : 845 : if (gimple_vuse (stmt1)
12261 : 1273 : && gimple_vuse (stmt1) != gimple_vuse (last_store))
12262 : : break;
12263 : :
12264 : : /* Can move STMT1 to STORE_BB. */
12265 : 845 : if (dump_enabled_p ())
12266 : 385 : dump_printf_loc (MSG_NOTE, vect_location,
12267 : : "Move stmt to created bb\n%G", stmt1);
12268 : 845 : gsi_move_before (&gsi_from, &gsi_to);
12269 : : /* Shift GSI_TO for further insertion. */
12270 : 1690 : gsi_prev (&gsi_to);
12271 : : }
12272 : : /* Put other masked stores with the same mask into STORE_BB. */
12273 : 468 : if (worklist.is_empty ()
12274 : 92 : || gimple_call_arg (worklist.last (), 2) != mask
12275 : 17 : || worklist.last () != stmt1)
12276 : : break;
12277 : 17 : last = worklist.pop ();
12278 : 17 : }
12279 : 902 : add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
12280 : : }
12281 : 444 : }
12282 : :
12283 : : /* Decide whether it is possible to use a zero-based induction variable
12284 : : when vectorizing LOOP_VINFO with partial vectors. If it is, return
12285 : : the value that the induction variable must be able to hold in order
12286 : : to ensure that the rgroups eventually have no active vector elements.
12287 : : Return -1 otherwise. */
12288 : :
12289 : : widest_int
12290 : 46 : vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
12291 : : {
12292 : 46 : tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
12293 : 46 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
12294 : 46 : unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
12295 : :
12296 : : /* Calculate the value that the induction variable must be able
12297 : : to hit in order to ensure that we end the loop with an all-false mask.
12298 : : This involves adding the maximum number of inactive trailing scalar
12299 : : iterations. */
12300 : 46 : widest_int iv_limit = -1;
12301 : 46 : if (max_loop_iterations (loop, &iv_limit))
12302 : : {
12303 : 46 : if (niters_skip)
12304 : : {
12305 : : /* Add the maximum number of skipped iterations to the
12306 : : maximum iteration count. */
12307 : 0 : if (TREE_CODE (niters_skip) == INTEGER_CST)
12308 : 0 : iv_limit += wi::to_widest (niters_skip);
12309 : : else
12310 : 0 : iv_limit += max_vf - 1;
12311 : : }
12312 : 46 : else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
12313 : : /* Make a conservatively-correct assumption. */
12314 : 2 : iv_limit += max_vf - 1;
12315 : :
12316 : : /* IV_LIMIT is the maximum number of latch iterations, which is also
12317 : : the maximum in-range IV value. Round this value down to the previous
12318 : : vector alignment boundary and then add an extra full iteration. */
12319 : 46 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
12320 : 46 : iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
12321 : : }
12322 : 46 : return iv_limit;
12323 : : }
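: : /* Editorial worked example (assumed numbers, not taken from any
: : particular testcase): if max_loop_iterations returns 1000, the
: : vectorization factor is a constant 16 (so MAX_VF is also 16), no
: : iterations are skipped and there is no peeling for alignment, then
: : IV_LIMIT starts as 1000, is rounded down to the previous multiple of
: : the vector alignment boundary, 1000 & -16 = 992, and one full vector
: : iteration is added, giving 992 + 16 = 1008. */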
12324 : :
12325 : : /* For the given rgroup_controls RGC, check whether an induction variable
12326 : : would ever reach a value that produces a set of all-false masks or
12327 : : all-zero lengths before wrapping around. Return true if the variable
12328 : : might wrap around before reaching that value, otherwise return false. */
12329 : :
12330 : : bool
12331 : 0 : vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
12332 : : {
12333 : 0 : widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
12334 : :
12335 : 0 : if (iv_limit == -1)
12336 : : return true;
12337 : :
12338 : 0 : tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
12339 : 0 : unsigned int compare_precision = TYPE_PRECISION (compare_type);
12340 : 0 : unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
12341 : :
12342 : 0 : if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
12343 : : return true;
12344 : :
12345 : : return false;
12346 : 0 : }
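: : /* Editorial worked example (assumed numbers, continuing the sketch
: : above): with IV_LIMIT = 1008, an rgroup with max_nscalars_per_iter = 4
: : and factor = 1 gives NITEMS = 4, so IV_LIMIT * NITEMS = 4032, which
: : needs only 12 bits. With a 16-bit compare type, 12 <= 16, so the IV
: : cannot wrap before producing an all-false mask and the function returns
: : false; with an 8-bit compare type it would return true. */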
|