Branch data Line data Source code
1 : : /* Data References Analysis and Manipulation Utilities for Vectorization.
2 : : Copyright (C) 2003-2026 Free Software Foundation, Inc.
3 : : Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 : : and Ira Rosen <irar@il.ibm.com>
5 : :
6 : : This file is part of GCC.
7 : :
8 : : GCC is free software; you can redistribute it and/or modify it under
9 : : the terms of the GNU General Public License as published by the Free
10 : : Software Foundation; either version 3, or (at your option) any later
11 : : version.
12 : :
13 : : GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 : : WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 : : FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 : : for more details.
17 : :
18 : : You should have received a copy of the GNU General Public License
19 : : along with GCC; see the file COPYING3. If not see
20 : : <http://www.gnu.org/licenses/>. */
21 : :
22 : : #define INCLUDE_ALGORITHM
23 : : #include "config.h"
24 : : #include "system.h"
25 : : #include "coretypes.h"
26 : : #include "backend.h"
27 : : #include "target.h"
28 : : #include "rtl.h"
29 : : #include "tree.h"
30 : : #include "gimple.h"
31 : : #include "predict.h"
32 : : #include "memmodel.h"
33 : : #include "tm_p.h"
34 : : #include "ssa.h"
35 : : #include "optabs-tree.h"
36 : : #include "cgraph.h"
37 : : #include "dumpfile.h"
38 : : #include "pretty-print.h"
39 : : #include "alias.h"
40 : : #include "fold-const.h"
41 : : #include "stor-layout.h"
42 : : #include "tree-eh.h"
43 : : #include "gimplify.h"
44 : : #include "gimple-iterator.h"
45 : : #include "gimplify-me.h"
46 : : #include "tree-ssa-loop-ivopts.h"
47 : : #include "tree-ssa-loop-manip.h"
48 : : #include "tree-ssa-loop.h"
49 : : #include "cfgloop.h"
50 : : #include "tree-scalar-evolution.h"
51 : : #include "tree-vectorizer.h"
52 : : #include "expr.h"
53 : : #include "builtins.h"
54 : : #include "tree-cfg.h"
55 : : #include "tree-hash-traits.h"
56 : : #include "vec-perm-indices.h"
57 : : #include "internal-fn.h"
58 : : #include "gimple-fold.h"
59 : : #include "optabs-query.h"
60 : :
61 : : /* Return true if load- or store-lanes optab OPTAB is implemented for
62 : : COUNT vectors of type VECTYPE. NAME is the name of OPTAB.
63 : :
64 : : If it is implemented and ELSVALS is nonzero store the possible else
65 : : values in the vector it points to. */
66 : :
67 : : static bool
68 : 327746 : vect_lanes_optab_supported_p (const char *name, convert_optab optab,
69 : : tree vectype, unsigned HOST_WIDE_INT count,
70 : : vec<int> *elsvals = nullptr)
71 : : {
72 : 327746 : machine_mode mode, array_mode;
73 : 327746 : bool limit_p;
74 : :
75 : 327746 : mode = TYPE_MODE (vectype);
76 : 327746 : if (!targetm.array_mode (mode, count).exists (&array_mode))
77 : : {
78 : 655492 : poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
79 : 327746 : limit_p = !targetm.array_mode_supported_p (mode, count);
80 : 327746 : if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
81 : : {
82 : 287920 : if (dump_enabled_p ())
83 : 12082 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
84 : : "no array mode for %s[%wu]\n",
85 : 12082 : GET_MODE_NAME (mode), count);
86 : 287920 : return false;
87 : : }
88 : : }
89 : :
90 : 39826 : enum insn_code icode;
91 : 39826 : if ((icode = convert_optab_handler (optab, array_mode, mode))
92 : : == CODE_FOR_nothing)
93 : : {
94 : 39826 : if (dump_enabled_p ())
95 : 4082 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
96 : : "cannot use %s<%s><%s>\n", name,
97 : 4082 : GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
98 : 39826 : return false;
99 : : }
100 : :
101 : 0 : if (dump_enabled_p ())
102 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
103 : 0 : "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
104 : 0 : GET_MODE_NAME (mode));
105 : :
106 : 0 : if (elsvals)
107 : 0 : get_supported_else_vals (icode,
108 : 0 : internal_fn_else_index (IFN_MASK_LEN_LOAD_LANES),
109 : : *elsvals);
110 : :
111 : : return true;
112 : : }
113 : :
114 : : /* Helper function to identify a simd clone call. If this is a call to a
115 : : function with simd clones then return the corresponding cgraph_node,
116 : : otherwise return NULL. */
117 : :
118 : : static cgraph_node*
119 : 566392 : simd_clone_call_p (gimple *stmt)
120 : : {
121 : 635665 : gcall *call = dyn_cast <gcall *> (stmt);
122 : 70873 : if (!call)
123 : : return NULL;
124 : :
125 : 70873 : tree fndecl = NULL_TREE;
126 : 70873 : if (gimple_call_internal_p (call, IFN_MASK_CALL))
127 : 226 : fndecl = TREE_OPERAND (gimple_call_arg (stmt, 0), 0);
128 : : else
129 : 70647 : fndecl = gimple_call_fndecl (stmt);
130 : :
131 : 70873 : if (fndecl == NULL_TREE)
132 : : return NULL;
133 : :
134 : 36438 : cgraph_node *node = cgraph_node::get (fndecl);
135 : 36438 : if (node && node->simd_clones != NULL)
136 : : return node;
137 : :
138 : : return NULL;
139 : : }
140 : :
141 : :
142 : :
143 : : /* Return the smallest scalar part of STMT_INFO.
144 : : This is used to determine the vectype of the stmt. We generally set the
145 : : vectype according to the type of the result (lhs). For stmts whose
146 : : result-type is different than the type of the arguments (e.g., demotion,
147 : : promotion), vectype will be reset appropriately (later). Note that we have
148 : : to visit the smallest datatype in this function, because that determines the
149 : : VF. If the smallest datatype in the loop is present only as the rhs of a
150 : : promotion operation - we'd miss it.
151 : : Such a case, where a variable of this datatype does not appear in the lhs
152 : : anywhere in the loop, can only occur if it's an invariant: e.g.:
153 : : 'int_x = (int) short_inv', which we'd expect to have been optimized away by
154 : : invariant motion. However, we cannot rely on invariant motion to always
155 : : take invariants out of the loop, and so in the case of promotion we also
156 : : have to check the rhs.
157 : : LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
158 : : types. */
159 : :
160 : : tree
161 : 4647836 : vect_get_smallest_scalar_type (stmt_vec_info stmt_info, tree scalar_type)
162 : : {
163 : 4647836 : HOST_WIDE_INT lhs, rhs;
164 : :
165 : : /* During the analysis phase, this function is called on arbitrary
166 : : statements that might not have scalar results. */
167 : 4647836 : if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
168 : : return scalar_type;
169 : :
170 : 4647836 : lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
171 : :
172 : 4647836 : gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
173 : 4647836 : if (assign)
174 : : {
175 : 4081444 : scalar_type = TREE_TYPE (gimple_assign_lhs (assign));
176 : 4081444 : if (gimple_assign_cast_p (assign)
177 : 3718391 : || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
178 : 3717999 : || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
179 : 3717999 : || gimple_assign_rhs_code (assign) == SAD_EXPR
180 : 3717916 : || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
181 : 3714188 : || gimple_assign_rhs_code (assign) == WIDEN_MULT_PLUS_EXPR
182 : 3714188 : || gimple_assign_rhs_code (assign) == WIDEN_MULT_MINUS_EXPR
183 : 3714188 : || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
184 : 7795632 : || gimple_assign_rhs_code (assign) == FLOAT_EXPR)
185 : : {
186 : 379125 : tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
187 : :
188 : 379125 : rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
189 : 379125 : if (rhs < lhs)
190 : 4647836 : scalar_type = rhs_type;
191 : : }
192 : : }
193 : 566392 : else if (cgraph_node *node = simd_clone_call_p (stmt_info->stmt))
194 : : {
195 : 1600 : auto clone = node->simd_clones->simdclone;
196 : 4889 : for (unsigned int i = 0; i < clone->nargs; ++i)
197 : : {
198 : 3289 : if (clone->args[i].arg_type == SIMD_CLONE_ARG_TYPE_VECTOR)
199 : : {
200 : 1885 : tree arg_scalar_type = TREE_TYPE (clone->args[i].vector_type);
201 : 1885 : rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (arg_scalar_type));
202 : 1885 : if (rhs < lhs)
203 : : {
204 : 3289 : scalar_type = arg_scalar_type;
205 : 3289 : lhs = rhs;
206 : : }
207 : : }
208 : : }
209 : : }
210 : 564792 : else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
211 : : {
212 : 69273 : unsigned int i = 0;
213 : 69273 : if (gimple_call_internal_p (call))
214 : : {
215 : 32056 : internal_fn ifn = gimple_call_internal_fn (call);
216 : 32056 : if (internal_load_fn_p (ifn))
217 : : /* For loads the LHS type does the trick. */
218 : : i = ~0U;
219 : 28424 : else if (internal_store_fn_p (ifn))
220 : : {
221 : : /* For stores use the tyep of the stored value. */
222 : 1523 : i = internal_fn_stored_value_index (ifn);
223 : 1523 : scalar_type = TREE_TYPE (gimple_call_arg (call, i));
224 : 1523 : i = ~0U;
225 : : }
226 : 26901 : else if (internal_fn_mask_index (ifn) == 0)
227 : 5406 : i = 1;
228 : : }
229 : 69273 : if (i < gimple_call_num_args (call))
230 : : {
231 : 59544 : tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
232 : 59544 : if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
233 : : {
234 : 59544 : rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
235 : 59544 : if (rhs < lhs)
236 : 4647836 : scalar_type = rhs_type;
237 : : }
238 : : }
239 : : }
240 : :
241 : : return scalar_type;
242 : : }
243 : :
244 : :
245 : : /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
246 : : tested at run-time. Return TRUE if DDR was successfully inserted.
247 : : Return false if versioning is not supported. */
248 : :
249 : : static opt_result
250 : 148094 : vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
251 : : {
252 : 148094 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
253 : :
254 : 148094 : if ((unsigned) param_vect_max_version_for_alias_checks == 0)
255 : 54 : return opt_result::failure_at (vect_location,
256 : : "will not create alias checks, as"
257 : : " --param vect-max-version-for-alias-checks"
258 : : " == 0\n");
259 : :
260 : 148040 : opt_result res
261 : 148040 : = runtime_alias_check_p (ddr, loop,
262 : 148040 : optimize_loop_nest_for_speed_p (loop));
263 : 148040 : if (!res)
264 : 143 : return res;
265 : :
266 : 147897 : LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
267 : 147897 : return opt_result::success ();
268 : : }
269 : :
270 : : /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
271 : :
272 : : static void
273 : 1348 : vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
274 : : {
275 : 1348 : const vec<tree> &checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
276 : 2071 : for (unsigned int i = 0; i < checks.length(); ++i)
277 : 723 : if (checks[i] == value)
278 : : return;
279 : :
280 : 1348 : if (dump_enabled_p ())
281 : 432 : dump_printf_loc (MSG_NOTE, vect_location,
282 : : "need run-time check that %T is nonzero\n",
283 : : value);
284 : 1348 : LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
285 : : }
286 : :
287 : : /* Return true if we know that the order of vectorized DR_INFO_A and
288 : : vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
289 : : DR_INFO_B. At least one of the accesses is a write. */
290 : :
291 : : static bool
292 : 108630 : vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
293 : : {
294 : 108630 : stmt_vec_info stmtinfo_a = dr_info_a->stmt;
295 : 108630 : stmt_vec_info stmtinfo_b = dr_info_b->stmt;
296 : :
297 : : /* Single statements are always kept in their original order. */
298 : 108630 : if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
299 : 176356 : && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
300 : : return true;
301 : :
302 : : /* If there is a loop invariant read involved we might vectorize it in
303 : : the prologue, breaking scalar oder with respect to the in-loop store. */
304 : 21327 : if ((DR_IS_READ (dr_info_a->dr) && integer_zerop (DR_STEP (dr_info_a->dr)))
305 : 66575 : || (DR_IS_READ (dr_info_b->dr) && integer_zerop (DR_STEP (dr_info_b->dr))))
306 : 1244 : return false;
307 : :
308 : : /* STMT_A and STMT_B belong to overlapping groups. All loads are
309 : : emitted at the position of the first scalar load.
310 : : Stores in a group are emitted at the position of the last scalar store.
311 : : Compute that position and check whether the resulting order matches
312 : : the current one. */
313 : 44879 : stmt_vec_info il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
314 : 44879 : if (il_a)
315 : : {
316 : 40535 : if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
317 : 161599 : for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
318 : 140883 : s = DR_GROUP_NEXT_ELEMENT (s))
319 : 140883 : il_a = get_later_stmt (il_a, s);
320 : : else /* DR_IS_READ */
321 : 78156 : for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
322 : 58337 : s = DR_GROUP_NEXT_ELEMENT (s))
323 : 58337 : if (get_later_stmt (il_a, s) == il_a)
324 : 1558 : il_a = s;
325 : : }
326 : : else
327 : : il_a = stmtinfo_a;
328 : 44879 : stmt_vec_info il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
329 : 44879 : if (il_b)
330 : : {
331 : 39188 : if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
332 : 211488 : for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
333 : 180229 : s = DR_GROUP_NEXT_ELEMENT (s))
334 : 180229 : il_b = get_later_stmt (il_b, s);
335 : : else /* DR_IS_READ */
336 : 37357 : for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
337 : 29428 : s = DR_GROUP_NEXT_ELEMENT (s))
338 : 29428 : if (get_later_stmt (il_b, s) == il_b)
339 : 153 : il_b = s;
340 : : }
341 : : else
342 : : il_b = stmtinfo_b;
343 : 44879 : bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
344 : 44879 : return (get_later_stmt (il_a, il_b) == il_a) == a_after_b;
345 : : }
346 : :
347 : : /* A subroutine of vect_analyze_data_ref_dependence. Handle
348 : : DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
349 : : distances. These distances are conservatively correct but they don't
350 : : reflect a guaranteed dependence.
351 : :
352 : : Return true if this function does all the work necessary to avoid
353 : : an alias or false if the caller should use the dependence distances
354 : : to limit the vectorization factor in the usual way. LOOP_DEPTH is
355 : : the depth of the loop described by LOOP_VINFO and the other arguments
356 : : are as for vect_analyze_data_ref_dependence. */
357 : :
358 : : static bool
359 : 7970 : vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
360 : : loop_vec_info loop_vinfo,
361 : : int loop_depth, unsigned int *max_vf)
362 : : {
363 : 7970 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
364 : 31898 : for (lambda_vector &dist_v : DDR_DIST_VECTS (ddr))
365 : : {
366 : 15792 : int dist = dist_v[loop_depth];
367 : 15792 : if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
368 : : {
369 : : /* If the user asserted safelen >= DIST consecutive iterations
370 : : can be executed concurrently, assume independence.
371 : :
372 : : ??? An alternative would be to add the alias check even
373 : : in this case, and vectorize the fallback loop with the
374 : : maximum VF set to safelen. However, if the user has
375 : : explicitly given a length, it's less likely that that
376 : : would be a win. */
377 : 7836 : if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
378 : : {
379 : 32 : if ((unsigned int) loop->safelen < *max_vf)
380 : 2 : *max_vf = loop->safelen;
381 : 32 : LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
382 : 32 : continue;
383 : : }
384 : :
385 : : /* For dependence distances of 2 or more, we have the option
386 : : of limiting VF or checking for an alias at runtime.
387 : : Prefer to check at runtime if we can, to avoid limiting
388 : : the VF unnecessarily when the bases are in fact independent.
389 : :
390 : : Note that the alias checks will be removed if the VF ends up
391 : : being small enough. */
392 : 7804 : dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
393 : 7804 : dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
394 : 7804 : return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
395 : 7804 : && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
396 : 15616 : && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
397 : : }
398 : : }
399 : : return true;
400 : : }
401 : :
402 : :
403 : : /* Function vect_analyze_data_ref_dependence.
404 : :
405 : : FIXME: I needed to change the sense of the returned flag.
406 : :
407 : : Return FALSE if there (might) exist a dependence between a memory-reference
408 : : DRA and a memory-reference DRB. When versioning for alias may check a
409 : : dependence at run-time, return TRUE. Adjust *MAX_VF according to
410 : : the data dependence. */
411 : :
412 : : static opt_result
413 : 976256 : vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
414 : : loop_vec_info loop_vinfo,
415 : : unsigned int *max_vf)
416 : : {
417 : 976256 : unsigned int i;
418 : 976256 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
419 : 976256 : struct data_reference *dra = DDR_A (ddr);
420 : 976256 : struct data_reference *drb = DDR_B (ddr);
421 : 976256 : dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
422 : 976256 : dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
423 : 976256 : stmt_vec_info stmtinfo_a = dr_info_a->stmt;
424 : 976256 : stmt_vec_info stmtinfo_b = dr_info_b->stmt;
425 : 976256 : lambda_vector dist_v;
426 : 976256 : unsigned int loop_depth;
427 : :
428 : : /* If user asserted safelen consecutive iterations can be
429 : : executed concurrently, assume independence. */
430 : 1137048 : auto apply_safelen = [&]()
431 : : {
432 : 160792 : if (loop->safelen >= 2)
433 : : {
434 : 7444 : if ((unsigned int) loop->safelen < *max_vf)
435 : 1896 : *max_vf = loop->safelen;
436 : 7444 : LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
437 : 7444 : return true;
438 : : }
439 : : return false;
440 : 976256 : };
441 : :
442 : : /* In loop analysis all data references should be vectorizable. */
443 : 976256 : if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
444 : 976256 : || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
445 : 0 : gcc_unreachable ();
446 : :
447 : : /* Independent data accesses. */
448 : 976256 : if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
449 : 752770 : return opt_result::success ();
450 : :
451 : 223486 : if (dra == drb
452 : 223486 : || (DR_IS_READ (dra) && DR_IS_READ (drb)))
453 : 0 : return opt_result::success ();
454 : :
455 : : /* We do not have to consider dependences between accesses that belong
456 : : to the same group, unless the stride could be smaller than the
457 : : group size. */
458 : 223486 : if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
459 : 103254 : && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
460 : 103254 : == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
461 : 239729 : && !STMT_VINFO_STRIDED_P (stmtinfo_a))
462 : 2297 : return opt_result::success ();
463 : :
464 : : /* Even if we have an anti-dependence then, as the vectorized loop covers at
465 : : least two scalar iterations, there is always also a true dependence.
466 : : As the vectorizer does not re-order loads and stores we can ignore
467 : : the anti-dependence if TBAA can disambiguate both DRs similar to the
468 : : case with known negative distance anti-dependences (positive
469 : : distance anti-dependences would violate TBAA constraints). */
470 : 100609 : if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
471 : 120580 : || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
472 : 336742 : && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
473 : : get_alias_set (DR_REF (drb))))
474 : 5515 : return opt_result::success ();
475 : :
476 : 215674 : if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
477 : 203895 : || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
478 : : {
479 : 14454 : if (apply_safelen ())
480 : 1396 : return opt_result::success ();
481 : :
482 : 13058 : return opt_result::failure_at
483 : 13058 : (stmtinfo_a->stmt,
484 : : "possible alias involving gather/scatter between %T and %T\n",
485 : : DR_REF (dra), DR_REF (drb));
486 : : }
487 : :
488 : : /* Unknown data dependence. */
489 : 201220 : if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
490 : : {
491 : 145803 : if (apply_safelen ())
492 : 6048 : return opt_result::success ();
493 : :
494 : 139755 : if (dump_enabled_p ())
495 : 7430 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
496 : : "versioning for alias required: "
497 : : "can't determine dependence between %T and %T\n",
498 : : DR_REF (dra), DR_REF (drb));
499 : :
500 : : /* Add to list of ddrs that need to be tested at run-time. */
501 : 139755 : return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
502 : : }
503 : :
504 : : /* Known data dependence. */
505 : 55417 : if (DDR_NUM_DIST_VECTS (ddr) == 0)
506 : : {
507 : 535 : if (apply_safelen ())
508 : 0 : return opt_result::success ();
509 : :
510 : 535 : if (dump_enabled_p ())
511 : 132 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
512 : : "versioning for alias required: "
513 : : "bad dist vector for %T and %T\n",
514 : : DR_REF (dra), DR_REF (drb));
515 : : /* Add to list of ddrs that need to be tested at run-time. */
516 : 535 : return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
517 : : }
518 : :
519 : 54882 : loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
520 : :
521 : 54882 : if (DDR_COULD_BE_INDEPENDENT_P (ddr)
522 : 54882 : && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
523 : : loop_depth, max_vf))
524 : 7962 : return opt_result::success ();
525 : :
526 : 87629 : FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
527 : : {
528 : 46940 : int dist = dist_v[loop_depth];
529 : :
530 : 46940 : if (dump_enabled_p ())
531 : 4242 : dump_printf_loc (MSG_NOTE, vect_location,
532 : : "dependence distance = %d.\n", dist);
533 : :
534 : 46940 : if (dist == 0)
535 : : {
536 : 35934 : if (dump_enabled_p ())
537 : 3448 : dump_printf_loc (MSG_NOTE, vect_location,
538 : : "dependence distance == 0 between %T and %T\n",
539 : : DR_REF (dra), DR_REF (drb));
540 : :
541 : : /* When we perform grouped accesses and perform implicit CSE
542 : : by detecting equal accesses and doing disambiguation with
543 : : runtime alias tests like for
544 : : .. = a[i];
545 : : .. = a[i+1];
546 : : a[i] = ..;
547 : : a[i+1] = ..;
548 : : *p = ..;
549 : : .. = a[i];
550 : : .. = a[i+1];
551 : : where we will end up loading { a[i], a[i+1] } once, make
552 : : sure that inserting group loads before the first load and
553 : : stores after the last store will do the right thing.
554 : : Similar for groups like
555 : : a[i] = ...;
556 : : ... = a[i];
557 : : a[i+1] = ...;
558 : : where loads from the group interleave with the store. */
559 : 35934 : if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
560 : 0 : return opt_result::failure_at (stmtinfo_a->stmt,
561 : : "READ_WRITE dependence"
562 : : " in interleaving.\n");
563 : :
564 : 35934 : if (loop->safelen < 2)
565 : : {
566 : 32180 : tree indicator = dr_zero_step_indicator (dra);
567 : 32180 : if (!indicator || integer_zerop (indicator))
568 : 0 : return opt_result::failure_at (stmtinfo_a->stmt,
569 : : "access also has a zero step\n");
570 : 32180 : else if (TREE_CODE (indicator) != INTEGER_CST)
571 : 1348 : vect_check_nonzero_value (loop_vinfo, indicator);
572 : : }
573 : 35934 : continue;
574 : 35934 : }
575 : :
576 : 11006 : if (dist > 0 && DDR_REVERSED_P (ddr))
577 : : {
578 : : /* If DDR_REVERSED_P the order of the data-refs in DDR was
579 : : reversed (to make distance vector positive), and the actual
580 : : distance is negative. */
581 : 3809 : if (dump_enabled_p ())
582 : 105 : dump_printf_loc (MSG_NOTE, vect_location,
583 : : "dependence distance negative.\n");
584 : : /* When doing outer loop vectorization, we need to check if there is
585 : : a backward dependence at the inner loop level if the dependence
586 : : at the outer loop is reversed. See PR81740. */
587 : 3809 : if (nested_in_vect_loop_p (loop, stmtinfo_a)
588 : 3797 : || nested_in_vect_loop_p (loop, stmtinfo_b))
589 : : {
590 : 12 : unsigned inner_depth = index_in_loop_nest (loop->inner->num,
591 : 12 : DDR_LOOP_NEST (ddr));
592 : 12 : if (dist_v[inner_depth] < 0)
593 : 9 : return opt_result::failure_at (stmtinfo_a->stmt,
594 : : "not vectorized, dependence "
595 : : "between data-refs %T and %T\n",
596 : : DR_REF (dra), DR_REF (drb));
597 : : }
598 : : /* Record a negative dependence distance to later limit the
599 : : amount of stmt copying / unrolling we can perform.
600 : : Only need to handle read-after-write dependence. */
601 : 3800 : if (DR_IS_READ (drb)
602 : 76 : && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
603 : 12 : || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
604 : 76 : STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
605 : 3800 : continue;
606 : 3800 : }
607 : :
608 : 7197 : unsigned int abs_dist = abs (dist);
609 : 7197 : if (abs_dist >= 2 && abs_dist < *max_vf)
610 : : {
611 : : /* The dependence distance requires reduction of the maximal
612 : : vectorization factor. */
613 : 486 : *max_vf = abs_dist;
614 : 486 : if (dump_enabled_p ())
615 : 30 : dump_printf_loc (MSG_NOTE, vect_location,
616 : : "adjusting maximal vectorization factor to %i\n",
617 : : *max_vf);
618 : : }
619 : :
620 : 7197 : if (abs_dist >= *max_vf)
621 : : {
622 : : /* Dependence distance does not create dependence, as far as
623 : : vectorization is concerned, in this case. */
624 : 975 : if (dump_enabled_p ())
625 : 437 : dump_printf_loc (MSG_NOTE, vect_location,
626 : : "dependence distance >= VF.\n");
627 : 975 : continue;
628 : : }
629 : :
630 : 6222 : return opt_result::failure_at (stmtinfo_a->stmt,
631 : : "not vectorized, possible dependence "
632 : : "between data-refs %T and %T\n",
633 : : DR_REF (dra), DR_REF (drb));
634 : : }
635 : :
636 : 40689 : return opt_result::success ();
637 : : }
638 : :
639 : : /* Function vect_analyze_early_break_dependences.
640 : :
641 : : Examine all the data references in the loop and make sure that if we have
642 : : multiple exits that we are able to safely move stores such that they become
643 : : safe for vectorization. The function also calculates the place where to move
644 : : the instructions to and computes what the new vUSE chain should be.
645 : :
646 : : This works in tandem with the CFG that will be produced by
647 : : slpeel_tree_duplicate_loop_to_edge_cfg later on.
648 : :
649 : : This function tries to validate whether an early break vectorization
650 : : is possible for the current instruction sequence. Returns True i
651 : : possible, otherwise False.
652 : :
653 : : Requirements:
654 : : - Any memory access must be to a fixed size buffer.
655 : : - There must not be any loads and stores to the same object.
656 : : - Multiple loads are allowed as long as they don't alias.
657 : :
658 : : NOTE:
659 : : This implementation is very conservative. Any overlapping loads/stores
660 : : that take place before the early break statement gets rejected aside from
661 : : WAR dependencies.
662 : :
663 : : i.e.:
664 : :
665 : : a[i] = 8
666 : : c = a[i]
667 : : if (b[i])
668 : : ...
669 : :
670 : : is not allowed, but
671 : :
672 : : c = a[i]
673 : : a[i] = 8
674 : : if (b[i])
675 : : ...
676 : :
677 : : is which is the common case. */
678 : :
679 : : static opt_result
680 : 138854 : vect_analyze_early_break_dependences (loop_vec_info loop_vinfo)
681 : : {
682 : 138854 : DUMP_VECT_SCOPE ("vect_analyze_early_break_dependences");
683 : :
684 : : /* List of all load data references found during traversal. */
685 : 138854 : auto_vec<data_reference *> bases;
686 : 138854 : basic_block dest_bb = NULL;
687 : :
688 : 138854 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
689 : 138854 : class loop *loop_nest = loop_outer (loop);
690 : :
691 : 138854 : if (dump_enabled_p ())
692 : 1450 : dump_printf_loc (MSG_NOTE, vect_location,
693 : : "loop contains multiple exits, analyzing"
694 : : " statement dependencies.\n");
695 : :
696 : 138854 : if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
697 : 26390 : if (dump_enabled_p ())
698 : 268 : dump_printf_loc (MSG_NOTE, vect_location,
699 : : "alternate exit has been chosen as main exit.\n");
700 : :
701 : : /* Since we don't support general control flow, the location we'll move the
702 : : side-effects to is always the latch connected exit. When we support
703 : : general control flow we can do better but for now this is fine. Move
704 : : side-effects to the in-loop destination of the last early exit. For the
705 : : PEELED case we move the side-effects to the latch block as this is
706 : : guaranteed to be the last block to be executed when a vector iteration
707 : : finished. */
708 : 138854 : if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
709 : 26390 : dest_bb = loop->latch;
710 : : else
711 : 112464 : dest_bb = single_pred (loop->latch);
712 : :
713 : : /* We start looking from dest_bb, for the non-PEELED case we don't want to
714 : : move any stores already present, but we do want to read and validate the
715 : : loads. */
716 : 138854 : basic_block bb = dest_bb;
717 : :
718 : : /* We move stores across all loads to the beginning of dest_bb, so
719 : : the first block processed below doesn't need dependence checking. */
720 : 138854 : bool check_deps = false;
721 : :
722 : 501714 : do
723 : : {
724 : 320284 : gimple_stmt_iterator gsi = gsi_last_bb (bb);
725 : :
726 : : /* Now analyze all the remaining statements and try to determine which
727 : : instructions are allowed/needed to be moved. */
728 : 2447111 : while (!gsi_end_p (gsi))
729 : : {
730 : 2127451 : gimple *stmt = gsi_stmt (gsi);
731 : 2127451 : gsi_prev (&gsi);
732 : 2127451 : if (is_gimple_debug (stmt))
733 : 1881637 : continue;
734 : :
735 : 1128101 : stmt_vec_info stmt_vinfo
736 : 1128101 : = vect_stmt_to_vectorize (loop_vinfo->lookup_stmt (stmt));
737 : 1128101 : auto dr_ref = STMT_VINFO_DATA_REF (stmt_vinfo);
738 : 1128101 : if (!dr_ref)
739 : 871603 : continue;
740 : :
741 : : /* We know everything below dest_bb is safe since we know we
742 : : had a full vector iteration when reaching it. Either by
743 : : the loop entry / IV exit test being last or because this
744 : : is the loop latch itself. */
745 : 256498 : if (!check_deps)
746 : 10684 : continue;
747 : :
748 : : /* Check if vector accesses to the object will be within bounds.
749 : : must be a constant or assume loop will be versioned or niters
750 : : bounded by VF so accesses are within range. We only need to check
751 : : the reads since writes are moved to a safe place where if we get
752 : : there we know they are safe to perform. */
753 : 245814 : if (DR_IS_READ (dr_ref))
754 : : {
755 : 229740 : dr_set_safe_speculative_read_required (stmt_vinfo, true);
756 : 229740 : bool inbounds = ref_within_array_bound (stmt, DR_REF (dr_ref));
757 : 229740 : DR_SCALAR_KNOWN_BOUNDS (STMT_VINFO_DR_INFO (stmt_vinfo)) = inbounds;
758 : :
759 : 229740 : if (dump_enabled_p ())
760 : 2403 : dump_printf_loc (MSG_NOTE, vect_location,
761 : : "marking DR (read) as possibly needing peeling "
762 : : "for alignment at %G", stmt);
763 : : }
764 : :
765 : 245814 : if (DR_IS_READ (dr_ref))
766 : 229740 : bases.safe_push (dr_ref);
767 : 16074 : else if (DR_IS_WRITE (dr_ref))
768 : : {
769 : : /* We are moving writes down in the CFG. To be sure that this
770 : : is valid after vectorization we have to check all the loads
771 : : we are sinking the stores past to see if any of them may
772 : : alias or are the same object.
773 : :
774 : : Same objects will not be an issue because unless the store
775 : : is marked volatile the value can be forwarded. If the
776 : : store is marked volatile we don't vectorize the loop
777 : : anyway.
778 : :
779 : : That leaves the check for aliasing. We don't really need
780 : : to care about the stores aliasing with each other since the
781 : : stores are moved in order so the effects are still observed
782 : : correctly. This leaves the check for WAR dependencies
783 : : which we would be introducing here if the DR can alias.
784 : : The check is quadratic in loads/stores but I have not found
785 : : a better API to do this. I believe all loads and stores
786 : : must be checked. We also must check them when we
787 : : encountered the store, since we don't care about loads past
788 : : the store. */
789 : :
790 : 49322 : for (auto dr_read : bases)
791 : 15484 : if (dr_may_alias_p (dr_ref, dr_read, loop_nest))
792 : : {
793 : 624 : if (dump_enabled_p ())
794 : 4 : dump_printf_loc (MSG_MISSED_OPTIMIZATION,
795 : : vect_location,
796 : : "early breaks not supported: "
797 : : "overlapping loads and stores "
798 : : "found before the break "
799 : : "statement.\n");
800 : :
801 : 624 : return opt_result::failure_at (stmt,
802 : : "can't safely apply code motion to dependencies"
803 : : " to vectorize the early exit. %G may alias with"
804 : : " %G\n", stmt, dr_read->stmt);
805 : : }
806 : : }
807 : :
808 : 490380 : if (gimple_vdef (stmt))
809 : : {
810 : 15450 : if (dump_enabled_p ())
811 : 280 : dump_printf_loc (MSG_NOTE, vect_location,
812 : : "==> recording stmt %G", stmt);
813 : :
814 : 15450 : LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).safe_push (stmt);
815 : : }
816 : 704670 : else if (gimple_vuse (stmt))
817 : : {
818 : 229740 : LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo).safe_insert (0, stmt);
819 : 229740 : if (dump_enabled_p ())
820 : 2403 : dump_printf_loc (MSG_NOTE, vect_location,
821 : : "marked statement for vUSE update: %G", stmt);
822 : : }
823 : : }
824 : :
825 : 319660 : if (!single_pred_p (bb))
826 : : {
827 : 138230 : gcc_assert (bb == loop->header);
828 : 138230 : break;
829 : : }
830 : :
831 : : /* If we possibly sink through a virtual PHI make sure to elide that. */
832 : 181430 : if (gphi *vphi = get_virtual_phi (bb))
833 : 107 : LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).safe_push (vphi);
834 : :
835 : : /* All earlier blocks need dependence checking. */
836 : 181430 : check_deps = true;
837 : 181430 : bb = single_pred (bb);
838 : 181430 : }
839 : : while (1);
840 : :
841 : : /* We don't allow outer -> inner loop transitions which should have been
842 : : trapped already during loop form analysis. */
843 : 138230 : gcc_assert (dest_bb->loop_father == loop);
844 : :
845 : : /* Check that the destination block we picked has only one pred. To relax this we
846 : : have to take special care when moving the statements. We don't currently support
847 : : such control flow however this check is there to simplify how we handle
848 : : labels that may be present anywhere in the IL. This check is to ensure that the
849 : : labels aren't significant for the CFG. */
850 : 138230 : if (!single_pred (dest_bb))
851 : 0 : return opt_result::failure_at (vect_location,
852 : : "chosen loop exit block (BB %d) does not have a "
853 : : "single predecessor which is currently not "
854 : : "supported for early break vectorization.\n",
855 : : dest_bb->index);
856 : :
857 : 138230 : LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo) = dest_bb;
858 : :
859 : 138230 : if (!LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo).is_empty ())
860 : : {
861 : : /* All uses shall be updated to that of the first load. Entries are
862 : : stored in reverse order. */
863 : 127814 : tree vuse = gimple_vuse (LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo).last ());
864 : 356756 : for (auto g : LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo))
865 : : {
866 : 228942 : if (dump_enabled_p ())
867 : 2395 : dump_printf_loc (MSG_NOTE, vect_location,
868 : : "will update use: %T, mem_ref: %G", vuse, g);
869 : : }
870 : : }
871 : :
872 : 138230 : if (dump_enabled_p ())
873 : 1446 : dump_printf_loc (MSG_NOTE, vect_location,
874 : : "recorded statements to be moved to BB %d\n",
875 : 1446 : LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo)->index);
876 : :
877 : 138230 : return opt_result::success ();
878 : 138854 : }
879 : :
880 : : /* Function vect_analyze_data_ref_dependences.
881 : :
882 : : Examine all the data references in the loop, and make sure there do not
883 : : exist any data dependences between them. Set *MAX_VF according to
884 : : the maximum vectorization factor the data dependences allow. */
885 : :
886 : : opt_result
887 : 321461 : vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
888 : : unsigned int *max_vf)
889 : : {
890 : 321461 : unsigned int i;
891 : 321461 : struct data_dependence_relation *ddr;
892 : :
893 : 321461 : DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
894 : :
895 : 321461 : if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
896 : : {
897 : 159800 : LOOP_VINFO_DDRS (loop_vinfo)
898 : 159800 : .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
899 : 159800 : * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
900 : : /* We do not need read-read dependences. */
901 : 319600 : bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
902 : : &LOOP_VINFO_DDRS (loop_vinfo),
903 : 159800 : LOOP_VINFO_LOOP_NEST (loop_vinfo),
904 : : false);
905 : 159800 : gcc_assert (res);
906 : : }
907 : :
908 : 321461 : LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
909 : :
910 : : /* For epilogues we either have no aliases or alias versioning
911 : : was applied to original loop. Therefore we may just get max_vf
912 : : using VF of original loop. */
913 : 321461 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
914 : 14926 : *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
915 : : else
916 : 1263313 : FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
917 : : {
918 : 976256 : opt_result res
919 : 976256 : = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
920 : 976256 : if (!res)
921 : 19478 : return res;
922 : : }
923 : :
924 : : /* If we have early break statements in the loop, check to see if they
925 : : are of a form we can vectorizer. */
926 : 301983 : if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
927 : 138854 : return vect_analyze_early_break_dependences (loop_vinfo);
928 : :
929 : 163129 : return opt_result::success ();
930 : : }
931 : :
932 : :
933 : : /* Function vect_slp_analyze_data_ref_dependence.
934 : :
935 : : Return TRUE if there (might) exist a dependence between a memory-reference
936 : : DRA and a memory-reference DRB for VINFO. When versioning for alias
937 : : may check a dependence at run-time, return FALSE. Adjust *MAX_VF
938 : : according to the data dependence. */
939 : :
940 : : static bool
941 : 6840419 : vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
942 : : struct data_dependence_relation *ddr)
943 : : {
944 : 6840419 : struct data_reference *dra = DDR_A (ddr);
945 : 6840419 : struct data_reference *drb = DDR_B (ddr);
946 : 6840419 : dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
947 : 6840419 : dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
948 : :
949 : : /* We need to check dependences of statements marked as unvectorizable
950 : : as well, they still can prohibit vectorization. */
951 : :
952 : : /* Independent data accesses. */
953 : 6840419 : if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
954 : : return false;
955 : :
956 : 1092588 : if (dra == drb)
957 : : return false;
958 : :
959 : : /* Read-read is OK. */
960 : 8437 : if (DR_IS_READ (dra) && DR_IS_READ (drb))
961 : : return false;
962 : :
963 : : /* If dra and drb are part of the same interleaving chain consider
964 : : them independent. */
965 : 8437 : if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
966 : 8437 : && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
967 : 8437 : == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
968 : : return false;
969 : :
970 : : /* Unknown data dependence. */
971 : 8437 : if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
972 : : {
973 : 8437 : if (dump_enabled_p ())
974 : 4 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
975 : : "can't determine dependence between %T and %T\n",
976 : : DR_REF (dra), DR_REF (drb));
977 : : }
978 : 0 : else if (dump_enabled_p ())
979 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
980 : : "determined dependence between %T and %T\n",
981 : : DR_REF (dra), DR_REF (drb));
982 : :
983 : : return true;
984 : : }
985 : :
986 : :
/* Analyze dependences involved in the transform of a store SLP NODE
   of VINFO.  Return true if all scalar stores of the group can be
   sunk to the position of the last one, false otherwise.  */

static bool
vect_slp_analyze_store_dependences (vec_info *vinfo, slp_tree node)
{
  /* This walks over all stmts involved in the SLP store done
     in NODE verifying we can sink them up to the last stmt in the
     group.  */
  stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
  gcc_assert (DR_IS_WRITE (STMT_VINFO_DATA_REF (last_access_info)));

  for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
    {
      stmt_vec_info access_info
	= vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
      /* The last store trivially does not need to be moved.  */
      if (access_info == last_access_info)
	continue;
      data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
      /* An ao_ref for DR_A, initialized on first use via
	 REF_INITIALIZED_P below.  NOTE(review): the flag is never set
	 to true in this function, so the ao_ref is re-initialized for
	 every stmt lacking a data-ref -- harmless but redundant;
	 confirm intent.  */
      ao_ref ref;
      bool ref_initialized_p = false;
      /* Walk forward from this store to the last store of the group,
	 checking every intervening memory access.  */
      for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
	   gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  /* Stmts without a virtual use do not touch memory.  */
	  if (! gimple_vuse (stmt))
	    continue;

	  /* If we couldn't record a (single) data reference for this
	     stmt we have to resort to the alias oracle.  */
	  stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
	  data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
	  if (!dr_b)
	    {
	      /* We are moving a store - this means
		 we cannot use TBAA for disambiguation.  */
	      if (!ref_initialized_p)
		ao_ref_init (&ref, DR_REF (dr_a));
	      if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
		  || ref_maybe_used_by_stmt_p (stmt, &ref, false))
		return false;
	      continue;
	    }

	  /* The visited marker is only applied by the instance-level
	     analysis after this function succeeded, so no stmt should
	     carry it here.  */
	  gcc_assert (!gimple_visited_p (stmt));

	  /* Full data-dependence test between the store being moved and
	     the intervening access; any dependence blocks the sinking.  */
	  ddr_p ddr = initialize_data_dependence_relation (dr_a,
							   dr_b, vNULL);
	  bool dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
	  free_dependence_relation (ddr);
	  if (dependent)
	    return false;
	}
    }
  return true;
}
1042 : :
/* Analyze dependences involved in the transform of a load SLP NODE.  STORES
   contain the vector of scalar stores of this instance if we are
   disambiguating the loads.  Return true if all scalar loads of the
   group can be hoisted to the position of the first one.  */

static bool
vect_slp_analyze_load_dependences (vec_info *vinfo, slp_tree node,
				   vec<stmt_vec_info> stores,
				   stmt_vec_info last_store_info)
{
  /* This walks over all stmts involved in the SLP load done
     in NODE verifying we can hoist them up to the first stmt in the
     group.  */
  stmt_vec_info first_access_info = vect_find_first_scalar_stmt_in_slp (node);
  gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (first_access_info)));

  for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
    {
      /* Skip gaps (NULL scalar stmts) in the SLP node.  */
      if (! SLP_TREE_SCALAR_STMTS (node)[k])
	continue;
      stmt_vec_info access_info
	= vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
      /* The first load trivially does not need to be moved.  */
      if (access_info == first_access_info)
	continue;
      data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
      /* Lazily initialized ao_ref for DR_A.  NOTE(review):
	 REF_INITIALIZED_P is never set to true here, so CHECK_HOIST
	 re-initializes the ao_ref on each invocation -- harmless but
	 redundant; confirm intent.  */
      ao_ref ref;
      bool ref_initialized_p = false;
      /* Store groups whose members we already considered, to avoid
	 re-walking a group for each of its members.  */
      hash_set<stmt_vec_info> grp_visited;
      /* Walk backwards from this load to the first load of the group,
	 checking every intervening memory write.  */
      for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
	   gsi_stmt (gsi) != first_access_info->stmt; gsi_prev (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  /* Stmts without a virtual definition do not write memory.  */
	  if (! gimple_vdef (stmt))
	    continue;

	  stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);

	  /* If we run into a store of this same instance (we've just
	     marked those) then delay dependence checking until we run
	     into the last store because this is where it will have
	     been sunk to (and we verified that we can do that already).  */
	  if (gimple_visited_p (stmt))
	    {
	      if (stmt_info != last_store_info)
		continue;

	      /* At the sink position check this load against every
		 scalar store of the instance.  */
	      for (stmt_vec_info &store_info : stores)
		{
		  data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
		  ddr_p ddr = initialize_data_dependence_relation
				(dr_a, store_dr, vNULL);
		  bool dependent
		    = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
		  free_dependence_relation (ddr);
		  if (dependent)
		    return false;
		}
	      continue;
	    }

	  /* Check whether hoisting the load past the store STMT_INFO is
	     valid; returns false on a (possible) dependence.  */
	  auto check_hoist = [&] (stmt_vec_info stmt_info) -> bool
	    {
	      /* We are hoisting a load - this means we can use TBAA for
		 disambiguation.  */
	      if (!ref_initialized_p)
		ao_ref_init (&ref, DR_REF (dr_a));
	      if (stmt_may_clobber_ref_p_1 (stmt_info->stmt, &ref, true))
		{
		  /* If we couldn't record a (single) data reference for this
		     stmt we have to give up now.  */
		  data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
		  if (!dr_b)
		    return false;
		  /* Otherwise fall back to a full dependence test.  */
		  ddr_p ddr = initialize_data_dependence_relation (dr_a,
								   dr_b, vNULL);
		  bool dependent
		    = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
		  free_dependence_relation (ddr);
		  if (dependent)
		    return false;
		}
	      /* No dependence.  */
	      return true;
	    };
	  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
	    {
	      /* When we run into a store group we have to honor
		 that earlier stores might be moved here.  We don't
		 know exactly which and where to since we lack a
		 back-mapping from DR to SLP node, so assume all
		 earlier stores are sunk here.  It's enough to
		 consider the last stmt of a group for this.
		 ??? Both this and the fact that we disregard that
		 the conflicting instance might be removed later
		 is overly conservative.  */
	      if (!grp_visited.add (DR_GROUP_FIRST_ELEMENT (stmt_info)))
		for (auto store_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
		     store_info != NULL;
		     store_info = DR_GROUP_NEXT_ELEMENT (store_info))
		  /* Only members at or after STMT_INFO in the IL can be
		     sunk to this point.  */
		  if ((store_info == stmt_info
		       || get_later_stmt (store_info, stmt_info) == stmt_info)
		      && !check_hoist (store_info))
		    return false;
	    }
	  else
	    {
	      if (!check_hoist (stmt_info))
		return false;
	    }
	}
    }
  return true;
}
1155 : :
1156 : :
/* Function vect_slp_analyze_instance_dependence.

   Examine the data references of the SLP instance INSTANCE of VINFO and
   verify the code motion the transform performs is valid: sinking the
   instance's stores to the last scalar store and hoisting its loads to
   the first scalar load.  Additionally mark loads that would likely
   defeat store-to-load forwarding (see PR115777).  Return true on
   success.  */

bool
vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance)
{
  DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");

  /* The stores of this instance are at the root of the SLP tree.  */
  slp_tree store = NULL;
  if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store)
    store = SLP_INSTANCE_TREE (instance);

  /* Verify we can sink stores to the vectorized stmt insert location.  */
  stmt_vec_info last_store_info = NULL;
  if (store)
    {
      if (! vect_slp_analyze_store_dependences (vinfo, store))
	return false;

      /* Mark stores in this instance and remember the last one.  The
	 marker lets the load analysis below recognize our own stores
	 and defer checking them to their sink position.  */
      last_store_info = vect_find_last_scalar_stmt_in_slp (store);
      for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
	gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
    }

  bool res = true;

  /* Verify we can hoist loads to the vectorized stmt insert location,
     special-casing stores of this instance.  */
  for (slp_tree &load : SLP_INSTANCE_LOADS (instance))
    if (! vect_slp_analyze_load_dependences (vinfo, load,
					     store
					     ? SLP_TREE_SCALAR_STMTS (store)
					     : vNULL, last_store_info))
      {
	res = false;
	break;
      }

  /* Unset the visited flag.  */
  if (store)
    for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
      gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);

  /* If this is a SLP instance with a store check if there's a dependent
     load that cannot be forwarded from a previous iteration of a loop
     both are in.  This is to avoid situations like that in PR115777.  */
  if (res && store)
    {
      stmt_vec_info store_info
	= DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (store)[0]);
      class loop *store_loop = gimple_bb (store_info->stmt)->loop_father;
      /* Only relevant when the store sits in some loop.  */
      if (! loop_outer (store_loop))
	return res;
      vec<loop_p> loop_nest;
      loop_nest.create (1);
      loop_nest.quick_push (store_loop);
      /* Data-ref for the store, created lazily for the first candidate
	 load and shared by all of them; freed below.  */
      data_reference *drs = nullptr;
      for (slp_tree &load : SLP_INSTANCE_LOADS (instance))
	{
	  if (! STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (load)[0]))
	    continue;
	  stmt_vec_info load_info
	    = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (load)[0]);
	  if (gimple_bb (load_info->stmt)->loop_father != store_loop)
	    continue;

	  /* For now concern ourselves with write-after-read as we also
	     only look for re-use of the store within the same SLP instance.
	     We can still get a RAW here when the instance contains a PHI
	     with a backedge though, thus this test.  */
	  if (! vect_stmt_dominates_stmt_p (STMT_VINFO_STMT (load_info),
					    STMT_VINFO_STMT (store_info)))
	    continue;

	  if (! drs)
	    {
	      drs = create_data_ref (loop_preheader_edge (store_loop),
				     store_loop,
				     DR_REF (STMT_VINFO_DATA_REF (store_info)),
				     store_info->stmt, false, false);
	      /* Without a known base and constant step we cannot reason
		 about the store's per-iteration placement at all.  */
	      if (! DR_BASE_ADDRESS (drs)
		  || TREE_CODE (DR_STEP (drs)) != INTEGER_CST)
		break;
	    }
	  data_reference *drl
	    = create_data_ref (loop_preheader_edge (store_loop),
			       store_loop,
			       DR_REF (STMT_VINFO_DATA_REF (load_info)),
			       load_info->stmt, true, false);

	  /* See whether the DRs have a known constant distance throughout
	     the containing loop iteration.  */
	  if (! DR_BASE_ADDRESS (drl)
	      || ! operand_equal_p (DR_STEP (drs), DR_STEP (drl))
	      || ! operand_equal_p (DR_BASE_ADDRESS (drs),
				    DR_BASE_ADDRESS (drl))
	      || ! operand_equal_p (DR_OFFSET (drs), DR_OFFSET (drl)))
	    {
	      free_data_ref (drl);
	      continue;
	    }

	  /* If the next iteration load overlaps with a non-power-of-two offset
	     we are surely failing any STLF attempt.  */
	  HOST_WIDE_INT step = TREE_INT_CST_LOW (DR_STEP (drl));
	  unsigned HOST_WIDE_INT sizes
	    = (TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drs))))
	       * DR_GROUP_SIZE (store_info));
	  unsigned HOST_WIDE_INT sizel
	    = (TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drl))))
	       * DR_GROUP_SIZE (load_info));
	  if (ranges_overlap_p (TREE_INT_CST_LOW (DR_INIT (drl)) + step, sizel,
				TREE_INT_CST_LOW (DR_INIT (drs)), sizes))
	    {
	      unsigned HOST_WIDE_INT dist
		= absu_hwi (TREE_INT_CST_LOW (DR_INIT (drl)) + step
			    - TREE_INT_CST_LOW (DR_INIT (drs)));
	      poly_uint64 loadsz = tree_to_poly_uint64
				     (TYPE_SIZE_UNIT (SLP_TREE_VECTYPE (load)));
	      poly_uint64 storesz = tree_to_poly_uint64
				     (TYPE_SIZE_UNIT (SLP_TREE_VECTYPE (store)));
	      /* When the overlap aligns with vector sizes used for the loads
		 and the vector stores are larger or equal to the loads
		 forwarding should work.  */
	      if (maybe_gt (loadsz, storesz) || ! multiple_p (dist, loadsz))
		load->avoid_stlf_fail = true;
	    }
	  free_data_ref (drl);
	}
      if (drs)
	free_data_ref (drs);
      loop_nest.release ();
    }

  return res;
}
1298 : :
/* Return the misalignment of DR_INFO accessed in VECTYPE with OFFSET
   applied.  The result is a byte misalignment relative to the recorded
   target alignment, or DR_MISALIGNMENT_UNKNOWN when it cannot be
   determined.  */

int
dr_misalignment (dr_vec_info *dr_info, tree vectype, poly_int64 offset)
{
  HOST_WIDE_INT diff = 0;
  /* Alignment is only analyzed for the first element of a DR group,
     use that but adjust misalignment by the offset of the access.  */
  if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt))
    {
      dr_vec_info *first_dr
	= STMT_VINFO_DR_INFO (DR_GROUP_FIRST_ELEMENT (dr_info->stmt));
      /* vect_analyze_data_ref_accesses guarantees that DR_INIT are
	 INTEGER_CSTs and the first element in the group has the lowest
	 address.  */
      diff = (TREE_INT_CST_LOW (DR_INIT (dr_info->dr))
	      - TREE_INT_CST_LOW (DR_INIT (first_dr->dr)));
      gcc_assert (diff >= 0);
      /* All further queries go against the group leader's info.  */
      dr_info = first_dr;
    }

  int misalign = dr_info->misalignment;
  gcc_assert (misalign != DR_MISALIGNMENT_UNINITIALIZED);
  /* Unknown stays unknown; no adjustment can refine it.  */
  if (misalign == DR_MISALIGNMENT_UNKNOWN)
    return misalign;

  /* If the access is only aligned for a vector type with smaller alignment
     requirement the access has unknown misalignment.  */
  if (maybe_lt (dr_info->target_alignment * BITS_PER_UNIT,
		targetm.vectorize.preferred_vector_alignment (vectype)))
    return DR_MISALIGNMENT_UNKNOWN;

  /* Apply the offset from the DR group start and the externally supplied
     offset which can for example result from a negative stride access.  */
  poly_int64 misalignment = misalign + diff + offset;

  /* Below we reject compile-time non-constant target alignments, but if
     our misalignment is zero, then we are known to already be aligned
     w.r.t. any such possible target alignment.  */
  if (known_eq (misalignment, 0))
    return 0;

  unsigned HOST_WIDE_INT target_alignment_c;
  /* A runtime-variable target alignment or a misalignment that is not
     a compile-time constant modulo it is reported as unknown.  */
  if (!dr_info->target_alignment.is_constant (&target_alignment_c)
      || !known_misalignment (misalignment, target_alignment_c, &misalign))
    return DR_MISALIGNMENT_UNKNOWN;
  return misalign;
}
1348 : :
1349 : : /* Record the base alignment guarantee given by DRB, which occurs
1350 : : in STMT_INFO. */
1351 : :
1352 : : static void
1353 : 4471140 : vect_record_base_alignment (vec_info *vinfo, stmt_vec_info stmt_info,
1354 : : innermost_loop_behavior *drb)
1355 : : {
1356 : 4471140 : bool existed;
1357 : 4471140 : std::pair<stmt_vec_info, innermost_loop_behavior *> &entry
1358 : 4471140 : = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
1359 : 4471140 : if (!existed || entry.second->base_alignment < drb->base_alignment)
1360 : : {
1361 : 1344946 : entry = std::make_pair (stmt_info, drb);
1362 : 1344946 : if (dump_enabled_p ())
1363 : 31202 : dump_printf_loc (MSG_NOTE, vect_location,
1364 : : "recording new base alignment for %T\n"
1365 : : " alignment: %d\n"
1366 : : " misalignment: %d\n"
1367 : : " based on: %G",
1368 : : drb->base_address,
1369 : : drb->base_alignment,
1370 : : drb->base_misalignment,
1371 : : stmt_info->stmt);
1372 : : }
1373 : 4471140 : }
1374 : :
1375 : : /* If the region we're going to vectorize is reached, all unconditional
1376 : : data references occur at least once. We can therefore pool the base
1377 : : alignment guarantees from each unconditional reference. Do this by
1378 : : going through all the data references in VINFO and checking whether
1379 : : the containing statement makes the reference unconditionally. If so,
1380 : : record the alignment of the base address in VINFO so that it can be
1381 : : used for all other references with the same base. */
1382 : :
1383 : : void
1384 : 986525 : vect_record_base_alignments (vec_info *vinfo)
1385 : : {
1386 : 986525 : loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1387 : 366447 : class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
1388 : 14910833 : for (data_reference *dr : vinfo->shared->datarefs)
1389 : : {
1390 : 12047512 : dr_vec_info *dr_info = vinfo->lookup_dr (dr);
1391 : 12047512 : stmt_vec_info stmt_info = dr_info->stmt;
1392 : 12047512 : if (!DR_IS_CONDITIONAL_IN_STMT (dr)
1393 : 12039942 : && STMT_VINFO_VECTORIZABLE (stmt_info)
1394 : 4487884 : && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1395 : : {
1396 : 4469729 : vect_record_base_alignment (vinfo, stmt_info, &DR_INNERMOST (dr));
1397 : :
1398 : : /* If DR is nested in the loop that is being vectorized, we can also
1399 : : record the alignment of the base wrt the outer loop. */
1400 : 12855870 : if (loop && nested_in_vect_loop_p (loop, stmt_info))
1401 : 1411 : vect_record_base_alignment
1402 : 1411 : (vinfo, stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
1403 : : }
1404 : : }
1405 : 986525 : }
1406 : :
1407 : : /* Function vect_compute_data_ref_alignment
1408 : :
1409 : : Compute the misalignment of the data reference DR_INFO when vectorizing
1410 : : with VECTYPE.
1411 : :
1412 : : Output:
1413 : : 1. initialized misalignment info for DR_INFO
1414 : :
1415 : : FOR NOW: No analysis is actually performed. Misalignment is calculated
1416 : : only for trivial cases. TODO. */
1417 : :
1418 : : static void
1419 : 1486128 : vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info,
1420 : : tree vectype)
1421 : : {
1422 : 1486128 : stmt_vec_info stmt_info = dr_info->stmt;
1423 : 1486128 : vec_base_alignments *base_alignments = &vinfo->base_alignments;
1424 : 1486128 : loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1425 : 1486128 : class loop *loop = NULL;
1426 : 1486128 : tree ref = DR_REF (dr_info->dr);
1427 : :
1428 : 1486128 : if (dump_enabled_p ())
1429 : 49355 : dump_printf_loc (MSG_NOTE, vect_location,
1430 : : "vect_compute_data_ref_alignment:\n");
1431 : :
1432 : 1486128 : if (loop_vinfo)
1433 : 712514 : loop = LOOP_VINFO_LOOP (loop_vinfo);
1434 : :
1435 : : /* Initialize misalignment to unknown. */
1436 : 1486128 : SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1437 : :
1438 : 1486128 : if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1439 : : return;
1440 : :
1441 : 1466425 : innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
1442 : 1466425 : bool step_preserves_misalignment_p;
1443 : :
1444 : 1466425 : poly_uint64 vector_alignment
1445 : 1466425 : = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
1446 : : BITS_PER_UNIT);
1447 : :
1448 : 1466425 : if (loop_vinfo
1449 : 1466425 : && dr_safe_speculative_read_required (stmt_info))
1450 : : {
1451 : : /* The required target alignment must be a power-of-2 value and is
1452 : : computed as the product of vector element size, VF and group size.
1453 : : We compute the constant part first as VF may be a variable. For
1454 : : variable VF, the power-of-2 check of VF is deferred to runtime. */
1455 : 303795 : auto align_factor_c
1456 : 303795 : = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1457 : 303795 : if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1458 : 82152 : align_factor_c *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
1459 : :
1460 : 303795 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1461 : 303795 : poly_uint64 new_alignment = vf * align_factor_c;
1462 : :
1463 : 607590 : if ((vf.is_constant () && pow2p_hwi (new_alignment.to_constant ()))
1464 : : || (!vf.is_constant () && pow2p_hwi (align_factor_c)))
1465 : : {
1466 : 250086 : if (dump_enabled_p ())
1467 : : {
1468 : 3108 : dump_printf_loc (MSG_NOTE, vect_location,
1469 : : "alignment increased due to early break to ");
1470 : 3108 : dump_dec (MSG_NOTE, new_alignment);
1471 : 3108 : dump_printf (MSG_NOTE, " bytes.\n");
1472 : : }
1473 : 250086 : vector_alignment = new_alignment;
1474 : : }
1475 : : }
1476 : :
1477 : 1466425 : SET_DR_TARGET_ALIGNMENT (dr_info, vector_alignment);
1478 : :
1479 : : /* If the main loop has peeled for alignment we have no way of knowing
1480 : : whether the data accesses in the epilogues are aligned. We can't at
1481 : : compile time answer the question whether we have entered the main loop or
1482 : : not. Fixes PR 92351. */
1483 : 1466425 : if (loop_vinfo)
1484 : : {
1485 : 692811 : loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1486 : 692811 : if (orig_loop_vinfo
1487 : 32280 : && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
1488 : : return;
1489 : : }
1490 : :
1491 : 1466208 : unsigned HOST_WIDE_INT vect_align_c;
1492 : 1466208 : if (!vector_alignment.is_constant (&vect_align_c))
1493 : : return;
1494 : :
1495 : : /* No step for BB vectorization. */
1496 : 1466208 : if (!loop)
1497 : : {
1498 : 773614 : gcc_assert (integer_zerop (drb->step));
1499 : : step_preserves_misalignment_p = true;
1500 : : }
1501 : :
1502 : : else
1503 : : {
1504 : : /* We can only use base and misalignment information relative to
1505 : : an innermost loop if the misalignment stays the same throughout the
1506 : : execution of the loop. As above, this is the case if the stride of
1507 : : the dataref evenly divides by the alignment. Make sure to check
1508 : : previous epilogues and the main loop. */
1509 : : step_preserves_misalignment_p = true;
1510 : : auto lvinfo = loop_vinfo;
1511 : 1417980 : while (lvinfo)
1512 : : {
1513 : 725386 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (lvinfo);
1514 : 725386 : step_preserves_misalignment_p
1515 : 725386 : &= multiple_p (drb->step_alignment * vf, vect_align_c);
1516 : 725386 : lvinfo = LOOP_VINFO_ORIG_LOOP_INFO (lvinfo);
1517 : : }
1518 : :
1519 : 692594 : if (!step_preserves_misalignment_p && dump_enabled_p ())
1520 : 285 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1521 : : "step doesn't divide the vector alignment.\n");
1522 : :
1523 : : /* In case the dataref is in an inner-loop of the loop that is being
1524 : : vectorized (LOOP), we use the base and misalignment information
1525 : : relative to the outer-loop (LOOP). This is ok only if the
1526 : : misalignment stays the same throughout the execution of the
1527 : : inner-loop, which is why we have to check that the stride of the
1528 : : dataref in the inner-loop evenly divides by the vector alignment. */
1529 : 692594 : if (step_preserves_misalignment_p
1530 : 692594 : && nested_in_vect_loop_p (loop, stmt_info))
1531 : : {
1532 : 1410 : step_preserves_misalignment_p
1533 : 1410 : = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
1534 : :
1535 : 1410 : if (dump_enabled_p ())
1536 : : {
1537 : 495 : if (step_preserves_misalignment_p)
1538 : 355 : dump_printf_loc (MSG_NOTE, vect_location,
1539 : : "inner step divides the vector alignment.\n");
1540 : : else
1541 : 140 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1542 : : "inner step doesn't divide the vector"
1543 : : " alignment.\n");
1544 : : }
1545 : : }
1546 : : }
1547 : :
1548 : 1466208 : unsigned int base_alignment = drb->base_alignment;
1549 : 1466208 : unsigned int base_misalignment = drb->base_misalignment;
1550 : :
1551 : : /* Calculate the maximum of the pooled base address alignment and the
1552 : : alignment that we can compute for DR itself. */
1553 : 1466208 : std::pair<stmt_vec_info, innermost_loop_behavior *> *entry
1554 : 1466208 : = base_alignments->get (drb->base_address);
1555 : 1466208 : if (entry
1556 : 1463297 : && base_alignment < (*entry).second->base_alignment
1557 : 1468026 : && (loop_vinfo
1558 : 1136 : || (dominated_by_p (CDI_DOMINATORS, gimple_bb (stmt_info->stmt),
1559 : 1136 : gimple_bb (entry->first->stmt))
1560 : 991 : && (gimple_bb (stmt_info->stmt) != gimple_bb (entry->first->stmt)
1561 : 759 : || (entry->first->dr_aux.group <= dr_info->group)))))
1562 : : {
1563 : 1656 : base_alignment = entry->second->base_alignment;
1564 : 1656 : base_misalignment = entry->second->base_misalignment;
1565 : : }
1566 : :
1567 : 1466208 : if (drb->offset_alignment < vect_align_c
1568 : 1396078 : || !step_preserves_misalignment_p
1569 : : /* We need to know whether the step wrt the vectorized loop is
1570 : : negative when computing the starting misalignment below. */
1571 : 1386179 : || TREE_CODE (drb->step) != INTEGER_CST)
1572 : : {
1573 : 107656 : if (dump_enabled_p ())
1574 : 3614 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1575 : : "Unknown alignment for access: %T\n", ref);
1576 : 107656 : return;
1577 : : }
1578 : :
1579 : 1358552 : if (base_alignment < vect_align_c)
1580 : : {
1581 : 681202 : unsigned int max_alignment;
1582 : 681202 : tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1583 : 681202 : if (max_alignment < vect_align_c
1584 : 678827 : || (loop_vinfo && LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1585 : 1340957 : || !vect_can_force_dr_alignment_p (base,
1586 : 659755 : vect_align_c * BITS_PER_UNIT))
1587 : : {
1588 : 485076 : if (dump_enabled_p ())
1589 : 13196 : dump_printf_loc (MSG_NOTE, vect_location,
1590 : : "can't force alignment of ref: %T\n", ref);
1591 : 485076 : return;
1592 : : }
1593 : :
1594 : : /* Force the alignment of the decl.
1595 : : NOTE: This is the only change to the code we make during
1596 : : the analysis phase, before deciding to vectorize the loop. */
1597 : 196126 : if (dump_enabled_p ())
1598 : 7956 : dump_printf_loc (MSG_NOTE, vect_location,
1599 : : "force alignment of %T\n", ref);
1600 : :
1601 : 196126 : dr_info->base_decl = base;
1602 : 196126 : dr_info->base_misaligned = true;
1603 : 196126 : base_misalignment = 0;
1604 : : }
1605 : 873476 : poly_int64 misalignment
1606 : 873476 : = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1607 : :
1608 : 873476 : unsigned int const_misalignment;
1609 : 873476 : if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1610 : : {
1611 : : if (dump_enabled_p ())
1612 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1613 : : "Non-constant misalignment for access: %T\n", ref);
1614 : : return;
1615 : : }
1616 : :
1617 : 873476 : SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1618 : :
1619 : 873476 : if (dump_enabled_p ())
1620 : 31365 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1621 : : "misalign = %d bytes of ref %T\n",
1622 : : const_misalignment, ref);
1623 : :
1624 : : return;
1625 : : }
1626 : :
1627 : : /* Return whether DR_INFO, which is related to DR_PEEL_INFO in
1628 : : that it only differs in DR_INIT, is aligned if DR_PEEL_INFO
1629 : : is made aligned via peeling. */
1630 : :
1631 : : static bool
1632 : 1463150 : vect_dr_aligned_if_related_peeled_dr_is (dr_vec_info *dr_info,
1633 : : dr_vec_info *dr_peel_info)
1634 : : {
1635 : 1463150 : if (multiple_p (DR_TARGET_ALIGNMENT (dr_peel_info),
1636 : 1464125 : DR_TARGET_ALIGNMENT (dr_info)))
1637 : : {
1638 : 1462175 : poly_offset_int diff
1639 : 1462175 : = (wi::to_poly_offset (DR_INIT (dr_peel_info->dr))
1640 : 1462175 : - wi::to_poly_offset (DR_INIT (dr_info->dr)));
1641 : 1462175 : if (known_eq (diff, 0)
1642 : 1462175 : || multiple_p (diff, DR_TARGET_ALIGNMENT (dr_info)))
1643 : 481223 : return true;
1644 : : }
1645 : : return false;
1646 : : }
1647 : :
1648 : : /* Return whether DR_INFO is aligned if DR_PEEL_INFO is made
1649 : : aligned via peeling. */
1650 : :
1651 : : static bool
1652 : 157432 : vect_dr_aligned_if_peeled_dr_is (dr_vec_info *dr_info,
1653 : : dr_vec_info *dr_peel_info)
1654 : : {
1655 : 157432 : if (!operand_equal_p (DR_BASE_ADDRESS (dr_info->dr),
1656 : 157432 : DR_BASE_ADDRESS (dr_peel_info->dr), 0)
1657 : 35882 : || !operand_equal_p (DR_OFFSET (dr_info->dr),
1658 : 35882 : DR_OFFSET (dr_peel_info->dr), 0)
1659 : 192431 : || !operand_equal_p (DR_STEP (dr_info->dr),
1660 : 34999 : DR_STEP (dr_peel_info->dr), 0))
1661 : 122804 : return false;
1662 : :
1663 : 34628 : return vect_dr_aligned_if_related_peeled_dr_is (dr_info, dr_peel_info);
1664 : : }
1665 : :
1666 : : /* Compute the value for dr_info->misalign so that the access appears
1667 : : aligned. This is used by peeling to compensate for dr_misalignment
1668 : : applying the offset for negative step. */
1669 : :
1670 : : int
1671 : 22374 : vect_dr_misalign_for_aligned_access (dr_vec_info *dr_info)
1672 : : {
1673 : 22374 : if (tree_int_cst_sgn (DR_STEP (dr_info->dr)) >= 0)
1674 : : return 0;
1675 : :
1676 : 206 : tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1677 : 206 : poly_int64 misalignment
1678 : 206 : = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1679 : 206 : * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1680 : :
1681 : 206 : unsigned HOST_WIDE_INT target_alignment_c;
1682 : 206 : int misalign;
1683 : 206 : if (!dr_info->target_alignment.is_constant (&target_alignment_c)
1684 : 206 : || !known_misalignment (misalignment, target_alignment_c, &misalign))
1685 : : return DR_MISALIGNMENT_UNKNOWN;
1686 : 206 : return misalign;
1687 : : }
1688 : :
1689 : : /* Function vect_update_misalignment_for_peel.
1690 : : Sets DR_INFO's misalignment
1691 : : - to 0 if it has the same alignment as DR_PEEL_INFO,
1692 : : - to the misalignment computed using NPEEL if DR_INFO's salignment is known,
1693 : : - to -1 (unknown) otherwise.
1694 : :
1695 : : DR_INFO - the data reference whose misalignment is to be adjusted.
1696 : : DR_PEEL_INFO - the data reference whose misalignment is being made
1697 : : zero in the vector loop by the peel.
1698 : : NPEEL - the number of iterations in the peel loop if the misalignment
1699 : : of DR_PEEL_INFO is known at compile time. */
1700 : :
1701 : : static void
1702 : 3239 : vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1703 : : dr_vec_info *dr_peel_info, int npeel)
1704 : : {
1705 : : /* If dr_info is aligned of dr_peel_info is, then mark it so. */
1706 : 3239 : if (vect_dr_aligned_if_peeled_dr_is (dr_info, dr_peel_info))
1707 : : {
1708 : 452 : SET_DR_MISALIGNMENT (dr_info,
1709 : : vect_dr_misalign_for_aligned_access (dr_peel_info));
1710 : 452 : return;
1711 : : }
1712 : :
1713 : 2787 : unsigned HOST_WIDE_INT alignment;
1714 : 2787 : if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1715 : 2787 : && known_alignment_for_access_p (dr_info,
1716 : 2787 : STMT_VINFO_VECTYPE (dr_info->stmt))
1717 : 234 : && known_alignment_for_access_p (dr_peel_info,
1718 : 234 : STMT_VINFO_VECTYPE (dr_peel_info->stmt)))
1719 : : {
1720 : 186 : int misal = dr_info->misalignment;
1721 : 186 : misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1722 : 186 : misal &= alignment - 1;
1723 : 186 : set_dr_misalignment (dr_info, misal);
1724 : 186 : return;
1725 : : }
1726 : :
1727 : 2601 : if (dump_enabled_p ())
1728 : 35 : dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1729 : : "to unknown (-1).\n");
1730 : 2601 : SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1731 : : }
1732 : :
1733 : : /* Return true if alignment is relevant for DR_INFO. */
1734 : :
1735 : : static bool
1736 : 1458148 : vect_relevant_for_alignment_p (dr_vec_info *dr_info)
1737 : : {
1738 : 1458148 : stmt_vec_info stmt_info = dr_info->stmt;
1739 : :
1740 : 1458148 : if (!STMT_VINFO_RELEVANT_P (stmt_info))
1741 : : return false;
1742 : :
1743 : : /* For interleaving, only the alignment of the first access matters. */
1744 : 1457098 : if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1745 : 1668212 : && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1746 : : return false;
1747 : :
1748 : : /* Scatter-gather and invariant accesses continue to address individual
1749 : : scalars, so vector-level alignment is irrelevant. */
1750 : 1369389 : if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1751 : 1369389 : || integer_zerop (DR_STEP (dr_info->dr)))
1752 : 51998 : return false;
1753 : :
1754 : : /* Strided accesses perform only component accesses, alignment is
1755 : : irrelevant for them. */
1756 : 1317391 : if (STMT_VINFO_STRIDED_P (stmt_info)
1757 : 1317391 : && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1758 : : return false;
1759 : :
1760 : : return true;
1761 : : }
1762 : :
1763 : : /* Given an memory reference EXP return whether its alignment is less
1764 : : than its size. */
1765 : :
1766 : : static bool
1767 : 1288679 : not_size_aligned (tree exp)
1768 : : {
1769 : 1288679 : if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1770 : : return true;
1771 : :
1772 : 1288679 : return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1773 : 1288679 : > get_object_alignment (exp));
1774 : : }
1775 : :
1776 : : /* Function vector_alignment_reachable_p
1777 : :
1778 : : Return true if vector alignment for DR_INFO is reachable by peeling
1779 : : a few loop iterations. Return false otherwise. */
1780 : :
1781 : : static bool
1782 : 504898 : vector_alignment_reachable_p (dr_vec_info *dr_info, poly_uint64 vf)
1783 : : {
1784 : 504898 : stmt_vec_info stmt_info = dr_info->stmt;
1785 : 504898 : tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1786 : 504898 : poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1787 : 1009796 : poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1788 : 504898 : unsigned elem_size = vector_element_size (vector_size, nelements);
1789 : 504898 : unsigned group_size = 1;
1790 : :
1791 : 504898 : if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1792 : : {
1793 : : /* For interleaved access we peel only if number of iterations in
1794 : : the prolog loop ({VF - misalignment}), is a multiple of the
1795 : : number of the interleaved accesses. */
1796 : :
1797 : : /* FORNOW: handle only known alignment. */
1798 : 78248 : if (!known_alignment_for_access_p (dr_info, vectype))
1799 : 504898 : return false;
1800 : :
1801 : 44314 : unsigned mis_in_elements = dr_misalignment (dr_info, vectype) / elem_size;
1802 : 55520 : if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1803 : : return false;
1804 : :
1805 : 11206 : group_size = DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
1806 : : }
1807 : :
1808 : : /* If the vectorization factor does not guarantee DR advancement of
1809 : : a multiple of the target alignment no peeling will help. */
1810 : 437856 : if (!multiple_p (elem_size * group_size * vf, dr_target_alignment (dr_info)))
1811 : 101 : return false;
1812 : :
1813 : : /* If misalignment is known at the compile time then allow peeling
1814 : : only if natural alignment is reachable through peeling. */
1815 : 437755 : if (known_alignment_for_access_p (dr_info, vectype)
1816 : 678031 : && !aligned_access_p (dr_info, vectype))
1817 : : {
1818 : 13604 : HOST_WIDE_INT elmsize =
1819 : 13604 : int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1820 : 13604 : if (dump_enabled_p ())
1821 : : {
1822 : 752 : dump_printf_loc (MSG_NOTE, vect_location,
1823 : : "data size = %wd. misalignment = %d.\n", elmsize,
1824 : : dr_misalignment (dr_info, vectype));
1825 : : }
1826 : 13604 : if (dr_misalignment (dr_info, vectype) % elmsize)
1827 : : {
1828 : 53 : if (dump_enabled_p ())
1829 : 7 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1830 : : "data size does not divide the misalignment.\n");
1831 : 53 : return false;
1832 : : }
1833 : : }
1834 : :
1835 : 437702 : if (!known_alignment_for_access_p (dr_info, vectype))
1836 : : {
1837 : 197479 : tree type = TREE_TYPE (DR_REF (dr_info->dr));
1838 : 197479 : bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1839 : 197479 : if (dump_enabled_p ())
1840 : 14681 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1841 : : "Unknown misalignment, %snaturally aligned\n",
1842 : : is_packed ? "not " : "");
1843 : 197479 : return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1844 : : }
1845 : :
1846 : : return true;
1847 : : }
1848 : :
1849 : :
1850 : : /* Calculate the cost of the memory access represented by DR_INFO. */
1851 : :
1852 : : static void
1853 : 547794 : vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
1854 : : dr_alignment_support alignment_support_scheme,
1855 : : int misalignment,
1856 : : unsigned int *inside_cost,
1857 : : unsigned int *outside_cost,
1858 : : stmt_vector_for_cost *body_cost_vec,
1859 : : stmt_vector_for_cost *prologue_cost_vec)
1860 : : {
1861 : 547794 : stmt_vec_info stmt_info = dr_info->stmt;
1862 : :
1863 : 547794 : if (DR_IS_READ (dr_info->dr))
1864 : 403525 : vect_get_load_cost (vinfo, stmt_info, NULL, 1,
1865 : : alignment_support_scheme, misalignment, true,
1866 : : inside_cost, outside_cost, prologue_cost_vec,
1867 : : body_cost_vec, false);
1868 : : else
1869 : 144269 : vect_get_store_cost (vinfo,stmt_info, NULL, 1,
1870 : : alignment_support_scheme, misalignment, inside_cost,
1871 : : body_cost_vec);
1872 : :
1873 : 547794 : if (dump_enabled_p ())
1874 : 27478 : dump_printf_loc (MSG_NOTE, vect_location,
1875 : : "vect_get_data_access_cost: inside_cost = %d, "
1876 : : "outside_cost = %d.\n", *inside_cost, *outside_cost);
1877 : 547794 : }
1878 : :
1879 : :
1880 : : typedef struct _vect_peel_info
1881 : : {
1882 : : dr_vec_info *dr_info;
1883 : : int npeel;
1884 : : unsigned int count;
1885 : : } *vect_peel_info;
1886 : :
1887 : : typedef struct _vect_peel_extended_info
1888 : : {
1889 : : vec_info *vinfo;
1890 : : struct _vect_peel_info peel_info;
1891 : : unsigned int inside_cost;
1892 : : unsigned int outside_cost;
1893 : : } *vect_peel_extended_info;
1894 : :
1895 : :
1896 : : /* Peeling hashtable helpers. */
1897 : :
1898 : : struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1899 : : {
1900 : : static inline hashval_t hash (const _vect_peel_info *);
1901 : : static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1902 : : };
1903 : :
1904 : : inline hashval_t
1905 : 653304 : peel_info_hasher::hash (const _vect_peel_info *peel_info)
1906 : : {
1907 : 653304 : return (hashval_t) peel_info->npeel;
1908 : : }
1909 : :
1910 : : inline bool
1911 : 355282 : peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1912 : : {
1913 : 355282 : return (a->npeel == b->npeel);
1914 : : }
1915 : :
1916 : :
1917 : : /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1918 : :
1919 : : static void
1920 : 298684 : vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1921 : : loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1922 : : int npeel, bool supportable_if_not_aligned)
1923 : : {
1924 : 298684 : struct _vect_peel_info elem, *slot;
1925 : 298684 : _vect_peel_info **new_slot;
1926 : :
1927 : 298684 : elem.npeel = npeel;
1928 : 298684 : slot = peeling_htab->find (&elem);
1929 : 298684 : if (slot)
1930 : 126428 : slot->count++;
1931 : : else
1932 : : {
1933 : 172256 : slot = XNEW (struct _vect_peel_info);
1934 : 172256 : slot->npeel = npeel;
1935 : 172256 : slot->dr_info = dr_info;
1936 : 172256 : slot->count = 1;
1937 : 172256 : new_slot = peeling_htab->find_slot (slot, INSERT);
1938 : 172256 : *new_slot = slot;
1939 : : }
1940 : :
1941 : : /* If this DR is not supported with unknown misalignment then bias
1942 : : this slot when the cost model is disabled. */
1943 : 298684 : if (!supportable_if_not_aligned
1944 : 298684 : && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1945 : 4720 : slot->count += VECT_MAX_COST;
1946 : 298684 : }
1947 : :
1948 : :
1949 : : /* Traverse peeling hash table to find peeling option that aligns maximum
1950 : : number of data accesses. */
1951 : :
1952 : : int
1953 : 35450 : vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1954 : : _vect_peel_extended_info *max)
1955 : : {
1956 : 35450 : vect_peel_info elem = *slot;
1957 : :
1958 : 35450 : if (elem->count > max->peel_info.count
1959 : 21566 : || (elem->count == max->peel_info.count
1960 : 16855 : && max->peel_info.npeel > elem->npeel))
1961 : : {
1962 : 13900 : max->peel_info.npeel = elem->npeel;
1963 : 13900 : max->peel_info.count = elem->count;
1964 : 13900 : max->peel_info.dr_info = elem->dr_info;
1965 : : }
1966 : :
1967 : 35450 : return 1;
1968 : : }
1969 : :
1970 : : /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1971 : : data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1972 : : npeel is computed at runtime but DR0_INFO's misalignment will be zero
1973 : : after peeling. */
1974 : :
1975 : : static void
1976 : 314583 : vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1977 : : dr_vec_info *dr0_info,
1978 : : unsigned int *inside_cost,
1979 : : unsigned int *outside_cost,
1980 : : stmt_vector_for_cost *body_cost_vec,
1981 : : stmt_vector_for_cost *prologue_cost_vec,
1982 : : unsigned int npeel)
1983 : : {
1984 : 314583 : vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1985 : :
1986 : 314583 : bool dr0_alignment_known_p
1987 : : = (dr0_info
1988 : 582766 : && known_alignment_for_access_p (dr0_info,
1989 : 268183 : STMT_VINFO_VECTYPE (dr0_info->stmt)));
1990 : :
1991 : 1525282 : for (data_reference *dr : datarefs)
1992 : : {
1993 : 581533 : dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1994 : 581533 : if (!vect_relevant_for_alignment_p (dr_info))
1995 : 33739 : continue;
1996 : :
1997 : 547794 : tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1998 : 547794 : dr_alignment_support alignment_support_scheme;
1999 : 547794 : int misalignment;
2000 : 547794 : unsigned HOST_WIDE_INT alignment;
2001 : :
2002 : 547794 : bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
2003 : 547794 : size_zero_node) < 0;
2004 : 547794 : poly_int64 off = 0;
2005 : 547794 : if (negative)
2006 : 22022 : off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2007 : 22022 : * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2008 : :
2009 : 547794 : if (npeel == 0)
2010 : 267456 : misalignment = dr_misalignment (dr_info, vectype, off);
2011 : 280338 : else if (dr_info == dr0_info
2012 : 280338 : || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
2013 : : misalignment = 0;
2014 : 95619 : else if (!dr0_alignment_known_p
2015 : 7419 : || !known_alignment_for_access_p (dr_info, vectype)
2016 : 103038 : || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
2017 : : misalignment = DR_MISALIGNMENT_UNKNOWN;
2018 : : else
2019 : : {
2020 : 6423 : misalignment = dr_misalignment (dr_info, vectype, off);
2021 : 6423 : misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
2022 : 6423 : misalignment &= alignment - 1;
2023 : : }
2024 : 547794 : alignment_support_scheme
2025 : 547794 : = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2026 : : misalignment);
2027 : :
2028 : 547794 : vect_get_data_access_cost (loop_vinfo, dr_info,
2029 : : alignment_support_scheme, misalignment,
2030 : : inside_cost, outside_cost,
2031 : : body_cost_vec, prologue_cost_vec);
2032 : : }
2033 : 314583 : }
2034 : :
2035 : : /* Traverse peeling hash table and calculate cost for each peeling option.
2036 : : Find the one with the lowest cost. */
2037 : :
2038 : : int
2039 : 117327 : vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
2040 : : _vect_peel_extended_info *min)
2041 : : {
2042 : 117327 : vect_peel_info elem = *slot;
2043 : 117327 : int dummy;
2044 : 117327 : unsigned int inside_cost = 0, outside_cost = 0;
2045 : 117327 : loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (min->vinfo);
2046 : 117327 : stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
2047 : : epilogue_cost_vec;
2048 : :
2049 : 117327 : prologue_cost_vec.create (2);
2050 : 117327 : body_cost_vec.create (2);
2051 : 117327 : epilogue_cost_vec.create (2);
2052 : :
2053 : 117327 : vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
2054 : : &outside_cost, &body_cost_vec,
2055 : 117327 : &prologue_cost_vec, elem->npeel);
2056 : :
2057 : 117327 : body_cost_vec.release ();
2058 : :
2059 : 234654 : outside_cost += vect_get_known_peeling_cost
2060 : 117327 : (loop_vinfo, elem->npeel, &dummy,
2061 : : &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2062 : : &prologue_cost_vec, &epilogue_cost_vec);
2063 : :
2064 : : /* Prologue and epilogue costs are added to the target model later.
2065 : : These costs depend only on the scalar iteration cost, the
2066 : : number of peeling iterations finally chosen, and the number of
2067 : : misaligned statements. So discard the information found here. */
2068 : 117327 : prologue_cost_vec.release ();
2069 : 117327 : epilogue_cost_vec.release ();
2070 : :
2071 : 117327 : if (inside_cost < min->inside_cost
2072 : 1453 : || (inside_cost == min->inside_cost
2073 : 1063 : && outside_cost < min->outside_cost))
2074 : : {
2075 : 115880 : min->inside_cost = inside_cost;
2076 : 115880 : min->outside_cost = outside_cost;
2077 : 115880 : min->peel_info.dr_info = elem->dr_info;
2078 : 115880 : min->peel_info.npeel = elem->npeel;
2079 : 115880 : min->peel_info.count = elem->count;
2080 : : }
2081 : :
2082 : 117327 : return 1;
2083 : : }
2084 : :
2085 : :
2086 : : /* Choose best peeling option by traversing peeling hash table and either
2087 : : choosing an option with the lowest cost (if cost model is enabled) or the
2088 : : option that aligns as many accesses as possible. */
2089 : :
2090 : : static struct _vect_peel_extended_info
2091 : 128445 : vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
2092 : : loop_vec_info loop_vinfo)
2093 : : {
2094 : 128445 : struct _vect_peel_extended_info res;
2095 : :
2096 : 128445 : res.peel_info.dr_info = NULL;
2097 : 128445 : res.vinfo = loop_vinfo;
2098 : :
2099 : 128445 : if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
2100 : : {
2101 : 114614 : res.inside_cost = INT_MAX;
2102 : 114614 : res.outside_cost = INT_MAX;
2103 : 114614 : peeling_htab->traverse <_vect_peel_extended_info *,
2104 : 231941 : vect_peeling_hash_get_lowest_cost> (&res);
2105 : : }
2106 : : else
2107 : : {
2108 : 13831 : res.peel_info.count = 0;
2109 : 13831 : peeling_htab->traverse <_vect_peel_extended_info *,
2110 : 49281 : vect_peeling_hash_get_most_frequent> (&res);
2111 : 13831 : res.inside_cost = 0;
2112 : 13831 : res.outside_cost = 0;
2113 : : }
2114 : :
2115 : 128445 : return res;
2116 : : }
2117 : :
2118 : : /* Return if vectorization is definitely, possibly, or unlikely to be
2119 : : supportable after loop peeling. */
2120 : :
2121 : : static enum peeling_support
2122 : 80874 : vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
2123 : : unsigned npeel)
2124 : : {
2125 : 80874 : vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2126 : 80874 : enum dr_alignment_support supportable_dr_alignment;
2127 : :
2128 : 80874 : bool dr0_alignment_known_p
2129 : 161748 : = known_alignment_for_access_p (dr0_info,
2130 : 80874 : STMT_VINFO_VECTYPE (dr0_info->stmt));
2131 : 80874 : bool has_unsupported_dr_p = false;
2132 : 80874 : unsigned int dr0_step = tree_to_shwi (DR_STEP (dr0_info->dr));
2133 : 80874 : int known_unsupported_misalignment = DR_MISALIGNMENT_UNKNOWN;
2134 : :
2135 : : /* Check if each data ref can be vectorized after peeling. */
2136 : 345737 : for (data_reference *dr : datarefs)
2137 : : {
2138 : 119088 : if (dr == dr0_info->dr)
2139 : 79901 : continue;
2140 : :
2141 : 39187 : dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2142 : 39187 : if (!vect_relevant_for_alignment_p (dr_info)
2143 : 39187 : || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
2144 : 6551 : continue;
2145 : :
2146 : 32636 : tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2147 : 32636 : int misalignment;
2148 : 32636 : unsigned HOST_WIDE_INT alignment;
2149 : 32636 : if (!dr0_alignment_known_p
2150 : 1794 : || !known_alignment_for_access_p (dr_info, vectype)
2151 : 34430 : || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
2152 : : misalignment = DR_MISALIGNMENT_UNKNOWN;
2153 : : else
2154 : : {
2155 : 1780 : misalignment = dr_misalignment (dr_info, vectype);
2156 : 1780 : misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
2157 : 1780 : misalignment &= alignment - 1;
2158 : : }
2159 : 32636 : supportable_dr_alignment
2160 : 32636 : = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2161 : : misalignment);
2162 : 32636 : if (supportable_dr_alignment == dr_unaligned_unsupported)
2163 : : {
2164 : 31129 : has_unsupported_dr_p = true;
2165 : :
2166 : : /* If unaligned unsupported DRs exist, we do following checks to see
2167 : : if they can be mutually aligned to support vectorization. If yes,
2168 : : we can try peeling and create a runtime (mutual alignment) check
2169 : : to guard the peeled loop. If no, return PEELING_UNSUPPORTED. */
2170 : :
2171 : : /* 1) If unaligned unsupported DRs have different alignment steps, the
2172 : : probability of DRs being mutually aligned is very low, and it's
2173 : : quite complex to check mutual alignment at runtime. We return
2174 : : PEELING_UNSUPPORTED in this case. */
2175 : 31129 : if (tree_to_shwi (DR_STEP (dr)) != dr0_step)
2176 : 80874 : return peeling_unsupported;
2177 : :
2178 : : /* 2) Based on above same alignment step condition, if one known
2179 : : misaligned DR has zero misalignment, or different misalignment
2180 : : amount from another known misaligned DR, peeling is unable to
2181 : : help make all these DRs aligned together. We won't try peeling
2182 : : with versioning anymore. */
2183 : 26787 : int curr_dr_misalignment = dr_misalignment (dr_info, vectype);
2184 : 26787 : if (curr_dr_misalignment == 0)
2185 : : return peeling_unsupported;
2186 : 15156 : if (known_unsupported_misalignment != DR_MISALIGNMENT_UNKNOWN)
2187 : : {
2188 : 8 : if (curr_dr_misalignment != DR_MISALIGNMENT_UNKNOWN
2189 : 8 : && curr_dr_misalignment != known_unsupported_misalignment)
2190 : : return peeling_unsupported;
2191 : : }
2192 : : else
2193 : : known_unsupported_misalignment = curr_dr_misalignment;
2194 : : }
2195 : : }
2196 : :
2197 : : /* Vectorization is known to be supportable with peeling alone when there is
2198 : : no unsupported DR. */
2199 : 64901 : return has_unsupported_dr_p ? peeling_maybe_supported
2200 : : : peeling_known_supported;
2201 : : }
2202 : :
2203 : : /* Compare two data-references DRA and DRB to group them into chunks
2204 : : with related alignment. */
2205 : :
2206 : : static int
2207 : 3721876 : dr_align_group_sort_cmp (const void *dra_, const void *drb_)
2208 : : {
2209 : 3721876 : data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2210 : 3721876 : data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2211 : 3721876 : int cmp;
2212 : :
2213 : : /* Stabilize sort. */
2214 : 3721876 : if (dra == drb)
2215 : : return 0;
2216 : :
2217 : : /* Ordering of DRs according to base. */
2218 : 3721876 : cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2219 : : DR_BASE_ADDRESS (drb));
2220 : 3721876 : if (cmp != 0)
2221 : : return cmp;
2222 : :
2223 : : /* And according to DR_OFFSET. */
2224 : 1643637 : cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2225 : 1643637 : if (cmp != 0)
2226 : : return cmp;
2227 : :
2228 : : /* And after step. */
2229 : 1631559 : cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2230 : 1631559 : if (cmp != 0)
2231 : : return cmp;
2232 : :
2233 : : /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2234 : 1627060 : cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2235 : 1627060 : if (cmp == 0)
2236 : 176083 : return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2237 : : return cmp;
2238 : : }
2239 : :
2240 : : /* Function vect_enhance_data_refs_alignment
2241 : :
2242 : : This pass will use loop versioning and loop peeling in order to enhance
2243 : : the alignment of data references in the loop.
2244 : :
2245 : : FOR NOW: we assume that whatever versioning/peeling takes place, only the
2246 : : original loop is to be vectorized. Any other loops that are created by
2247 : : the transformations performed in this pass - are not supposed to be
2248 : : vectorized. This restriction will be relaxed.
2249 : :
2250 : : This pass will require a cost model to guide it whether to apply peeling
2251 : : or versioning or a combination of the two. For example, the scheme that
2252 : : intel uses when given a loop with several memory accesses, is as follows:
2253 : : choose one memory access ('p') which alignment you want to force by doing
2254 : : peeling. Then, either (1) generate a loop in which 'p' is aligned and all
2255 : : other accesses are not necessarily aligned, or (2) use loop versioning to
2256 : : generate one loop in which all accesses are aligned, and another loop in
2257 : : which only 'p' is necessarily aligned.
2258 : :
2259 : : ("Automatic Intra-Register Vectorization for the Intel Architecture",
2260 : : Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
2261 : : Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
2262 : :
2263 : : Devising a cost model is the most critical aspect of this work. It will
2264 : : guide us on which access to peel for, whether to use loop versioning, how
2265 : : many versions to create, etc. The cost model will probably consist of
2266 : : generic considerations as well as target specific considerations (on
2267 : : powerpc for example, misaligned stores are more painful than misaligned
2268 : : loads).
2269 : :
2270 : : Here are the general steps involved in alignment enhancements:
2271 : :
2272 : : -- original loop, before alignment analysis:
2273 : : for (i=0; i<N; i++){
2274 : : x = q[i]; # DR_MISALIGNMENT(q) = unknown
2275 : : p[i] = y; # DR_MISALIGNMENT(p) = unknown
2276 : : }
2277 : :
2278 : : -- After vect_compute_data_refs_alignment:
2279 : : for (i=0; i<N; i++){
2280 : : x = q[i]; # DR_MISALIGNMENT(q) = 3
2281 : : p[i] = y; # DR_MISALIGNMENT(p) = unknown
2282 : : }
2283 : :
2284 : : -- Possibility 1: we do loop versioning:
2285 : : if (p is aligned) {
2286 : : for (i=0; i<N; i++){ # loop 1A
2287 : : x = q[i]; # DR_MISALIGNMENT(q) = 3
2288 : : p[i] = y; # DR_MISALIGNMENT(p) = 0
2289 : : }
2290 : : }
2291 : : else {
2292 : : for (i=0; i<N; i++){ # loop 1B
2293 : : x = q[i]; # DR_MISALIGNMENT(q) = 3
2294 : : p[i] = y; # DR_MISALIGNMENT(p) = unaligned
2295 : : }
2296 : : }
2297 : :
2298 : : -- Possibility 2: we do loop peeling:
2299 : : for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
2300 : : x = q[i];
2301 : : p[i] = y;
2302 : : }
2303 : : for (i = 3; i < N; i++){ # loop 2A
2304 : : x = q[i]; # DR_MISALIGNMENT(q) = 0
2305 : : p[i] = y; # DR_MISALIGNMENT(p) = unknown
2306 : : }
2307 : :
2308 : : -- Possibility 3: combination of loop peeling and versioning:
2309 : : if (p & q are mutually aligned) {
2310 : : for (i=0; i<3; i++){ # (peeled loop iterations).
2311 : : x = q[i];
2312 : : p[i] = y;
2313 : : }
2314 : : for (i=3; i<N; i++){ # loop 3A
2315 : : x = q[i]; # DR_MISALIGNMENT(q) = 0
2316 : : p[i] = y; # DR_MISALIGNMENT(p) = 0
2317 : : }
2318 : : }
2319 : : else {
2320 : : for (i=0; i<N; i++){ # (scalar loop, not to be vectorized).
2321 : : x = q[i]; # DR_MISALIGNMENT(q) = 3
2322 : : p[i] = y; # DR_MISALIGNMENT(p) = unknown
2323 : : }
2324 : : }
2325 : :
2326 : : These loops are later passed to loop_transform to be vectorized. The
2327 : : vectorizer will use the alignment information to guide the transformation
2328 : : (whether to generate regular loads/stores, or with special handling for
2329 : : misalignment). */
2330 : :
2331 : : opt_result
2332 : 333864 : vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
2333 : : {
2334 : 333864 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2335 : 333864 : dr_vec_info *first_store = NULL;
2336 : 333864 : dr_vec_info *dr0_info = NULL;
2337 : 333864 : struct data_reference *dr;
2338 : 333864 : unsigned int i;
2339 : 333864 : bool do_peeling = false;
2340 : 333864 : bool do_versioning = false;
2341 : 333864 : bool try_peeling_with_versioning = false;
2342 : 333864 : unsigned int npeel = 0;
2343 : 333864 : bool one_misalignment_known = false;
2344 : 333864 : bool one_misalignment_unknown = false;
2345 : 333864 : bool one_dr_unsupportable = false;
2346 : 333864 : dr_vec_info *unsupportable_dr_info = NULL;
2347 : 333864 : unsigned int dr0_same_align_drs = 0, first_store_same_align_drs = 0;
2348 : 333864 : hash_table<peel_info_hasher> peeling_htab (1);
2349 : :
2350 : 333864 : DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
2351 : :
2352 : : /* Reset data so we can safely be called multiple times. */
2353 : 333864 : LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2354 : 333864 : LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
2355 : :
2356 : 333864 : if (LOOP_VINFO_DATAREFS (loop_vinfo).is_empty ())
2357 : 13251 : return opt_result::success ();
2358 : :
2359 : : /* Sort the vector of datarefs so DRs that have the same or dependent
2360 : : alignment are next to each other. */
2361 : 320613 : auto_vec<data_reference_p> datarefs
2362 : 320613 : = LOOP_VINFO_DATAREFS (loop_vinfo).copy ();
2363 : 320613 : datarefs.qsort (dr_align_group_sort_cmp);
2364 : :
2365 : : /* Compute the number of DRs that become aligned when we peel
2366 : : a dataref so it becomes aligned. */
2367 : 641226 : auto_vec<unsigned> n_same_align_refs (datarefs.length ());
2368 : 320613 : n_same_align_refs.quick_grow_cleared (datarefs.length ());
2369 : 320613 : unsigned i0;
2370 : 660285 : for (i0 = 0; i0 < datarefs.length (); ++i0)
2371 : 333043 : if (DR_BASE_ADDRESS (datarefs[i0]))
2372 : : break;
2373 : 2045040 : for (i = i0 + 1; i <= datarefs.length (); ++i)
2374 : : {
2375 : 701907 : if (i == datarefs.length ()
2376 : 387923 : || !operand_equal_p (DR_BASE_ADDRESS (datarefs[i0]),
2377 : 387923 : DR_BASE_ADDRESS (datarefs[i]), 0)
2378 : 187015 : || !operand_equal_p (DR_OFFSET (datarefs[i0]),
2379 : 187015 : DR_OFFSET (datarefs[i]), 0)
2380 : 887730 : || !operand_equal_p (DR_STEP (datarefs[i0]),
2381 : 185823 : DR_STEP (datarefs[i]), 0))
2382 : : {
2383 : : /* The subgroup [i0, i-1] now only differs in DR_INIT and
2384 : : possibly DR_TARGET_ALIGNMENT. Still the whole subgroup
2385 : : will get known misalignment if we align one of the refs
2386 : : with the largest DR_TARGET_ALIGNMENT. */
2387 : 1218473 : for (unsigned j = i0; j < i; ++j)
2388 : : {
2389 : 701907 : dr_vec_info *dr_infoj = loop_vinfo->lookup_dr (datarefs[j]);
2390 : 2832336 : for (unsigned k = i0; k < i; ++k)
2391 : : {
2392 : 2130429 : if (k == j)
2393 : 701907 : continue;
2394 : 1428522 : dr_vec_info *dr_infok = loop_vinfo->lookup_dr (datarefs[k]);
2395 : 1428522 : if (vect_dr_aligned_if_related_peeled_dr_is (dr_infok,
2396 : : dr_infoj))
2397 : 454833 : n_same_align_refs[j]++;
2398 : : }
2399 : : }
2400 : : i0 = i;
2401 : : }
2402 : : }
2403 : :
2404 : : /* While cost model enhancements are expected in the future, the high level
2405 : : view of the code at this time is as follows:
2406 : :
2407 : : A) If there is a misaligned access then see if doing peeling alone can
2408 : : make all data references satisfy vect_supportable_dr_alignment. If so,
2409 : : update data structures and return.
2410 : :
2411 : : B) If peeling alone wasn't possible and there is a data reference with an
2412 : : unknown misalignment that does not satisfy vect_supportable_dr_alignment
2413 : : then we may use either of the following two approaches.
2414 : :
2415 : : B1) Try peeling with versioning: Add a runtime loop versioning check to
2416 : : see if all unsupportable data references are mutually aligned, which
2417 : : means they will be uniformly aligned after a certain amount of loop
2418 : : peeling. If peeling and versioning can be used together, set
2419 : : LOOP_VINFO_ALLOW_MUTUAL_ALIGNMENT_P to TRUE and return.
2420 : :
2421 : : B2) Try versioning alone: Add a runtime loop versioning check to see if
2422 : : all unsupportable data references are already uniformly aligned
2423 : : without loop peeling. If versioning can be applied alone, set
2424 : : LOOP_VINFO_ALLOW_MUTUAL_ALIGNMENT_P to FALSE and return.
2425 : :
2426 : : Above B1 is more powerful and more likely to be adopted than B2. But B2
2427 : : is still available and useful in some cases, for example, the cost model
2428 : : does not allow much peeling.
2429 : :
2430 : : C) If none of above was successful then the alignment was not enhanced,
2431 : : just return. */
2432 : :
2433 : : /* (1) Peeling to force alignment. */
2434 : :
2435 : : /* (1.1) Decide whether to perform peeling, how many iterations to peel, and
2436 : : if vectorization may be supported by peeling with versioning.
2437 : : Considerations:
2438 : : - How many accesses will become aligned due to the peeling
2439 : : - How many accesses will become unaligned due to the peeling,
2440 : : and the cost of misaligned accesses.
2441 : : - The cost of peeling (the extra runtime checks, the increase
2442 : : in code size). */
2443 : :
2444 : 320613 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2445 : 885816 : FOR_EACH_VEC_ELT (datarefs, i, dr)
2446 : : {
2447 : 608905 : dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2448 : 608905 : if (!vect_relevant_for_alignment_p (dr_info))
2449 : 104007 : continue;
2450 : :
2451 : 504898 : stmt_vec_info stmt_info = dr_info->stmt;
2452 : 504898 : tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2453 : :
2454 : : /* With variable VF, unsafe speculative read can be avoided for known
2455 : : inbounds DRs as long as partial vectors are used. */
2456 : 504898 : if (!vf.is_constant ()
2457 : : && dr_safe_speculative_read_required (stmt_info)
2458 : : && DR_SCALAR_KNOWN_BOUNDS (dr_info))
2459 : : {
2460 : : dr_set_safe_speculative_read_required (stmt_info, false);
2461 : : LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo) = true;
2462 : : }
2463 : :
2464 : 504898 : do_peeling = vector_alignment_reachable_p (dr_info, vf);
2465 : 504898 : if (do_peeling)
2466 : : {
2467 : 435631 : if (known_alignment_for_access_p (dr_info, vectype))
2468 : : {
2469 : 240223 : unsigned int npeel_tmp = 0;
2470 : 240223 : bool negative = tree_int_cst_compare (DR_STEP (dr),
2471 : 240223 : size_zero_node) < 0;
2472 : :
2473 : : /* If known_alignment_for_access_p then we have set
2474 : : DR_MISALIGNMENT which is only done if we know it at compiler
2475 : : time, so it is safe to assume target alignment is constant.
2476 : : */
2477 : 240223 : unsigned int target_align =
2478 : 240223 : DR_TARGET_ALIGNMENT (dr_info).to_constant ();
2479 : 240223 : unsigned HOST_WIDE_INT dr_size = vect_get_scalar_dr_size (dr_info);
2480 : 240223 : poly_int64 off = 0;
2481 : 240223 : if (negative)
2482 : 2374 : off = (TYPE_VECTOR_SUBPARTS (vectype) - 1) * -dr_size;
2483 : 240223 : unsigned int mis = dr_misalignment (dr_info, vectype, off);
2484 : 240223 : mis = negative ? mis : -mis;
2485 : 240223 : if (mis != 0)
2486 : 12578 : npeel_tmp = (mis & (target_align - 1)) / dr_size;
2487 : :
2488 : : /* For multiple types, it is possible that the bigger type access
2489 : : will have more than one peeling option. E.g., a loop with two
2490 : : types: one of size (vector size / 4), and the other one of
2491 : : size (vector size / 8). Vectorization factor will 8. If both
2492 : : accesses are misaligned by 3, the first one needs one scalar
2493 : : iteration to be aligned, and the second one needs 5. But the
2494 : : first one will be aligned also by peeling 5 scalar
2495 : : iterations, and in that case both accesses will be aligned.
2496 : : Hence, except for the immediate peeling amount, we also want
2497 : : to try to add full vector size, while we don't exceed
2498 : : vectorization factor.
2499 : : We do this automatically for cost model, since we calculate
2500 : : cost for every peeling option. */
2501 : 240223 : poly_uint64 nscalars = npeel_tmp;
2502 : 240223 : if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
2503 : : {
2504 : 39461 : unsigned group_size = 1;
2505 : 39461 : if (STMT_SLP_TYPE (stmt_info)
2506 : 39461 : && STMT_VINFO_GROUPED_ACCESS (stmt_info))
2507 : 1889 : group_size = DR_GROUP_SIZE (stmt_info);
2508 : 39461 : nscalars = vf * group_size;
2509 : : }
2510 : :
2511 : : /* Save info about DR in the hash table. Also include peeling
2512 : : amounts according to the explanation above. Indicate
2513 : : the alignment status when the ref is not aligned.
2514 : : ??? Rather than using unknown alignment here we should
2515 : : prune all entries from the peeling hashtable which cause
2516 : : DRs to be not supported. */
2517 : 240223 : bool supportable_if_not_aligned
2518 : : = vect_supportable_dr_alignment
2519 : 240223 : (loop_vinfo, dr_info, vectype, DR_MISALIGNMENT_UNKNOWN);
2520 : 538907 : while (known_le (npeel_tmp, nscalars))
2521 : : {
2522 : 298684 : vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
2523 : : dr_info, npeel_tmp,
2524 : : supportable_if_not_aligned);
2525 : 298684 : npeel_tmp += MAX (1, target_align / dr_size);
2526 : : }
2527 : :
2528 : 240223 : one_misalignment_known = true;
2529 : : }
2530 : : else
2531 : : {
2532 : : /* If we don't know any misalignment values, we prefer
2533 : : peeling for data-ref that has the maximum number of data-refs
2534 : : with the same alignment, unless the target prefers to align
2535 : : stores over load. */
2536 : 195408 : unsigned same_align_drs = n_same_align_refs[i];
2537 : 195408 : if (!dr0_info
2538 : 195408 : || dr0_same_align_drs < same_align_drs)
2539 : : {
2540 : : dr0_same_align_drs = same_align_drs;
2541 : : dr0_info = dr_info;
2542 : : }
2543 : : /* For data-refs with the same number of related
2544 : : accesses prefer the one where the misalign
2545 : : computation will be invariant in the outermost loop. */
2546 : 57665 : else if (dr0_same_align_drs == same_align_drs)
2547 : : {
2548 : 56719 : class loop *ivloop0, *ivloop;
2549 : 56719 : ivloop0 = outermost_invariant_loop_for_expr
2550 : 56719 : (loop, DR_BASE_ADDRESS (dr0_info->dr));
2551 : 56719 : ivloop = outermost_invariant_loop_for_expr
2552 : 56719 : (loop, DR_BASE_ADDRESS (dr));
2553 : 56719 : if ((ivloop && !ivloop0)
2554 : 56719 : || (ivloop && ivloop0
2555 : 56713 : && flow_loop_nested_p (ivloop, ivloop0)))
2556 : : dr0_info = dr_info;
2557 : : }
2558 : :
2559 : 195408 : one_misalignment_unknown = true;
2560 : :
2561 : : /* Check for data refs with unsupportable alignment that
2562 : : can be peeled. */
2563 : 195408 : enum dr_alignment_support supportable_dr_alignment
2564 : 195408 : = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2565 : : DR_MISALIGNMENT_UNKNOWN);
2566 : 195408 : if (supportable_dr_alignment == dr_unaligned_unsupported)
2567 : : {
2568 : 100377 : one_dr_unsupportable = true;
2569 : 100377 : unsupportable_dr_info = dr_info;
2570 : : }
2571 : :
2572 : 195408 : if (!first_store && DR_IS_WRITE (dr))
2573 : : {
2574 : 38512 : first_store = dr_info;
2575 : 38512 : first_store_same_align_drs = same_align_drs;
2576 : : }
2577 : : }
2578 : : }
2579 : : else
2580 : : {
2581 : 69267 : if (!aligned_access_p (dr_info, vectype))
2582 : : {
2583 : 43702 : if (dump_enabled_p ())
2584 : 1944 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2585 : : "vector alignment may not be reachable\n");
2586 : : break;
2587 : : }
2588 : : }
2589 : : }
2590 : :
2591 : : /* Check if we can possibly peel the loop. */
2592 : 320613 : if (!vect_can_advance_ivs_p (loop_vinfo)
2593 : 315545 : || !slpeel_can_duplicate_loop_p (loop, LOOP_VINFO_MAIN_EXIT (loop_vinfo),
2594 : 315545 : loop_preheader_edge (loop))
2595 : 315545 : || loop->inner
2596 : : /* We don't currently maintaing the LCSSA for prologue peeled inversed
2597 : : loops. */
2598 : 634764 : || (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo)
2599 : 30342 : && !LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo)))
2600 : : do_peeling = false;
2601 : :
2602 : 320613 : struct _vect_peel_extended_info peel_for_known_alignment;
2603 : 320613 : struct _vect_peel_extended_info peel_for_unknown_alignment;
2604 : 320613 : struct _vect_peel_extended_info best_peel;
2605 : :
2606 : 320613 : peel_for_unknown_alignment.inside_cost = INT_MAX;
2607 : 320613 : peel_for_unknown_alignment.outside_cost = INT_MAX;
2608 : 320613 : peel_for_unknown_alignment.peel_info.count = 0;
2609 : :
2610 : 320613 : if (do_peeling
2611 : 320613 : && one_misalignment_unknown)
2612 : : {
2613 : : /* Check if the target requires to prefer stores over loads, i.e., if
2614 : : misaligned stores are more expensive than misaligned loads (taking
2615 : : drs with same alignment into account). */
2616 : 120777 : unsigned int load_inside_cost = 0;
2617 : 120777 : unsigned int load_outside_cost = 0;
2618 : 120777 : unsigned int store_inside_cost = 0;
2619 : 120777 : unsigned int store_outside_cost = 0;
2620 : 120777 : unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
2621 : :
2622 : 120777 : stmt_vector_for_cost dummy;
2623 : 120777 : dummy.create (2);
2624 : 120777 : vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
2625 : : &load_inside_cost,
2626 : : &load_outside_cost,
2627 : : &dummy, &dummy, estimated_npeels);
2628 : 120777 : dummy.release ();
2629 : :
2630 : 120777 : if (first_store)
2631 : : {
2632 : 30079 : dummy.create (2);
2633 : 30079 : vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
2634 : : &store_inside_cost,
2635 : : &store_outside_cost,
2636 : : &dummy, &dummy,
2637 : : estimated_npeels);
2638 : 30079 : dummy.release ();
2639 : : }
2640 : : else
2641 : : {
2642 : 90698 : store_inside_cost = INT_MAX;
2643 : 90698 : store_outside_cost = INT_MAX;
2644 : : }
2645 : :
2646 : 120777 : if (load_inside_cost > store_inside_cost
2647 : 120777 : || (load_inside_cost == store_inside_cost
2648 : 29466 : && load_outside_cost > store_outside_cost))
2649 : : {
2650 : 120777 : dr0_info = first_store;
2651 : 120777 : dr0_same_align_drs = first_store_same_align_drs;
2652 : 120777 : peel_for_unknown_alignment.inside_cost = store_inside_cost;
2653 : 120777 : peel_for_unknown_alignment.outside_cost = store_outside_cost;
2654 : : }
2655 : : else
2656 : : {
2657 : 120777 : peel_for_unknown_alignment.inside_cost = load_inside_cost;
2658 : 120777 : peel_for_unknown_alignment.outside_cost = load_outside_cost;
2659 : : }
2660 : :
2661 : 120777 : stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2662 : 120777 : prologue_cost_vec.create (2);
2663 : 120777 : epilogue_cost_vec.create (2);
2664 : :
2665 : 120777 : int dummy2;
2666 : 241554 : peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
2667 : 120777 : (loop_vinfo, estimated_npeels, &dummy2,
2668 : : &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2669 : : &prologue_cost_vec, &epilogue_cost_vec);
2670 : :
2671 : 120777 : prologue_cost_vec.release ();
2672 : 120777 : epilogue_cost_vec.release ();
2673 : :
2674 : 120777 : peel_for_unknown_alignment.peel_info.count = dr0_same_align_drs + 1;
2675 : : }
2676 : :
2677 : 320613 : peel_for_unknown_alignment.peel_info.npeel = 0;
2678 : 320613 : peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
2679 : :
2680 : 320613 : best_peel = peel_for_unknown_alignment;
2681 : :
2682 : 320613 : peel_for_known_alignment.inside_cost = INT_MAX;
2683 : 320613 : peel_for_known_alignment.outside_cost = INT_MAX;
2684 : 320613 : peel_for_known_alignment.peel_info.count = 0;
2685 : 320613 : peel_for_known_alignment.peel_info.dr_info = NULL;
2686 : :
2687 : 320613 : if (do_peeling && one_misalignment_known)
2688 : : {
2689 : : /* Peeling is possible, but there is no data access that is not supported
2690 : : unless aligned. So we try to choose the best possible peeling from
2691 : : the hash table. */
2692 : 128445 : peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
2693 : 128445 : (&peeling_htab, loop_vinfo);
2694 : : }
2695 : :
2696 : : /* Compare costs of peeling for known and unknown alignment. */
2697 : 320613 : if (peel_for_known_alignment.peel_info.dr_info != NULL
2698 : 128445 : && peel_for_unknown_alignment.inside_cost
2699 : : >= peel_for_known_alignment.inside_cost)
2700 : : {
2701 : 114112 : best_peel = peel_for_known_alignment;
2702 : :
2703 : : /* If the best peeling for known alignment has NPEEL == 0, perform no
2704 : : peeling at all except if there is an unsupportable dr that we can
2705 : : align. */
2706 : 114112 : if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
2707 : : do_peeling = false;
2708 : : }
2709 : :
2710 : : /* If there is an unsupportable data ref, prefer this over all choices so far
2711 : : since we'd have to discard a chosen peeling except when it accidentally
2712 : : aligned the unsupportable data ref. */
2713 : 214001 : if (one_dr_unsupportable)
2714 : : dr0_info = unsupportable_dr_info;
2715 : 238647 : else if (do_peeling)
2716 : : {
2717 : : /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
2718 : : TODO: Use nopeel_outside_cost or get rid of it? */
2719 : 46400 : unsigned nopeel_inside_cost = 0;
2720 : 46400 : unsigned nopeel_outside_cost = 0;
2721 : :
2722 : 46400 : stmt_vector_for_cost dummy;
2723 : 46400 : dummy.create (2);
2724 : 46400 : vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
2725 : : &nopeel_outside_cost, &dummy, &dummy, 0);
2726 : 46400 : dummy.release ();
2727 : :
2728 : : /* Add epilogue costs. As we do not peel for alignment here, no prologue
2729 : : costs will be recorded. */
2730 : 46400 : stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2731 : 46400 : prologue_cost_vec.create (2);
2732 : 46400 : epilogue_cost_vec.create (2);
2733 : :
2734 : 46400 : int dummy2;
2735 : 92800 : nopeel_outside_cost += vect_get_known_peeling_cost
2736 : 46400 : (loop_vinfo, 0, &dummy2,
2737 : : &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2738 : : &prologue_cost_vec, &epilogue_cost_vec);
2739 : :
2740 : 46400 : prologue_cost_vec.release ();
2741 : 46400 : epilogue_cost_vec.release ();
2742 : :
2743 : 46400 : npeel = best_peel.peel_info.npeel;
2744 : 46400 : dr0_info = best_peel.peel_info.dr_info;
2745 : :
2746 : : /* If no peeling is not more expensive than the best peeling we
2747 : : have so far, don't perform any peeling. */
2748 : 46400 : if (nopeel_inside_cost <= best_peel.inside_cost)
2749 : 39863 : do_peeling = false;
2750 : : }
2751 : :
2752 : 128366 : if (do_peeling)
2753 : : {
2754 : 80874 : stmt_vec_info stmt_info = dr0_info->stmt;
2755 : 80874 : if (known_alignment_for_access_p (dr0_info,
2756 : : STMT_VINFO_VECTYPE (stmt_info)))
2757 : : {
2758 : 6521 : bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2759 : 6521 : size_zero_node) < 0;
2760 : 6521 : if (!npeel)
2761 : : {
2762 : : /* Since it's known at compile time, compute the number of
2763 : : iterations in the peeled loop (the peeling factor) for use in
2764 : : updating DR_MISALIGNMENT values. The peeling factor is the
2765 : : vectorization factor minus the misalignment as an element
2766 : : count. */
2767 : 0 : tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2768 : 0 : poly_int64 off = 0;
2769 : 0 : if (negative)
2770 : 0 : off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2771 : 0 : * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2772 : 0 : unsigned int mis
2773 : 0 : = dr_misalignment (dr0_info, vectype, off);
2774 : 0 : mis = negative ? mis : -mis;
2775 : : /* If known_alignment_for_access_p then we have set
2776 : : DR_MISALIGNMENT which is only done if we know it at compiler
2777 : : time, so it is safe to assume target alignment is constant.
2778 : : */
2779 : 0 : unsigned int target_align =
2780 : 0 : DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2781 : 0 : npeel = ((mis & (target_align - 1))
2782 : 0 : / vect_get_scalar_dr_size (dr0_info));
2783 : : }
2784 : :
2785 : : /* For interleaved data access every iteration accesses all the
2786 : : members of the group, therefore we divide the number of iterations
2787 : : by the group size. */
2788 : 6521 : if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2789 : 281 : npeel /= DR_GROUP_SIZE (stmt_info);
2790 : :
2791 : 6521 : if (dump_enabled_p ())
2792 : 268 : dump_printf_loc (MSG_NOTE, vect_location,
2793 : : "Try peeling by %d\n", npeel);
2794 : : }
2795 : :
2796 : : /* Check how peeling for alignment can support vectorization. Function
2797 : : vect_peeling_supportable returns one of the three possible values:
2798 : : - PEELING_KNOWN_SUPPORTED: indicates that we know all unsupported
2799 : : datarefs can be aligned after peeling. We can use peeling alone.
2800 : : - PEELING_MAYBE_SUPPORTED: indicates that peeling may be able to make
2801 : : these datarefs aligned but we are not sure about it at compile time.
2802 : : We will try peeling with versioning to add a runtime check to guard
2803 : : the peeled loop.
2804 : : - PEELING_UNSUPPORTED: indicates that peeling is almost impossible to
2805 : : support vectorization. We will stop trying peeling. */
2806 : 80874 : switch (vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2807 : : {
2808 : : case peeling_known_supported:
2809 : : break;
2810 : 13958 : case peeling_maybe_supported:
2811 : 13958 : try_peeling_with_versioning = true;
2812 : 13958 : break;
2813 : 15973 : case peeling_unsupported:
2814 : 15973 : do_peeling = false;
2815 : 15973 : break;
2816 : : }
2817 : :
2818 : : /* Check if all datarefs are supportable and log. */
2819 : 80874 : if (do_peeling
2820 : 80874 : && npeel == 0
2821 : 80874 : && known_alignment_for_access_p (dr0_info,
2822 : : STMT_VINFO_VECTYPE (stmt_info)))
2823 : 3 : return opt_result::success ();
2824 : :
2825 : : /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2826 : 80871 : if (do_peeling)
2827 : : {
2828 : 64898 : unsigned max_allowed_peel
2829 : 64898 : = param_vect_max_peeling_for_alignment;
2830 : 64898 : if (loop_cost_model (loop) <= VECT_COST_MODEL_CHEAP)
2831 : : max_allowed_peel = 0;
2832 : 14542 : if (max_allowed_peel != (unsigned)-1)
2833 : : {
2834 : 50375 : unsigned max_peel = npeel;
2835 : 50375 : if (max_peel == 0)
2836 : : {
2837 : 47618 : poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2838 : 47618 : unsigned HOST_WIDE_INT target_align_c;
2839 : 47618 : if (target_align.is_constant (&target_align_c))
2840 : 95236 : max_peel =
2841 : 47618 : target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2842 : : else
2843 : : {
2844 : : do_peeling = false;
2845 : : if (dump_enabled_p ())
2846 : : dump_printf_loc (MSG_NOTE, vect_location,
2847 : : "Disable peeling, max peels set and vector"
2848 : : " alignment unknown\n");
2849 : : }
2850 : : }
2851 : 50375 : if (max_peel > max_allowed_peel)
2852 : : {
2853 : 50367 : do_peeling = false;
2854 : 50367 : if (dump_enabled_p ())
2855 : 51 : dump_printf_loc (MSG_NOTE, vect_location,
2856 : : "Disable peeling, max peels reached: %d\n", max_peel);
2857 : : }
2858 : : }
2859 : : }
2860 : :
2861 : : /* Cost model #2 - if peeling may result in a remaining loop not
2862 : : iterating enough to be vectorized then do not peel. Since this
2863 : : is a cost heuristic rather than a correctness decision, use the
2864 : : most likely runtime value for variable vectorization factors. */
2865 : 51 : if (do_peeling
2866 : 14531 : && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2867 : : {
2868 : 3948 : unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2869 : 3948 : unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2870 : 3948 : if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2871 : 3948 : < assumed_vf + max_peel)
2872 : : do_peeling = false;
2873 : : }
2874 : :
2875 : : if (do_peeling)
2876 : : {
2877 : : /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2878 : : If the misalignment of DR_i is identical to that of dr0 then set
2879 : : DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2880 : : dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2881 : : by the peeling factor times the element size of DR_i (MOD the
2882 : : vectorization factor times the size). Otherwise, the
2883 : : misalignment of DR_i must be set to unknown. */
2884 : 31044 : FOR_EACH_VEC_ELT (datarefs, i, dr)
2885 : 17323 : if (dr != dr0_info->dr)
2886 : : {
2887 : 3602 : dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2888 : 3602 : if (!vect_relevant_for_alignment_p (dr_info))
2889 : 363 : continue;
2890 : :
2891 : 3239 : vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2892 : : }
2893 : : }
2894 : :
2895 : 80871 : if (do_peeling && !try_peeling_with_versioning)
2896 : : {
2897 : : /* Update data structures if peeling will be applied alone. */
2898 : 12225 : LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2899 : 12225 : if (npeel)
2900 : 2172 : LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2901 : : else
2902 : 10053 : LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = -1;
2903 : 12225 : SET_DR_MISALIGNMENT (dr0_info,
2904 : : vect_dr_misalign_for_aligned_access (dr0_info));
2905 : 12225 : if (dump_enabled_p ())
2906 : : {
2907 : 328 : dump_printf_loc (MSG_NOTE, vect_location,
2908 : : "Alignment of access forced using peeling.\n");
2909 : 328 : dump_printf_loc (MSG_NOTE, vect_location,
2910 : : "Peeling for alignment will be applied.\n");
2911 : : }
2912 : :
2913 : : /* The inside-loop cost will be accounted for in vectorizable_load
2914 : : and vectorizable_store correctly with adjusted alignments.
2915 : : Drop the body_cst_vec on the floor here. */
2916 : 12225 : return opt_result::success ();
2917 : : }
2918 : : }
2919 : :
2920 : : /* (2) Versioning to force alignment. */
2921 : :
2922 : : /* Try versioning if:
2923 : : 1) optimize loop for speed and the cost-model is not cheap
2924 : : 2) there is at least one unsupported misaligned data ref with an unknown
2925 : : misalignment, and
2926 : : 3) all misaligned data refs with a known misalignment are supported, and
2927 : : 4) the number of runtime alignment checks is within reason. */
2928 : :
2929 : 308385 : do_versioning
2930 : 308385 : = (optimize_loop_nest_for_speed_p (loop)
2931 : 307957 : && !loop->inner /* FORNOW */
2932 : 614948 : && loop_cost_model (loop) > VECT_COST_MODEL_CHEAP);
2933 : :
2934 : : if (do_versioning)
2935 : : {
2936 : 300824 : FOR_EACH_VEC_ELT (datarefs, i, dr)
2937 : : {
2938 : 224921 : dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2939 : 224921 : if (!vect_relevant_for_alignment_p (dr_info))
2940 : 162196 : continue;
2941 : :
2942 : 153590 : stmt_vec_info stmt_info = dr_info->stmt;
2943 : 153590 : if (STMT_VINFO_STRIDED_P (stmt_info))
2944 : : {
2945 : : do_versioning = false;
2946 : 5192 : break;
2947 : : }
2948 : :
2949 : 152716 : tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2950 : 152716 : bool negative = tree_int_cst_compare (DR_STEP (dr),
2951 : 152716 : size_zero_node) < 0;
2952 : 152716 : poly_int64 off = 0;
2953 : 152716 : if (negative)
2954 : 3065 : off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2955 : 3065 : * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2956 : 152716 : int misalignment;
2957 : 152716 : if ((misalignment = dr_misalignment (dr_info, vectype, off)) == 0)
2958 : 90865 : continue;
2959 : :
2960 : 61851 : enum dr_alignment_support supportable_dr_alignment
2961 : 61851 : = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2962 : : misalignment);
2963 : 61851 : if (supportable_dr_alignment == dr_unaligned_unsupported)
2964 : : {
2965 : 17131 : if (misalignment != DR_MISALIGNMENT_UNKNOWN
2966 : 17131 : || (LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2967 : 13421 : >= (unsigned) param_vect_max_version_for_alignment_checks))
2968 : : {
2969 : : do_versioning = false;
2970 : 5192 : break;
2971 : : }
2972 : :
2973 : : /* Forcing alignment in the first iteration is no good if
2974 : : we don't keep it across iterations. For now, just disable
2975 : : versioning in this case.
2976 : : ?? We could actually unroll the loop to achieve the required
2977 : : overall step alignment, and forcing the alignment could be
2978 : : done by doing some iterations of the non-vectorized loop. */
2979 : 12941 : if (!multiple_p (vf * DR_STEP_ALIGNMENT (dr),
2980 : 12941 : DR_TARGET_ALIGNMENT (dr_info)))
2981 : : {
2982 : : do_versioning = false;
2983 : : break;
2984 : : }
2985 : :
2986 : : /* Use "mask = DR_TARGET_ALIGNMENT - 1" to test rightmost address
2987 : : bits for runtime alignment check. For example, for 16 bytes
2988 : : target alignment the mask is 15 = 0xf. */
2989 : 12941 : poly_uint64 mask = DR_TARGET_ALIGNMENT (dr_info) - 1;
2990 : :
2991 : : /* FORNOW: use the same mask to test all potentially unaligned
2992 : : references in the loop. */
2993 : 12941 : if (maybe_ne (LOOP_VINFO_PTR_MASK (loop_vinfo), 0U)
2994 : 12941 : && maybe_ne (LOOP_VINFO_PTR_MASK (loop_vinfo), mask))
2995 : : {
2996 : : do_versioning = false;
2997 : : break;
2998 : : }
2999 : :
3000 : 12813 : LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
3001 : 12813 : LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
3002 : : }
3003 : : }
3004 : :
3005 : : /* Versioning requires at least one misaligned data reference. */
3006 : 81095 : if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3007 : : do_versioning = false;
3008 : 6338 : else if (!do_versioning)
3009 : 632 : LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
3010 : : }
3011 : :
3012 : : /* If we are trying peeling with versioning but versioning is disabled for
3013 : : some reason, peeling should be turned off together. */
3014 : 308385 : if (try_peeling_with_versioning && !do_versioning)
3015 : : do_peeling = false;
3016 : :
3017 : 296015 : if (do_versioning)
3018 : : {
3019 : : const vec<stmt_vec_info> &may_misalign_stmts
3020 : : = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
3021 : : stmt_vec_info stmt_info;
3022 : :
3023 : : /* It can now be assumed that the data references in the statements
3024 : : in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
3025 : : of the loop being vectorized. */
3026 : 15403 : FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
3027 : : {
3028 : 9697 : dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
3029 : 9697 : SET_DR_MISALIGNMENT (dr_info,
3030 : : vect_dr_misalign_for_aligned_access (dr_info));
3031 : 9697 : if (dump_enabled_p ())
3032 : 141 : dump_printf_loc (MSG_NOTE, vect_location,
3033 : : "Alignment of access forced using versioning.\n");
3034 : : }
3035 : :
3036 : 5706 : if (do_peeling)
3037 : : {
3038 : : /* This point is reached if peeling and versioning are used together
3039 : : to ensure alignment. Update data structures to make sure the loop
3040 : : is correctly peeled and a right runtime check is added for loop
3041 : : versioning. */
3042 : 1496 : gcc_assert (try_peeling_with_versioning);
3043 : 1496 : LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
3044 : 1496 : LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = -1;
3045 : 1496 : LOOP_VINFO_ALLOW_MUTUAL_ALIGNMENT (loop_vinfo) = true;
3046 : 1496 : if (dump_enabled_p ())
3047 : 10 : dump_printf_loc (MSG_NOTE, vect_location,
3048 : : "Both peeling and versioning will be applied.\n");
3049 : : }
3050 : : else
3051 : : {
3052 : : /* This point is reached if versioning is used alone. */
3053 : 4210 : LOOP_VINFO_ALLOW_MUTUAL_ALIGNMENT (loop_vinfo) = false;
3054 : 4210 : if (dump_enabled_p ())
3055 : 74 : dump_printf_loc (MSG_NOTE, vect_location,
3056 : : "Versioning for alignment will be applied.\n");
3057 : : }
3058 : :
3059 : 5706 : return opt_result::success ();
3060 : : }
3061 : :
3062 : : /* This point is reached if neither peeling nor versioning is being done. */
3063 : 302679 : gcc_assert (! (do_peeling || do_versioning));
3064 : :
3065 : 302679 : return opt_result::success ();
3066 : 654477 : }
3067 : :
3068 : :
3069 : : /* Function vect_analyze_data_refs_alignment
3070 : :
3071 : : Analyze the alignment of the data-references in the loop.
3072 : : Return FALSE if a data reference is found that cannot be vectorized. */
3073 : :
3074 : : opt_result
3075 : 366447 : vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo)
3076 : : {
3077 : 366447 : DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
3078 : :
3079 : 366447 : vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
3080 : 366447 : struct data_reference *dr;
3081 : 366447 : unsigned int i;
3082 : :
3083 : 366447 : vect_record_base_alignments (loop_vinfo);
3084 : 1197867 : FOR_EACH_VEC_ELT (datarefs, i, dr)
3085 : : {
3086 : 831420 : dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
3087 : 831420 : if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
3088 : : {
 : : /* Alignment of a grouped access is computed on the group
 : : leader only, so skip non-leader members. */
3089 : 831420 : if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt)
3090 : 1094151 : && DR_GROUP_FIRST_ELEMENT (dr_info->stmt) != dr_info->stmt)
3091 : 118906 : continue;
3092 : :
3093 : 712514 : vect_compute_data_ref_alignment (loop_vinfo, dr_info,
3094 : : STMT_VINFO_VECTYPE (dr_info->stmt));
3095 : : }
3096 : : }
3097 : :
 : : /* Analysis here never fails outright; any misalignment is recorded
 : : per data reference and dealt with by later phases. */
3098 : 366447 : return opt_result::success ();
3099 : : }
3100 : :
3101 : :
3102 : : /* Analyze alignment of DRs of stmts in NODE. */
3103 : :
3104 : : static bool
3105 : 813512 : vect_slp_analyze_node_alignment (vec_info *vinfo, slp_tree node)
3106 : : {
3107 : : /* Alignment is maintained in the first element of the group. */
3108 : 813512 : stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
3109 : 813512 : first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
3110 : 813512 : dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
3111 : 813512 : tree vectype = SLP_TREE_VECTYPE (node);
 : : /* Target-preferred alignment for VECTYPE, converted from bits
 : : to bytes. */
3112 : 813512 : poly_uint64 vector_alignment
3113 : 813512 : = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
3114 : : BITS_PER_UNIT);
3115 : 813512 : if (dr_info->misalignment == DR_MISALIGNMENT_UNINITIALIZED)
3116 : 773547 : vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
3117 : : /* Re-analyze alignment when we're facing a vectorization with a bigger
3118 : : alignment requirement. */
3119 : 39965 : else if (known_lt (dr_info->target_alignment, vector_alignment))
3120 : : {
3121 : 67 : poly_uint64 old_target_alignment = dr_info->target_alignment;
3122 : 67 : int old_misalignment = dr_info->misalignment;
3123 : 67 : vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
3124 : : /* But keep knowledge about a smaller alignment. */
3125 : 67 : if (old_misalignment != DR_MISALIGNMENT_UNKNOWN
3126 : 38 : && dr_info->misalignment == DR_MISALIGNMENT_UNKNOWN)
3127 : : {
3128 : 1 : dr_info->target_alignment = old_target_alignment;
3129 : 1 : dr_info->misalignment = old_misalignment;
3130 : : }
3131 : : }
3132 : : /* When we ever face unordered target alignments the first one wins in terms
3133 : : of analyzing and the other will become unknown in dr_misalignment. */
 : : /* This analysis is best-effort; it records (mis)alignment on DR_INFO
 : : and always reports success. */
3134 : 813512 : return true;
3135 : : }
3136 : :
3137 : : /* Function vect_slp_analyze_instance_alignment
3138 : :
3139 : : Analyze the alignment of the data-references in the SLP instance.
3140 : : Return FALSE if a data reference is found that cannot be vectorized. */
3141 : :
3142 : : bool
3143 : 787782 : vect_slp_analyze_instance_alignment (vec_info *vinfo,
3144 : : slp_instance instance)
3145 : : {
3146 : 787782 : DUMP_VECT_SCOPE ("vect_slp_analyze_instance_alignment");
3147 : :
3148 : 787782 : slp_tree node;
3149 : 787782 : unsigned i;
 : : /* Analyze every load node of the instance, and additionally the
 : : instance root when the instance is a store. */
3150 : 945825 : FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
3151 : 158043 : if (! vect_slp_analyze_node_alignment (vinfo, node))
3152 : : return false;
3153 : :
3154 : 787782 : if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
3155 : 787782 : && ! vect_slp_analyze_node_alignment
3156 : 655469 : (vinfo, SLP_INSTANCE_TREE (instance)))
3157 : : return false;
3158 : :
3159 : : return true;
3160 : : }
3161 : :
3162 : :
3163 : : /* Analyze groups of accesses: check that DR_INFO belongs to a group of
3164 : : accesses of legal size, step, etc. Detect gaps, single element
3165 : : interleaving, and other special cases. Set grouped access info.
3166 : : Collect groups of strided stores for further use in SLP analysis.
3167 : : Worker for vect_analyze_group_access. */
3168 : :
3169 : : static bool
3170 : 12624263 : vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info)
3171 : : {
 : : /* Returns false when DR_INFO's access cannot be handled as (part of)
 : : a group; the caller then dissolves any partially-built group. */
3172 : 12624263 : data_reference *dr = dr_info->dr;
3173 : 12624263 : tree step = DR_STEP (dr);
3174 : 12624263 : tree scalar_type = TREE_TYPE (DR_REF (dr));
3175 : 12624263 : HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
3176 : 12624263 : stmt_vec_info stmt_info = dr_info->stmt;
3177 : 12624263 : loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
3178 : 12624263 : bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
3179 : 12624263 : HOST_WIDE_INT dr_step = -1;
3180 : 12624263 : HOST_WIDE_INT groupsize, last_accessed_element = 1;
3181 : 12624263 : bool slp_impossible = false;
3182 : :
3183 : : /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
3184 : : size of the interleaving group (including gaps). */
3185 : 12624263 : if (tree_fits_shwi_p (step))
3186 : : {
3187 : 12616285 : dr_step = tree_to_shwi (step);
3188 : : /* Check that STEP is a multiple of type size. Otherwise there is
3189 : : a non-element-sized gap at the end of the group which we
3190 : : cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
3191 : : ??? As we can handle non-constant step fine here we should
3192 : : simply remove uses of DR_GROUP_GAP between the last and first
3193 : : element and instead rely on DR_STEP. DR_GROUP_SIZE then would
3194 : : simply not include that gap. */
3195 : 12616285 : if ((dr_step % type_size) != 0)
3196 : : {
3197 : 494 : if (dump_enabled_p ())
3198 : 27 : dump_printf_loc (MSG_NOTE, vect_location,
3199 : : "Step %T is not a multiple of the element size"
3200 : : " for %T\n",
3201 : : step, DR_REF (dr));
3202 : 494 : return false;
3203 : : }
3204 : 12615791 : groupsize = absu_hwi (dr_step) / type_size;
3205 : : }
 : : /* Non-constant step: the group size cannot be derived from it. */
3206 : : else
3207 : : groupsize = 0;
3208 : :
3209 : : /* Not consecutive access is possible only if it is a part of interleaving. */
3210 : 12623769 : if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
3211 : : {
3212 : : /* Check if this DR is a part of interleaving, and is a single
3213 : : element of the group that is accessed in the loop. */
3214 : :
3215 : : /* Gaps are supported only for loads. STEP must be a multiple of the type
3216 : : size. */
3217 : 8544148 : if (DR_IS_READ (dr)
3218 : 5127621 : && (dr_step % type_size) == 0
3219 : : && groupsize > 0
3220 : : /* This could be UINT_MAX but as we are generating code in a very
3221 : : inefficient way we have to cap earlier.
3222 : : See PR91403 for example. */
3223 : 5127621 : && groupsize <= 4096)
3224 : : {
3225 : 60672 : DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
3226 : 60672 : DR_GROUP_SIZE (stmt_info) = groupsize;
3227 : 60672 : DR_GROUP_GAP (stmt_info) = groupsize - 1;
3228 : 60672 : if (dump_enabled_p ())
3229 : 1341 : dump_printf_loc (MSG_NOTE, vect_location,
3230 : : "Detected single element interleaving %T"
3231 : : " step %T\n",
3232 : : DR_REF (dr), step);
3233 : :
3234 : 60672 : return true;
3235 : : }
3236 : :
3237 : 8483476 : if (dump_enabled_p ())
3238 : 3093 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3239 : : "not consecutive access %G", stmt_info->stmt);
3240 : :
3241 : 8483476 : if (bb_vinfo)
3242 : : {
3243 : : /* Mark the statement as unvectorizable. */
3244 : 8464714 : STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3245 : 8464714 : return true;
3246 : : }
3247 : :
3248 : 18762 : if (dump_enabled_p ())
3249 : 294 : dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
3250 : 18762 : STMT_VINFO_STRIDED_P (stmt_info) = true;
3251 : 18762 : return true;
3252 : : }
3253 : :
3254 : 4079621 : if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
3255 : : {
3256 : : /* First stmt in the interleaving chain. Check the chain. */
3257 : 1489518 : stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
3258 : 1489518 : struct data_reference *data_ref = dr;
3259 : 1489518 : unsigned int count = 1;
3260 : 1489518 : tree prev_init = DR_INIT (data_ref);
3261 : 1489518 : HOST_WIDE_INT diff, gaps = 0;
3262 : :
3263 : : /* By construction, all group members have INTEGER_CST DR_INITs. */
3264 : 4079630 : while (next)
3265 : : {
3266 : : /* We never have the same DR multiple times. */
3267 : 2590174 : gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
3268 : : DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
3269 : :
3270 : 2590174 : data_ref = STMT_VINFO_DATA_REF (next);
3271 : :
3272 : : /* All group members have the same STEP by construction. */
3273 : 2590174 : gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
3274 : :
3275 : : /* Check that the distance between two accesses is equal to the type
3276 : : size. Otherwise, we have gaps. */
3277 : 2590174 : diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
3278 : 2590174 : - TREE_INT_CST_LOW (prev_init)) / type_size;
3279 : 2590174 : if (diff < 1 || diff > UINT_MAX)
3280 : : {
3281 : : /* For artificial testcases with array accesses with large
3282 : : constant indices we can run into overflow issues which
3283 : : can end up fooling the groupsize constraint below so
3284 : : check the individual gaps (which are represented as
3285 : : unsigned int) as well. */
3286 : 0 : if (dump_enabled_p ())
3287 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3288 : : "interleaved access with gap larger "
3289 : : "than representable\n");
3290 : 0 : return false;
3291 : : }
3292 : 2590174 : if (diff != 1)
3293 : : {
3294 : : /* FORNOW: SLP of accesses with gaps is not supported. */
3295 : 102056 : slp_impossible = true;
3296 : 102056 : if (DR_IS_WRITE (data_ref))
3297 : : {
3298 : 62 : if (dump_enabled_p ())
3299 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3300 : : "interleaved store with gaps\n");
3301 : 62 : return false;
3302 : : }
3303 : :
3304 : 101994 : gaps += diff - 1;
3305 : : }
3306 : :
3307 : 2590112 : last_accessed_element += diff;
3308 : :
3309 : : /* Store the gap from the previous member of the group. If there is no
3310 : : gap in the access, DR_GROUP_GAP is always 1. */
3311 : 2590112 : DR_GROUP_GAP (next) = diff;
3312 : :
3313 : 2590112 : prev_init = DR_INIT (data_ref);
3314 : 2590112 : next = DR_GROUP_NEXT_ELEMENT (next);
3315 : : /* Count the number of data-refs in the chain. */
3316 : 2590112 : count++;
3317 : : }
3318 : :
 : : /* With a non-constant step the group size is the number of members
 : : plus the gaps observed between them. */
3319 : 1489456 : if (groupsize == 0)
3320 : 1426651 : groupsize = count + gaps;
3321 : :
3322 : : /* This could be UINT_MAX but as we are generating code in a very
3323 : : inefficient way we have to cap earlier. See PR78699 for example. */
3324 : 1489456 : if (groupsize > 4096)
3325 : : {
3326 : 1 : if (dump_enabled_p ())
3327 : 1 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3328 : : "group is too large\n");
3329 : 1 : return false;
3330 : : }
3331 : :
3332 : : /* Check that the size of the interleaving is equal to count for stores,
3333 : : i.e., that there are no gaps. */
3334 : 1489455 : if (groupsize != count
3335 : 109216 : && !DR_IS_READ (dr))
3336 : : {
3337 : 10649 : groupsize = count;
3338 : 10649 : STMT_VINFO_STRIDED_P (stmt_info) = true;
3339 : : }
3340 : :
3341 : : /* If there is a gap after the last load in the group it is the
3342 : : difference between the groupsize and the last accessed
3343 : : element.
3344 : : When there is no gap, this difference should be 0. */
3345 : 1489455 : DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
3346 : :
3347 : 1489455 : DR_GROUP_SIZE (stmt_info) = groupsize;
3348 : 1489455 : if (dump_enabled_p ())
3349 : : {
3350 : 7726 : dump_printf_loc (MSG_NOTE, vect_location,
3351 : : "Detected interleaving ");
3352 : 7726 : if (DR_IS_READ (dr))
3353 : 4150 : dump_printf (MSG_NOTE, "load ");
3354 : 3576 : else if (STMT_VINFO_STRIDED_P (stmt_info))
3355 : 468 : dump_printf (MSG_NOTE, "strided store ");
3356 : : else
3357 : 3108 : dump_printf (MSG_NOTE, "store ");
3358 : 7726 : dump_printf (MSG_NOTE, "of size %u\n",
3359 : : (unsigned)groupsize);
3360 : 7726 : dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
3361 : 7726 : next = DR_GROUP_NEXT_ELEMENT (stmt_info);
3362 : 37881 : while (next)
3363 : : {
3364 : 30155 : if (DR_GROUP_GAP (next) != 1)
3365 : 271 : dump_printf_loc (MSG_NOTE, vect_location,
3366 : : "\t<gap of %d elements>\n",
3367 : 271 : DR_GROUP_GAP (next) - 1);
3368 : 30155 : dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
3369 : 30155 : next = DR_GROUP_NEXT_ELEMENT (next);
3370 : : }
3371 : 7726 : if (DR_GROUP_GAP (stmt_info) != 0)
3372 : 345 : dump_printf_loc (MSG_NOTE, vect_location,
3373 : : "\t<gap of %d elements>\n",
3374 : 345 : DR_GROUP_GAP (stmt_info));
3375 : : }
3376 : :
3377 : : /* SLP: create an SLP data structure for every interleaving group of
3378 : : stores for further analysis in vect_analyse_slp. */
3379 : 1489455 : if (DR_IS_WRITE (dr) && !slp_impossible)
3380 : : {
3381 : 909478 : if (loop_vinfo)
3382 : 23779 : LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
3383 : 909478 : if (bb_vinfo)
3384 : 885699 : BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
3385 : : }
3386 : : }
3387 : :
3388 : : return true;
3389 : : }
3390 : :
3391 : : /* Analyze groups of accesses: check that DR_INFO belongs to a group of
3392 : : accesses of legal size, step, etc. Detect gaps, single element
3393 : : interleaving, and other special cases. Set grouped access info.
3394 : : Collect groups of strided stores for further use in SLP analysis. */
3395 : :
3396 : : static bool
3397 : 12624263 : vect_analyze_group_access (vec_info *vinfo, dr_vec_info *dr_info)
3398 : : {
3399 : 12624263 : if (!vect_analyze_group_access_1 (vinfo, dr_info))
3400 : : {
3401 : : /* Dissolve the group if present. */
 : : /* Clearing both links turns every former member back into an
 : : ungrouped access. */
3402 : 557 : stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
3403 : 1596 : while (stmt_info)
3404 : : {
3405 : 1039 : stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
3406 : 1039 : DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3407 : 1039 : DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
3408 : 1039 : stmt_info = next;
3409 : : }
3410 : : return false;
3411 : : }
3412 : : return true;
3413 : : }
3414 : :
3415 : : /* Analyze the access pattern of the data-reference DR_INFO.
3416 : : In case of non-consecutive accesses call vect_analyze_group_access() to
3417 : : analyze groups of accesses. */
3418 : :
3419 : : static bool
3420 : 13253929 : vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info)
3421 : : {
3422 : 13253929 : data_reference *dr = dr_info->dr;
3423 : 13253929 : tree step = DR_STEP (dr);
3424 : 13253929 : tree scalar_type = TREE_TYPE (DR_REF (dr));
3425 : 13253929 : stmt_vec_info stmt_info = dr_info->stmt;
3426 : 13253929 : loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
3427 : 13253929 : class loop *loop = NULL;
3428 : :
 : : /* Gather/scatter accesses are not subject to step/group analysis
 : : here. */
3429 : 13253929 : if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
3430 : : return true;
3431 : :
3432 : 13148407 : if (loop_vinfo)
3433 : 790428 : loop = LOOP_VINFO_LOOP (loop_vinfo);
3434 : :
3435 : 13148407 : if (loop_vinfo && !step)
3436 : : {
3437 : 0 : if (dump_enabled_p ())
3438 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3439 : : "bad data-ref access in loop\n")
3440 : 0 : return false;
3441 : : }
3442 : :
3443 : : /* Allow loads with zero step in inner-loop vectorization. */
3444 : 13148407 : if (loop_vinfo && integer_zerop (step))
3445 : : {
3446 : 14253 : DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3447 : 14253 : DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
3448 : 14253 : if (!nested_in_vect_loop_p (loop, stmt_info))
3449 : 13992 : return DR_IS_READ (dr);
3450 : : /* Allow references with zero step for outer loops marked
3451 : : with pragma omp simd only - it guarantees absence of
3452 : : loop-carried dependencies between inner loop iterations. */
3453 : 261 : if (loop->safelen < 2)
3454 : : {
3455 : 225 : if (dump_enabled_p ())
3456 : 6 : dump_printf_loc (MSG_NOTE, vect_location,
3457 : : "zero step in inner loop of nest\n");
3458 : 225 : return false;
3459 : : }
3460 : : }
3461 : :
3462 : 13134154 : if (loop && nested_in_vect_loop_p (loop, stmt_info))
3463 : : {
3464 : : /* Interleaved accesses are not yet supported within outer-loop
3465 : : vectorization for references in the inner-loop. */
3466 : 5594 : DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3467 : 5594 : DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
3468 : :
3469 : : /* For the rest of the analysis we use the outer-loop step. */
3470 : 5594 : step = STMT_VINFO_DR_STEP (stmt_info);
3471 : 5594 : if (integer_zerop (step))
3472 : : {
3473 : 1220 : if (dump_enabled_p ())
3474 : 228 : dump_printf_loc (MSG_NOTE, vect_location,
3475 : : "zero step in outer loop.\n");
3476 : 1220 : return DR_IS_READ (dr);
3477 : : }
3478 : : }
3479 : :
3480 : : /* Consecutive? */
3481 : 13132970 : if (TREE_CODE (step) == INTEGER_CST)
3482 : : {
3483 : 13096541 : HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
3484 : 13096541 : if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
3485 : 13096541 : || (dr_step < 0
3486 : 27090 : && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
3487 : : {
3488 : : /* Mark that it is not interleaving. */
3489 : 477404 : DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3490 : 477404 : DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
3491 : 477404 : return true;
3492 : : }
3493 : : }
3494 : :
3495 : 12655566 : if (loop && nested_in_vect_loop_p (loop, stmt_info))
3496 : : {
3497 : 3282 : if (dump_enabled_p ())
3498 : 146 : dump_printf_loc (MSG_NOTE, vect_location,
3499 : : "grouped access in outer loop.\n");
3500 : 3282 : return false;
3501 : : }
3502 : :
3503 : :
3504 : : /* Assume this is a DR handled by non-constant strided load case. */
3505 : 12652284 : if (TREE_CODE (step) != INTEGER_CST)
3506 : 35999 : return (STMT_VINFO_STRIDED_P (stmt_info)
3507 : 35999 : && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
3508 : 7978 : || vect_analyze_group_access (vinfo, dr_info)));
3509 : :
3510 : : /* Not consecutive access - check if it's a part of interleaving group. */
3511 : 12616285 : return vect_analyze_group_access (vinfo, dr_info);
3512 : : }
3513 : :
3514 : : /* Compare two data-references DRA and DRB to group them into chunks
3515 : : suitable for grouping. */
3516 : :
3517 : : static int
3518 : 348644776 : dr_group_sort_cmp (const void *dra_, const void *drb_)
3519 : : {
 : : /* qsort comparator: compares group id, base address, offset,
 : : read/write kind, access size, step and init; the final stmt-UID
 : : tie-break makes the ordering total and the sort stable. */
3520 : 348644776 : dr_vec_info *dra_info = *(dr_vec_info **)const_cast<void *>(dra_);
3521 : 348644776 : dr_vec_info *drb_info = *(dr_vec_info **)const_cast<void *>(drb_);
3522 : 348644776 : data_reference_p dra = dra_info->dr;
3523 : 348644776 : data_reference_p drb = drb_info->dr;
3524 : 348644776 : int cmp;
3525 : :
3526 : : /* Stabilize sort. */
3527 : 348644776 : if (dra == drb)
3528 : : return 0;
3529 : :
3530 : : /* DRs with different group IDs never belong to the same group. */
3531 : 348644776 : if (dra_info->group != drb_info->group)
3532 : 384053865 : return dra_info->group < drb_info->group ? -1 : 1;
3533 : :
3534 : : /* Ordering of DRs according to base. */
3535 : 95956492 : cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3536 : : DR_BASE_ADDRESS (drb));
3537 : 95956492 : if (cmp != 0)
3538 : : return cmp;
3539 : :
3540 : : /* And according to DR_OFFSET. */
3541 : 52033242 : cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
3542 : 52033242 : if (cmp != 0)
3543 : : return cmp;
3544 : :
3545 : : /* Put reads before writes. */
3546 : 51679628 : if (DR_IS_READ (dra) != DR_IS_READ (drb))
3547 : 4120820 : return DR_IS_READ (dra) ? -1 : 1;
3548 : :
3549 : : /* Then sort after access size. */
3550 : 48874476 : cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
3551 : 48874476 : TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
3552 : 48874476 : if (cmp != 0)
3553 : : return cmp;
3554 : :
3555 : : /* And after step. */
3556 : 42309365 : cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
3557 : 42309365 : if (cmp != 0)
3558 : : return cmp;
3559 : :
3560 : : /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
3561 : 42303230 : cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
3562 : 42303230 : if (cmp == 0)
3563 : 500296 : return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
3564 : : return cmp;
3565 : : }
3566 : :
3567 : : /* If OP is the result of a conversion, return the unconverted value,
3568 : : otherwise return null. */
3569 : :
3570 : : static tree
3571 : 359 : strip_conversion (tree op)
3572 : : {
3573 : 359 : if (TREE_CODE (op) != SSA_NAME)
3574 : : return NULL_TREE;
3575 : 359 : gimple *stmt = SSA_NAME_DEF_STMT (op);
3576 : 359 : if (!is_gimple_assign (stmt)
3577 : 359 : || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
3578 : : return NULL_TREE;
3579 : 182 : return gimple_assign_rhs1 (stmt);
3580 : : }
3581 : :
3582 : : /* Return true if vectorizable_* routines can handle statements STMT1_INFO
3583 : : and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can
3584 : : be grouped in SLP mode. */
3585 : :
3586 : : static bool
3587 : 6939339 : can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
3588 : : bool allow_slp_p)
3589 : : {
 : : /* A plain (single-assignment) load/store groups only with another
 : : plain load/store. */
3590 : 6939339 : if (gimple_assign_single_p (stmt1_info->stmt))
3591 : 6937736 : return gimple_assign_single_p (stmt2_info->stmt);
3592 : :
3593 : 1603 : gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
3594 : 1603 : if (call1 && gimple_call_internal_p (call1))
3595 : : {
3596 : : /* Check for two masked loads or two masked stores. */
3597 : 1815 : gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
3598 : 1592 : if (!call2 || !gimple_call_internal_p (call2))
3599 : : return false;
3600 : 1592 : internal_fn ifn = gimple_call_internal_fn (call1);
3601 : 1592 : if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
3602 : : return false;
3603 : 1592 : if (ifn != gimple_call_internal_fn (call2))
3604 : : return false;
3605 : :
3606 : : /* Check that the masks are the same. Cope with casts of masks,
3607 : : like those created by build_mask_conversion. */
 : : /* With ALLOW_SLP_P differing masks are acceptable; otherwise the
 : : masks must match, possibly after stripping one conversion. */
3608 : 1592 : tree mask1 = gimple_call_arg (call1, 2);
3609 : 1592 : tree mask2 = gimple_call_arg (call2, 2);
3610 : 1592 : if (!operand_equal_p (mask1, mask2, 0) && !allow_slp_p)
3611 : : {
3612 : 268 : mask1 = strip_conversion (mask1);
3613 : 268 : if (!mask1)
3614 : : return false;
3615 : 91 : mask2 = strip_conversion (mask2);
3616 : 91 : if (!mask2)
3617 : : return false;
3618 : 91 : if (!operand_equal_p (mask1, mask2, 0))
3619 : : return false;
3620 : : }
3621 : 1380 : return true;
3622 : : }
3623 : :
 : : /* Anything else (non-internal calls, other statement kinds) cannot
 : : be grouped. */
3624 : : return false;
3625 : : }
3626 : :
3627 : : /* Function vect_analyze_data_ref_accesses.
3628 : :
3629 : : Analyze the access pattern of all the data references in the loop.
3630 : :
3631 : : FORNOW: the only access pattern that is considered vectorizable is a
3632 : : simple step 1 (consecutive) access.
3633 : :
3634 : : FORNOW: handle only arrays and pointer accesses. */
3635 : :
3636 : : opt_result
3637 : 2597285 : vect_analyze_data_ref_accesses (vec_info *vinfo,
3638 : : vec<int> *dataref_groups)
3639 : : {
3640 : 2597285 : unsigned int i;
3641 : 2597285 : vec<data_reference_p> datarefs = vinfo->shared->datarefs;
3642 : :
3643 : 2597285 : DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
3644 : :
3645 : 2597285 : if (datarefs.is_empty ())
3646 : 1067715 : return opt_result::success ();
3647 : :
3648 : : /* Sort the array of datarefs to make building the interleaving chains
3649 : : linear. Don't modify the original vector's order, it is needed for
3650 : : determining what dependencies are reversed. */
3651 : 1529570 : vec<dr_vec_info *> datarefs_copy;
3652 : 1529570 : datarefs_copy.create (datarefs.length ());
3653 : 16576762 : for (unsigned i = 0; i < datarefs.length (); i++)
3654 : : {
3655 : 15047192 : dr_vec_info *dr_info = vinfo->lookup_dr (datarefs[i]);
3656 : : /* If the caller computed DR grouping use that, otherwise group by
3657 : : basic blocks. */
3658 : 15047192 : if (dataref_groups)
3659 : 14137441 : dr_info->group = (*dataref_groups)[i];
3660 : : else
3661 : 909751 : dr_info->group = gimple_bb (DR_STMT (datarefs[i]))->index;
3662 : 15047192 : datarefs_copy.quick_push (dr_info);
3663 : : }
3664 : 1529570 : datarefs_copy.qsort (dr_group_sort_cmp);
3665 : 1529570 : hash_set<stmt_vec_info> to_fixup;
3666 : :
3667 : : /* Build the interleaving chains. */
3668 : 14193300 : for (i = 0; i < datarefs_copy.length () - 1;)
3669 : : {
3670 : 11134160 : dr_vec_info *dr_info_a = datarefs_copy[i];
3671 : 11134160 : data_reference_p dra = dr_info_a->dr;
3672 : 11134160 : int dra_group_id = dr_info_a->group;
3673 : 11134160 : stmt_vec_info stmtinfo_a = dr_info_a->stmt;
3674 : 11134160 : stmt_vec_info lastinfo = NULL;
3675 : 11134160 : if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
3676 : 9492886 : || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
3677 : : {
3678 : 1709401 : ++i;
3679 : 1709401 : continue;
3680 : : }
3681 : 24713742 : for (i = i + 1; i < datarefs_copy.length (); ++i)
3682 : : {
3683 : 11808221 : dr_vec_info *dr_info_b = datarefs_copy[i];
3684 : 11808221 : data_reference_p drb = dr_info_b->dr;
3685 : 11808221 : int drb_group_id = dr_info_b->group;
3686 : 11808221 : stmt_vec_info stmtinfo_b = dr_info_b->stmt;
3687 : 11808221 : if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
3688 : 11498452 : || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
3689 : : break;
3690 : :
3691 : : /* ??? Imperfect sorting (non-compatible types, non-modulo
3692 : : accesses, same accesses) can lead to a group to be artificially
3693 : : split here as we don't just skip over those. If it really
3694 : : matters we can push those to a worklist and re-iterate
3695 : : over them. The we can just skip ahead to the next DR here. */
3696 : :
3697 : : /* DRs in a different DR group should not be put into the same
3698 : : interleaving group. */
3699 : 11494823 : if (dra_group_id != drb_group_id)
3700 : : break;
3701 : :
3702 : : /* Check that the data-refs have same first location (except init)
3703 : : and they are both either store or load (not load and store,
3704 : : not masked loads or stores). */
3705 : 7197304 : if (DR_IS_READ (dra) != DR_IS_READ (drb)
3706 : 5924608 : || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3707 : : DR_BASE_ADDRESS (drb)) != 0
3708 : 4334738 : || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
3709 : 11511767 : || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
3710 : : break;
3711 : :
3712 : : /* Check that the data-refs have the same constant size. */
3713 : 4314446 : tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
3714 : 4314446 : tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
3715 : 4314446 : if (!tree_fits_uhwi_p (sza)
3716 : 4314446 : || !tree_fits_uhwi_p (szb)
3717 : 8628892 : || !tree_int_cst_equal (sza, szb))
3718 : : break;
3719 : :
3720 : : /* Check that the data-refs have the same step. */
3721 : 3967491 : if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
3722 : : break;
3723 : :
3724 : : /* Check the types are compatible.
3725 : : ??? We don't distinguish this during sorting. */
3726 : 3966880 : if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3727 : 3966880 : TREE_TYPE (DR_REF (drb))))
3728 : : break;
3729 : :
3730 : : /* Check that the DR_INITs are compile-time constants. */
3731 : 2835859 : if (!tree_fits_shwi_p (DR_INIT (dra))
3732 : 2835859 : || !tree_fits_shwi_p (DR_INIT (drb)))
3733 : : break;
3734 : :
3735 : : /* Different .GOMP_SIMD_LANE calls still give the same lane,
3736 : : just hold extra information. */
3737 : 2835859 : if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
3738 : 1240 : && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
3739 : 2837099 : && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
3740 : : break;
3741 : :
3742 : : /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3743 : 2834619 : HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3744 : 2834619 : HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3745 : 2834619 : HOST_WIDE_INT init_prev
3746 : 2834619 : = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]->dr));
3747 : 2834619 : gcc_assert (init_a <= init_b
3748 : : && init_a <= init_prev
3749 : : && init_prev <= init_b);
3750 : :
3751 : : /* Do not place the same access in the interleaving chain twice. */
3752 : 2834619 : if (init_b == init_prev)
3753 : : {
3754 : 30222 : gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]->dr))
3755 : : < gimple_uid (DR_STMT (drb)));
3756 : : /* Simply link in duplicates and fix up the chain below. */
3757 : : }
3758 : : else
3759 : : {
3760 : : /* If init_b == init_a + the size of the type * k, we have an
3761 : : interleaving, and DRA is accessed before DRB. */
3762 : 2804397 : unsigned HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3763 : 2804397 : if (type_size_a == 0
3764 : 2804397 : || (((unsigned HOST_WIDE_INT)init_b - init_a)
3765 : 2804397 : % type_size_a != 0))
3766 : : break;
3767 : :
3768 : : /* If we have a store, the accesses are adjacent. This splits
3769 : : groups into chunks we support (we don't support vectorization
3770 : : of stores with gaps). */
3771 : 2802597 : if (!DR_IS_READ (dra)
3772 : 1836189 : && (((unsigned HOST_WIDE_INT)init_b - init_prev)
3773 : : != type_size_a))
3774 : : break;
3775 : :
3776 : : /* For datarefs with big gap, it's better to split them into different
3777 : : groups.
3778 : : .i.e a[0], a[1], a[2], .. a[7], a[100], a[101],..., a[107] */
3779 : 2621561 : if ((unsigned HOST_WIDE_INT)(init_b - init_prev)
3780 : : > MAX_BITSIZE_MODE_ANY_MODE / BITS_PER_UNIT)
3781 : : break;
3782 : :
3783 : : /* If the step (if not zero or non-constant) is smaller than the
3784 : : difference between data-refs' inits this splits groups into
3785 : : suitable sizes. */
3786 : 2612563 : if (tree_fits_shwi_p (DR_STEP (dra)))
3787 : : {
3788 : 2607474 : unsigned HOST_WIDE_INT step
3789 : 2607474 : = absu_hwi (tree_to_shwi (DR_STEP (dra)));
3790 : 2607474 : if (step != 0
3791 : 137508 : && step <= ((unsigned HOST_WIDE_INT)init_b - init_a))
3792 : : break;
3793 : : }
3794 : : }
3795 : :
3796 : 2625253 : if (dump_enabled_p ())
3797 : 30971 : dump_printf_loc (MSG_NOTE, vect_location,
3798 : 30971 : DR_IS_READ (dra)
3799 : : ? "Detected interleaving load %T and %T\n"
3800 : : : "Detected interleaving store %T and %T\n",
3801 : : DR_REF (dra), DR_REF (drb));
3802 : :
3803 : : /* Link the found element into the group list. */
3804 : 2625253 : if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3805 : : {
3806 : 1467468 : DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3807 : 1467468 : lastinfo = stmtinfo_a;
3808 : : }
3809 : 2625253 : DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3810 : 2625253 : DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3811 : 2625253 : lastinfo = stmtinfo_b;
3812 : :
3813 : 2625253 : if (! STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3814 : : {
3815 : 2624876 : STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3816 : 2624876 : = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3817 : :
3818 : 2624876 : if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3819 : 126 : dump_printf_loc (MSG_NOTE, vect_location,
3820 : : "Load suitable for SLP vectorization only.\n");
3821 : : }
3822 : :
3823 : 2625253 : if (init_b == init_prev
3824 : 30222 : && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3825 : 2643102 : && dump_enabled_p ())
3826 : 218 : dump_printf_loc (MSG_NOTE, vect_location,
3827 : : "Queuing group with duplicate access for fixup\n");
3828 : : }
3829 : : }
3830 : :
3831 : : /* Fixup groups with duplicate entries by splitting it. */
3832 : 1574835 : while (1)
3833 : : {
3834 : 1574835 : hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3835 : 1574835 : if (!(it != to_fixup.end ()))
3836 : : break;
3837 : 45265 : stmt_vec_info grp = *it;
3838 : 45265 : to_fixup.remove (grp);
3839 : :
3840 : : /* Find the earliest duplicate group member. */
3841 : 45265 : unsigned first_duplicate = -1u;
3842 : 45265 : stmt_vec_info next, g = grp;
3843 : 277195 : while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3844 : : {
3845 : 186665 : if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3846 : 186665 : DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3847 : 186665 : && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3848 : : first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3849 : : g = next;
3850 : : }
3851 : 45265 : if (first_duplicate == -1U)
3852 : 17849 : continue;
3853 : :
3854 : : /* Then move all stmts after the first duplicate to a new group.
3855 : : Note this is a heuristic but one with the property that *it
3856 : : is fixed up completely. */
3857 : 27416 : g = grp;
3858 : 27416 : stmt_vec_info newgroup = NULL, ng = grp;
3859 : 239272 : while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3860 : : {
3861 : 184440 : if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3862 : : {
3863 : 178760 : DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3864 : 178760 : if (!newgroup)
3865 : : {
3866 : 27416 : newgroup = next;
3867 : 27416 : STMT_VINFO_SLP_VECT_ONLY (newgroup)
3868 : 27416 : = STMT_VINFO_SLP_VECT_ONLY (grp);
3869 : : }
3870 : : else
3871 : 151344 : DR_GROUP_NEXT_ELEMENT (ng) = next;
3872 : 178760 : ng = next;
3873 : 178760 : DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3874 : : }
3875 : : else
3876 : : g = DR_GROUP_NEXT_ELEMENT (g);
3877 : : }
3878 : 27416 : DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3879 : :
3880 : : /* Fixup the new group which still may contain duplicates. */
3881 : 27416 : to_fixup.add (newgroup);
3882 : : }
3883 : :
3884 : 1529570 : dr_vec_info *dr_info;
3885 : 16555038 : FOR_EACH_VEC_ELT (datarefs_copy, i, dr_info)
3886 : : {
3887 : 15033391 : if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3888 : 15033391 : && !vect_analyze_data_ref_access (vinfo, dr_info))
3889 : : {
3890 : 7977 : if (dump_enabled_p ())
3891 : 265 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3892 : : "not vectorized: complicated access pattern.\n");
3893 : :
3894 : 7977 : if (is_a <bb_vec_info> (vinfo))
3895 : : {
3896 : : /* Mark the statement as not vectorizable. */
3897 : 54 : STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3898 : 54 : continue;
3899 : : }
3900 : : else
3901 : : {
3902 : 7923 : datarefs_copy.release ();
3903 : 7923 : return opt_result::failure_at (dr_info->stmt->stmt,
3904 : : "not vectorized:"
3905 : : " complicated access pattern.\n");
3906 : : }
3907 : : }
3908 : : }
3909 : :
3910 : 1521647 : datarefs_copy.release ();
3911 : 1521647 : return opt_result::success ();
3912 : 1529570 : }
3913 : :
3914 : : /* Function vect_vfa_segment_size.
3915 : :
3916 : : Input:
3917 : : DR_INFO: The data reference.
3918 : : LENGTH_FACTOR: segment length to consider.
3919 : :
3920 : : Return a value suitable for the dr_with_seg_len::seg_len field.
3921 : : This is the "distance travelled" by the pointer from the first
3922 : : iteration in the segment to the last. Note that it does not include
3923 : : the size of the access; in effect it only describes the first byte. */
3924 : :
3925 : : static tree
3926 : 124346 : vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3927 : : {
3928 : 124346 : length_factor = size_binop (MINUS_EXPR,
3929 : : fold_convert (sizetype, length_factor),
3930 : : size_one_node);
3931 : 124346 : return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3932 : : length_factor);
3933 : : }
3934 : :
3935 : : /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3936 : : gives the worst-case number of bytes covered by the segment. */
3937 : :
3938 : : static unsigned HOST_WIDE_INT
3939 : 124828 : vect_vfa_access_size (vec_info *vinfo, dr_vec_info *dr_info)
3940 : : {
3941 : 124828 : stmt_vec_info stmt_vinfo = dr_info->stmt;
3942 : 124828 : tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3943 : 124828 : unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3944 : 124828 : unsigned HOST_WIDE_INT access_size = ref_size;
3945 : 124828 : if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3946 : : {
3947 : 38458 : gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3948 : 38458 : access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3949 : : }
3950 : 124828 : tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3951 : 124828 : int misalignment;
3952 : 249656 : if (((misalignment = dr_misalignment (dr_info, vectype)), true)
3953 : 124828 : && (vect_supportable_dr_alignment (vinfo, dr_info, vectype, misalignment)
3954 : : == dr_explicit_realign_optimized))
3955 : : {
3956 : : /* We might access a full vector's worth. */
3957 : 0 : access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3958 : : }
3959 : 124828 : return access_size;
3960 : : }
3961 : :
3962 : : /* Get the minimum alignment for all the scalar accesses that DR_INFO
3963 : : describes. */
3964 : :
3965 : : static unsigned int
3966 : 124828 : vect_vfa_align (dr_vec_info *dr_info)
3967 : : {
3968 : 0 : return dr_alignment (dr_info->dr);
3969 : : }
3970 : :
/* Function vect_compile_time_alias.

   Given data references A and B with equal base and offset, see whether
   the alias relation can be decided at compilation time.  Return 1 if
   it can and the references alias, 0 if it can and the references do
   not alias, and -1 if we cannot decide at compile time.  SEGMENT_LENGTH_A,
   SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
   of dr_with_seg_len::{seg_len,access_size} for A and B.  */

static int
vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
			 tree segment_length_a, tree segment_length_b,
			 unsigned HOST_WIDE_INT access_size_a,
			 unsigned HOST_WIDE_INT access_size_b)
{
  /* Start each range at the data-ref's constant starting offset.  */
  poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
  poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
  poly_uint64 const_length_a;
  poly_uint64 const_length_b;

  /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
     bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
     [a, a+12) */
  if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
    {
      /* Negative step: the segment length is negative, so negate it and
	 move the range start back by that amount.  */
      const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
      offset_a -= const_length_a;
    }
  else
    const_length_a = tree_to_poly_uint64 (segment_length_a);
  if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
    {
      const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
      offset_b -= const_length_b;
    }
  else
    const_length_b = tree_to_poly_uint64 (segment_length_b);

  /* The segment length only describes the "distance travelled"; add the
     access size to get the total number of bytes covered.  */
  const_length_a += access_size_a;
  const_length_b += access_size_b;

  /* Known overlap: definite alias.  */
  if (ranges_known_overlap_p (offset_a, const_length_a,
			      offset_b, const_length_b))
    return 1;

  /* Provably disjoint: definitely no alias.  */
  if (!ranges_maybe_overlap_p (offset_a, const_length_a,
			       offset_b, const_length_b))
    return 0;

  /* The poly-int ranges are not comparable at compile time.  */
  return -1;
}
4022 : :
4023 : : /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
4024 : : in DDR is >= VF. */
4025 : :
4026 : : static bool
4027 : 72803 : dependence_distance_ge_vf (data_dependence_relation *ddr,
4028 : : unsigned int loop_depth, poly_uint64 vf)
4029 : : {
4030 : 72803 : if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
4031 : 77758 : || DDR_NUM_DIST_VECTS (ddr) == 0)
4032 : : return false;
4033 : :
4034 : : /* If the dependence is exact, we should have limited the VF instead. */
4035 : 4986 : gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
4036 : :
4037 : : unsigned int i;
4038 : : lambda_vector dist_v;
4039 : 9999 : FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
4040 : : {
4041 : 9968 : HOST_WIDE_INT dist = dist_v[loop_depth];
4042 : 9968 : if (dist != 0
4043 : 4986 : && !(dist > 0 && DDR_REVERSED_P (ddr))
4044 : 14954 : && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
4045 : : return false;
4046 : : }
4047 : :
4048 : 31 : if (dump_enabled_p ())
4049 : 2 : dump_printf_loc (MSG_NOTE, vect_location,
4050 : : "dependence distance between %T and %T is >= VF\n",
4051 : 2 : DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
4052 : :
4053 : : return true;
4054 : : }
4055 : :
4056 : : /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
4057 : :
4058 : : static void
4059 : 461 : dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
4060 : : {
4061 : 461 : dump_printf (dump_kind, "%s (%T) >= ",
4062 : 461 : lower_bound.unsigned_p ? "unsigned" : "abs",
4063 : 461 : lower_bound.expr);
4064 : 461 : dump_dec (dump_kind, lower_bound.min_value);
4065 : 461 : }
4066 : :
4067 : : /* Record that the vectorized loop requires the vec_lower_bound described
4068 : : by EXPR, UNSIGNED_P and MIN_VALUE. */
4069 : :
4070 : : static void
4071 : 6595 : vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
4072 : : poly_uint64 min_value)
4073 : : {
4074 : 6595 : vec<vec_lower_bound> &lower_bounds
4075 : : = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
4076 : 7567 : for (unsigned int i = 0; i < lower_bounds.length (); ++i)
4077 : 5827 : if (operand_equal_p (lower_bounds[i].expr, expr, 0))
4078 : : {
4079 : 4855 : unsigned_p &= lower_bounds[i].unsigned_p;
4080 : 4855 : min_value = upper_bound (lower_bounds[i].min_value, min_value);
4081 : 4855 : if (lower_bounds[i].unsigned_p != unsigned_p
4082 : 4855 : || maybe_lt (lower_bounds[i].min_value, min_value))
4083 : : {
4084 : 780 : lower_bounds[i].unsigned_p = unsigned_p;
4085 : 780 : lower_bounds[i].min_value = min_value;
4086 : 780 : if (dump_enabled_p ())
4087 : : {
4088 : 250 : dump_printf_loc (MSG_NOTE, vect_location,
4089 : : "updating run-time check to ");
4090 : 250 : dump_lower_bound (MSG_NOTE, lower_bounds[i]);
4091 : 250 : dump_printf (MSG_NOTE, "\n");
4092 : : }
4093 : : }
4094 : 4855 : return;
4095 : : }
4096 : :
4097 : 1740 : vec_lower_bound lower_bound (expr, unsigned_p, min_value);
4098 : 1740 : if (dump_enabled_p ())
4099 : : {
4100 : 211 : dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
4101 : 211 : dump_lower_bound (MSG_NOTE, lower_bound);
4102 : 211 : dump_printf (MSG_NOTE, "\n");
4103 : : }
4104 : 1740 : LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
4105 : : }
4106 : :
4107 : : /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
4108 : : will span fewer than GAP bytes. */
4109 : :
4110 : : static bool
4111 : 5265 : vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
4112 : : poly_int64 gap)
4113 : : {
4114 : 5265 : stmt_vec_info stmt_info = dr_info->stmt;
4115 : 5265 : HOST_WIDE_INT count
4116 : 5265 : = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
4117 : 5265 : if (DR_GROUP_FIRST_ELEMENT (stmt_info))
4118 : 4505 : count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
4119 : 5265 : return (estimated_poly_value (gap)
4120 : 5265 : <= count * vect_get_scalar_dr_size (dr_info));
4121 : : }
4122 : :
4123 : : /* Return true if we know that there is no alias between DR_INFO_A and
4124 : : DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
4125 : : When returning true, set *LOWER_BOUND_OUT to this N. */
4126 : :
4127 : : static bool
4128 : 18461 : vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
4129 : : poly_uint64 *lower_bound_out)
4130 : : {
4131 : : /* Check that there is a constant gap of known sign between DR_A
4132 : : and DR_B. */
4133 : 18461 : data_reference *dr_a = dr_info_a->dr;
4134 : 18461 : data_reference *dr_b = dr_info_b->dr;
4135 : 18461 : poly_int64 init_a, init_b;
4136 : 18461 : if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
4137 : 8061 : || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
4138 : 7363 : || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
4139 : 7353 : || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
4140 : 7353 : || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
4141 : 18461 : || !ordered_p (init_a, init_b))
4142 : 11108 : return false;
4143 : :
4144 : : /* Sort DR_A and DR_B by the address they access. */
4145 : 7353 : if (maybe_lt (init_b, init_a))
4146 : : {
4147 : 116 : std::swap (init_a, init_b);
4148 : 116 : std::swap (dr_info_a, dr_info_b);
4149 : 116 : std::swap (dr_a, dr_b);
4150 : : }
4151 : :
4152 : : /* If the two accesses could be dependent within a scalar iteration,
4153 : : make sure that we'd retain their order. */
4154 : 7353 : if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
4155 : 7353 : && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
4156 : : return false;
4157 : :
4158 : : /* There is no alias if abs (DR_STEP) is greater than or equal to
4159 : : the bytes spanned by the combination of the two accesses. */
4160 : 7353 : *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
4161 : 7353 : return true;
4162 : : }
4163 : :
/* Function vect_prune_runtime_alias_test_list.

   Prune a list of ddrs to be tested at run-time by versioning for alias.
   Merge several alias checks into one if possible.
   Return FALSE if resulting list of ddrs is longer then allowed by
   PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */

opt_result
vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
{
  typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
  hash_set <tree_pair_hash> compared_objects;

  const vec<ddr_p> &may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
  vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
    = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
  const vec<vec_object_pair> &check_unequal_addrs
    = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
  poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);

  ddr_p ddr;
  unsigned int i;
  tree length_factor;

  DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");

  /* Step values are irrelevant for aliasing if the number of vector
     iterations is equal to the number of scalar iterations (which can
     happen for fully-SLP loops).  */
  bool vf_one_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);

  if (!vf_one_p)
    {
      /* Convert the checks for nonzero steps into bound tests.  */
      tree value;
      FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
	vect_check_lower_bound (loop_vinfo, value, true, 1);
    }

  /* Nothing to check: succeed immediately.  */
  if (may_alias_ddrs.is_empty ())
    return opt_result::success ();

  comp_alias_ddrs.create (may_alias_ddrs.length ());

  unsigned int loop_depth
    = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
			  LOOP_VINFO_LOOP_NEST (loop_vinfo));

  /* First, we collect all data ref pairs for aliasing checks.  */
  FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
    {
      poly_uint64 lower_bound;
      tree segment_length_a, segment_length_b;
      unsigned HOST_WIDE_INT access_size_a, access_size_b;
      unsigned HOST_WIDE_INT align_a, align_b;

      /* Ignore the alias if the VF we chose ended up being no greater
	 than the dependence distance.  */
      if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
	continue;

      /* Pairs of distinct objects are handled by a cheaper address
	 inequality check instead of a segment-overlap check.  */
      if (DDR_OBJECT_A (ddr))
	{
	  vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
	  if (!compared_objects.add (new_pair))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "checking that %T and %T"
				 " have different addresses\n",
				 new_pair.first, new_pair.second);
	      LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
	    }
	  continue;
	}

      dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
      stmt_vec_info stmt_info_a = dr_info_a->stmt;

      dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
      stmt_vec_info stmt_info_b = dr_info_b->stmt;

      bool preserves_scalar_order_p
	= vect_preserves_scalar_order_p (dr_info_a, dr_info_b);
      bool ignore_step_p
	= (vf_one_p
	   && (preserves_scalar_order_p
	       || operand_equal_p (DR_STEP (dr_info_a->dr),
				   DR_STEP (dr_info_b->dr))));

      /* Skip the pair if inter-iteration dependencies are irrelevant
	 and intra-iteration dependencies are guaranteed to be honored.  */
      if (ignore_step_p
	  && (preserves_scalar_order_p
	      || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
						 &lower_bound)))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "no need for alias check between "
			     "%T and %T when VF is 1\n",
			     DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
	  continue;
	}

      /* See whether we can handle the alias using a bounds check on
	 the step, and whether that's likely to be the best approach.
	 (It might not be, for example, if the minimum step is much larger
	 than the number of bytes handled by one vector iteration.)  */
      if (!ignore_step_p
	  && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
	  && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
					     &lower_bound)
	  && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
	      || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
	{
	  bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
			       "%T and %T when the step %T is outside ",
			       DR_REF (dr_info_a->dr),
			       DR_REF (dr_info_b->dr),
			       DR_STEP (dr_info_a->dr));
	      if (unsigned_p)
		dump_printf (MSG_NOTE, "[0");
	      else
		{
		  dump_printf (MSG_NOTE, "(");
		  dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
		}
	      dump_printf (MSG_NOTE, ", ");
	      dump_dec (MSG_NOTE, lower_bound);
	      dump_printf (MSG_NOTE, ")\n");
	    }
	  vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
				  unsigned_p, lower_bound);
	  continue;
	}

      /* For grouped accesses, base the check on the group leader.  */
      stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
      if (dr_group_first_a)
	{
	  stmt_info_a = dr_group_first_a;
	  dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
	}

      stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
      if (dr_group_first_b)
	{
	  stmt_info_b = dr_group_first_b;
	  dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
	}

      if (ignore_step_p)
	{
	  segment_length_a = size_zero_node;
	  segment_length_b = size_zero_node;
	}
      else
	{
	  /* With differing steps the segments must cover the whole
	     scalar iteration count, which therefore has to be known.  */
	  if (!operand_equal_p (DR_STEP (dr_info_a->dr),
				DR_STEP (dr_info_b->dr), 0))
	    {
	      length_factor = scalar_loop_iters;
	      if (TREE_CODE (length_factor) == SCEV_NOT_KNOWN)
		return opt_result::failure_at (vect_location,
					       "Unsupported alias check on"
					       " uncounted loop\n");
	    }
	  else
	    length_factor = size_int (vect_factor);
	  segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
	  segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
	}
      access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a);
      access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b);
      align_a = vect_vfa_align (dr_info_a);
      align_b = vect_vfa_align (dr_info_b);

      /* See whether the alias is known at compilation time.  */
      if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr),
			   DR_BASE_ADDRESS (dr_info_b->dr), 0)
	  && operand_equal_p (DR_OFFSET (dr_info_a->dr),
			      DR_OFFSET (dr_info_b->dr), 0)
	  && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
	  && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
	  && poly_int_tree_p (segment_length_a)
	  && poly_int_tree_p (segment_length_b))
	{
	  int res = vect_compile_time_alias (dr_info_a, dr_info_b,
					     segment_length_a,
					     segment_length_b,
					     access_size_a,
					     access_size_b);
	  if (res >= 0 && dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "can tell at compile time that %T and %T",
			       DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
	      if (res == 0)
		dump_printf (MSG_NOTE, " do not alias\n");
	      else
		dump_printf (MSG_NOTE, " alias\n");
	    }

	  /* Provably disjoint: no run-time check needed for this pair.  */
	  if (res == 0)
	    continue;

	  /* Provably aliasing: the loop cannot be vectorized this way.  */
	  if (res == 1)
	    return opt_result::failure_at (stmt_info_b->stmt,
					   "not vectorized:"
					   " compilation time alias: %G%G",
					   stmt_info_a->stmt,
					   stmt_info_b->stmt);
	}

      /* dr_with_seg_len requires the alignment to apply to the segment length
	 and access size, not just the start address.  The access size can be
	 smaller than the pointer alignment for grouped accesses and bitfield
	 references; see PR115192 and PR116125 respectively.  */
      align_a = std::min (align_a, least_bit_hwi (access_size_a));
      align_b = std::min (align_b, least_bit_hwi (access_size_b));

      dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
			    access_size_a, align_a);
      dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
			    access_size_b, align_b);
      /* Canonicalize the order to be the one that's needed for accurate
	 RAW, WAR and WAW flags, in cases where the data references are
	 well-ordered.  The order doesn't really matter otherwise,
	 but we might as well be consistent.  */
      if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a)
	std::swap (dr_a, dr_b);

      dr_with_seg_len_pair_t dr_with_seg_len_pair
	(dr_a, dr_b, (preserves_scalar_order_p
		      ? dr_with_seg_len_pair_t::WELL_ORDERED
		      : dr_with_seg_len_pair_t::REORDERED));

      comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
    }

  /* Merge and minimize the collected segment checks.  */
  prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);

  unsigned int count = (comp_alias_ddrs.length ()
			+ check_unequal_addrs.length ());

  /* The very-cheap cost model refuses any run-time versioning.  */
  if (count
      && (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo))
	  == VECT_COST_MODEL_VERY_CHEAP))
    return opt_result::failure_at
      (vect_location, "would need a runtime alias check\n");

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "improved number of alias checks from %d to %d\n",
		     may_alias_ddrs.length (), count);
  /* The cheap cost model tolerates only 60% of the usual limit.  */
  unsigned limit = param_vect_max_version_for_alias_checks;
  if (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo)) == VECT_COST_MODEL_CHEAP)
    limit = param_vect_max_version_for_alias_checks * 6 / 10;
  if (count > limit)
    return opt_result::failure_at
      (vect_location,
       "number of versioning for alias run-time tests exceeds %d "
       "(--param vect-max-version-for-alias-checks)\n", limit);

  return opt_result::success ();
}
4434 : :
/* Structure to hold information about a supported gather/scatter
   configuration.  */
struct gather_scatter_config
{
  /* The supported internal function (e.g. a gather-load or
     scatter-store IFN).  */
  internal_fn ifn;
  /* The vector type to use for the offset operand.  */
  tree offset_vectype;
  /* The scale that IFN supports for this offset type; may be smaller
     than the originally requested scale.  */
  int scale;
  /* The else values the target supports for this configuration, as
     filled in by internal_gather_scatter_fn_supported_p.  */
  vec<int> elsvals;
};
4444 : :
4445 : : /* Determine which gather/scatter IFN is supported for the given parameters.
4446 : : IFN_MASK_GATHER_LOAD, IFN_GATHER_LOAD, and IFN_MASK_LEN_GATHER_LOAD
4447 : : are mutually exclusive, so we only need to find one. Return the
4448 : : supported IFN or IFN_LAST if none are supported. */
4449 : :
4450 : : static internal_fn
4451 : 1015492 : vect_gather_scatter_which_ifn (bool read_p, bool masked_p,
4452 : : tree vectype, tree memory_type,
4453 : : tree offset_vectype, int scale,
4454 : : vec<int> *elsvals)
4455 : : {
4456 : : /* Work out which functions to try. */
4457 : 1015492 : internal_fn ifn, alt_ifn, alt_ifn2;
4458 : 1015492 : if (read_p)
4459 : : {
4460 : 702956 : ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
4461 : : alt_ifn = IFN_MASK_GATHER_LOAD;
4462 : : alt_ifn2 = IFN_MASK_LEN_GATHER_LOAD;
4463 : : }
4464 : : else
4465 : : {
4466 : 312536 : ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
4467 : : alt_ifn = IFN_MASK_SCATTER_STORE;
4468 : : alt_ifn2 = IFN_MASK_LEN_SCATTER_STORE;
4469 : : }
4470 : :
4471 : 1015492 : if (!offset_vectype)
4472 : : return IFN_LAST;
4473 : :
4474 : 1015492 : if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
4475 : : offset_vectype, scale, elsvals))
4476 : : return ifn;
4477 : 1015492 : if (internal_gather_scatter_fn_supported_p (alt_ifn, vectype, memory_type,
4478 : : offset_vectype, scale, elsvals))
4479 : : return alt_ifn;
4480 : 1015492 : if (internal_gather_scatter_fn_supported_p (alt_ifn2, vectype, memory_type,
4481 : : offset_vectype, scale, elsvals))
4482 : : return alt_ifn2;
4483 : :
4484 : : return IFN_LAST;
4485 : : }
4486 : :
4487 : : /* Collect all supported offset vector types for a gather load or scatter
4488 : : store. READ_P is true for loads and false for stores. MASKED_P is true
4489 : : if the load or store is conditional. VECTYPE is the data vector type.
4490 : : MEMORY_TYPE is the type of the memory elements being loaded or stored,
4491 : : and OFFSET_TYPE is the type of the offset.
4492 : : SCALE is the amount by which the offset should be multiplied.
4493 : :
4494 : : Return a vector of all configurations the target supports (which can
4495 : : be none). */
4496 : :
4497 : : static auto_vec<gather_scatter_config>
4498 : 76331 : vect_gather_scatter_get_configs (vec_info *vinfo, bool read_p, bool masked_p,
4499 : : tree vectype, tree memory_type,
4500 : : tree offset_type, int scale)
4501 : : {
4502 : 76331 : auto_vec<gather_scatter_config> configs;
4503 : :
4504 : 76331 : auto_vec<tree, 8> offset_types_to_try;
4505 : :
4506 : : /* Try all sizes from the offset type's precision up to POINTER_SIZE. */
4507 : 76331 : for (unsigned int bits = TYPE_PRECISION (offset_type);
4508 : 358489 : bits <= POINTER_SIZE;
4509 : 268466 : bits *= 2)
4510 : : {
4511 : : /* Signed variant. */
4512 : 268466 : offset_types_to_try.safe_push
4513 : 268466 : (build_nonstandard_integer_type (bits, 0));
4514 : : /* Unsigned variant. */
4515 : 268466 : offset_types_to_try.safe_push
4516 : 268466 : (build_nonstandard_integer_type (bits, 1));
4517 : : }
4518 : :
4519 : : /* Once we find which IFN works for one offset type, we know that it
4520 : : will work for other offset types as well. Then we can perform
4521 : : the checks for the remaining offset types with only that IFN.
4522 : : However, we might need to try different offset types to find which
4523 : : IFN is supported, since the check is offset-type-specific. */
4524 : : internal_fn ifn = IFN_LAST;
4525 : :
4526 : : /* Try each offset type. */
4527 : 613263 : for (unsigned int i = 0; i < offset_types_to_try.length (); i++)
4528 : : {
4529 : 536932 : tree offset_type = offset_types_to_try[i];
4530 : 536932 : tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
4531 : 536932 : if (!offset_vectype)
4532 : 9498 : continue;
4533 : :
4534 : : /* Try multiple scale values. Start with exact match, then try
4535 : : smaller common scales that a target might support . */
4536 : 527434 : int scales_to_try[] = {scale, 1, 2, 4, 8};
4537 : :
4538 : 3164604 : for (unsigned int j = 0;
4539 : 3164604 : j < sizeof (scales_to_try) / sizeof (*scales_to_try);
4540 : : j++)
4541 : : {
4542 : 2637170 : int try_scale = scales_to_try[j];
4543 : :
4544 : : /* Skip scales >= requested scale (except for exact match). */
4545 : 2637170 : if (j > 0 && try_scale >= scale)
4546 : 1621678 : continue;
4547 : :
4548 : : /* Skip if requested scale is not a multiple of this scale. */
4549 : 1015636 : if (j > 0 && scale % try_scale != 0)
4550 : 144 : continue;
4551 : :
4552 : 1015492 : vec<int> elsvals = vNULL;
4553 : :
4554 : : /* If we haven't determined which IFN is supported yet, try all three
4555 : : to find which one the target supports. */
4556 : 1015492 : if (ifn == IFN_LAST)
4557 : : {
4558 : 1015492 : ifn = vect_gather_scatter_which_ifn (read_p, masked_p,
4559 : : vectype, memory_type,
4560 : : offset_vectype, try_scale,
4561 : : &elsvals);
4562 : 1015492 : if (ifn != IFN_LAST)
4563 : : {
4564 : : /* Found which IFN is supported. Save this configuration. */
4565 : 0 : gather_scatter_config config;
4566 : 0 : config.ifn = ifn;
4567 : 0 : config.offset_vectype = offset_vectype;
4568 : 0 : config.scale = try_scale;
4569 : 0 : config.elsvals = elsvals;
4570 : 0 : configs.safe_push (config);
4571 : : }
4572 : : }
4573 : : else
4574 : : {
4575 : : /* We already know which IFN is supported, just check if this
4576 : : offset type and scale work with it. */
4577 : 0 : if (internal_gather_scatter_fn_supported_p (ifn, vectype,
4578 : : memory_type,
4579 : : offset_vectype,
4580 : : try_scale,
4581 : : &elsvals))
4582 : : {
4583 : 0 : gather_scatter_config config;
4584 : 0 : config.ifn = ifn;
4585 : 0 : config.offset_vectype = offset_vectype;
4586 : 0 : config.scale = try_scale;
4587 : 0 : config.elsvals = elsvals;
4588 : 0 : configs.safe_push (config);
4589 : : }
4590 : : }
4591 : : }
4592 : : }
4593 : :
4594 : 76331 : return configs;
4595 : 76331 : }
4596 : :
4597 : : /* Check whether we can use an internal function for a gather load
4598 : : or scatter store. READ_P is true for loads and false for stores.
4599 : : MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
4600 : : the type of the memory elements being loaded or stored. OFFSET_TYPE
4601 : : is the type of the offset that is being applied to the invariant
4602 : : base address. If OFFSET_TYPE is scalar the function chooses an
4603 : : appropriate vector type for it. SCALE is the amount by which the
4604 : : offset should be multiplied *after* it has been converted to address width.
4605 : : If the target does not support the requested SCALE, SUPPORTED_SCALE
4606 : : will contain the scale that is actually supported
4607 : : (which may be smaller, requiring additional multiplication).
4608 : : Otherwise SUPPORTED_SCALE is 0.
4609 : :
4610 : : Return true if the function is supported, storing the function id in
4611 : : *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT.
4612 : : If we support an offset vector type with different signedness than
4613 : : OFFSET_TYPE store it in SUPPORTED_OFFSET_VECTYPE.
4614 : :
4615 : : If we can use gather/scatter and ELSVALS is nonzero, store the possible
4616 : : else values in ELSVALS. */
4617 : :
4618 : : bool
4619 : 76331 : vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
4620 : : tree vectype, tree memory_type, tree offset_type,
4621 : : int scale, int *supported_scale,
4622 : : internal_fn *ifn_out,
4623 : : tree *offset_vectype_out,
4624 : : tree *supported_offset_vectype,
4625 : : vec<int> *elsvals)
4626 : : {
4627 : 76331 : *supported_offset_vectype = NULL_TREE;
4628 : 76331 : *supported_scale = 0;
4629 : 76331 : unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
4630 : 76331 : unsigned int element_bits = vector_element_bits (vectype);
4631 : 76331 : if (element_bits != memory_bits)
4632 : : /* For now the vector elements must be the same width as the
4633 : : memory elements. */
4634 : : return false;
4635 : :
4636 : : /* Get the original offset vector type for comparison. */
4637 : 76331 : tree offset_vectype = VECTOR_TYPE_P (offset_type)
4638 : 76331 : ? offset_type : get_vectype_for_scalar_type (vinfo, offset_type);
4639 : :
4640 : : /* If there is no offset vectype, bail. */
4641 : 63446 : if (!offset_vectype)
4642 : : return false;
4643 : :
4644 : 76331 : offset_type = TREE_TYPE (offset_vectype);
4645 : :
4646 : : /* Get all supported configurations for this data vector type. */
4647 : 76331 : auto_vec<gather_scatter_config> configs
4648 : : = vect_gather_scatter_get_configs (vinfo, read_p, masked_p, vectype,
4649 : 76331 : memory_type, offset_type, scale);
4650 : :
4651 : 76331 : if (configs.is_empty ())
4652 : : return false;
4653 : :
4654 : : /* Selection priority:
4655 : : 1 - Exact scale match + offset type match
4656 : : 2 - Exact scale match + sign-swapped offset
4657 : : 3 - Smaller scale + offset type match
4658 : : 4 - Smaller scale + sign-swapped offset
4659 : : Within each category, prefer smaller offset types. */
4660 : :
4661 : : /* First pass: exact scale match with no conversion. */
4662 : 0 : for (unsigned int i = 0; i < configs.length (); i++)
4663 : : {
4664 : 0 : if (configs[i].scale == scale
4665 : 0 : && TYPE_SIGN (configs[i].offset_vectype)
4666 : 0 : == TYPE_SIGN (offset_vectype))
4667 : : {
4668 : 0 : *ifn_out = configs[i].ifn;
4669 : 0 : *offset_vectype_out = configs[i].offset_vectype;
4670 : 0 : if (elsvals)
4671 : 0 : *elsvals = configs[i].elsvals;
4672 : 0 : return true;
4673 : : }
4674 : : }
4675 : :
4676 : : /* No direct match. This means we try to find either
4677 : : - a sign-swapped offset vectype or
4678 : : - a different scale and 2x larger offset type
4679 : : - a different scale and larger sign-swapped offset vectype. */
4680 : 0 : unsigned int offset_precision = TYPE_PRECISION (TREE_TYPE (offset_vectype));
4681 : 0 : unsigned int needed_precision
4682 : 0 : = TYPE_UNSIGNED (offset_vectype) ? offset_precision * 2 : POINTER_SIZE;
4683 : 0 : needed_precision = std::min (needed_precision, (unsigned) POINTER_SIZE);
4684 : :
4685 : : /* Second pass: No direct match. This means we try to find a sign-swapped
4686 : : offset vectype. */
4687 : 0 : enum tree_code tmp;
4688 : 0 : for (unsigned int i = 0; i < configs.length (); i++)
4689 : : {
4690 : 0 : unsigned int precision
4691 : 0 : = TYPE_PRECISION (TREE_TYPE (configs[i].offset_vectype));
4692 : 0 : if (configs[i].scale == scale
4693 : 0 : && precision >= needed_precision
4694 : 0 : && (supportable_convert_operation (CONVERT_EXPR,
4695 : 0 : configs[i].offset_vectype,
4696 : : offset_vectype, &tmp)
4697 : 0 : || (needed_precision == offset_precision
4698 : 0 : && tree_nop_conversion_p (configs[i].offset_vectype,
4699 : : offset_vectype))))
4700 : : {
4701 : 0 : *ifn_out = configs[i].ifn;
4702 : 0 : *offset_vectype_out = offset_vectype;
4703 : 0 : *supported_offset_vectype = configs[i].offset_vectype;
4704 : 0 : if (elsvals)
4705 : 0 : *elsvals = configs[i].elsvals;
4706 : 0 : return true;
4707 : : }
4708 : : }
4709 : :
4710 : : /* Third pass: Try a smaller scale with the same signedness. */
4711 : 0 : needed_precision = offset_precision * 2;
4712 : 0 : needed_precision = std::min (needed_precision, (unsigned) POINTER_SIZE);
4713 : :
4714 : 0 : for (unsigned int i = 0; i < configs.length (); i++)
4715 : : {
4716 : 0 : unsigned int precision
4717 : 0 : = TYPE_PRECISION (TREE_TYPE (configs[i].offset_vectype));
4718 : 0 : if (configs[i].scale < scale
4719 : 0 : && TYPE_SIGN (configs[i].offset_vectype)
4720 : 0 : == TYPE_SIGN (offset_vectype)
4721 : 0 : && precision >= needed_precision)
4722 : : {
4723 : 0 : *ifn_out = configs[i].ifn;
4724 : 0 : *offset_vectype_out = configs[i].offset_vectype;
4725 : 0 : *supported_scale = configs[i].scale;
4726 : 0 : if (elsvals)
4727 : 0 : *elsvals = configs[i].elsvals;
4728 : 0 : return true;
4729 : : }
4730 : : }
4731 : :
4732 : : /* Fourth pass: Try a smaller scale and sign-swapped offset vectype. */
4733 : 0 : needed_precision
4734 : 0 : = TYPE_UNSIGNED (offset_vectype) ? offset_precision * 2 : POINTER_SIZE;
4735 : 0 : needed_precision = std::min (needed_precision, (unsigned) POINTER_SIZE);
4736 : :
4737 : 0 : for (unsigned int i = 0; i < configs.length (); i++)
4738 : : {
4739 : 0 : unsigned int precision
4740 : 0 : = TYPE_PRECISION (TREE_TYPE (configs[i].offset_vectype));
4741 : 0 : if (configs[i].scale < scale
4742 : 0 : && precision >= needed_precision
4743 : 0 : && (supportable_convert_operation (CONVERT_EXPR,
4744 : 0 : configs[i].offset_vectype,
4745 : : offset_vectype, &tmp)
4746 : 0 : || (needed_precision == offset_precision
4747 : 0 : && tree_nop_conversion_p (configs[i].offset_vectype,
4748 : : offset_vectype))))
4749 : : {
4750 : 0 : *ifn_out = configs[i].ifn;
4751 : 0 : *offset_vectype_out = offset_vectype;
4752 : 0 : *supported_offset_vectype = configs[i].offset_vectype;
4753 : 0 : *supported_scale = configs[i].scale;
4754 : 0 : if (elsvals)
4755 : 0 : *elsvals = configs[i].elsvals;
4756 : 0 : return true;
4757 : : }
4758 : : }
4759 : :
4760 : : return false;
4761 : 76331 : }
4762 : :
4763 : : /* STMT_INFO is a call to an internal gather load or scatter store function.
4764 : : Describe the operation in INFO. */
4765 : :
4766 : : void
4767 : 0 : vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
4768 : : gather_scatter_info *info)
4769 : : {
4770 : 0 : gcall *call = as_a <gcall *> (stmt_info->stmt);
4771 : 0 : tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4772 : 0 : data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4773 : :
4774 : 0 : info->ifn = gimple_call_internal_fn (call);
4775 : 0 : info->decl = NULL_TREE;
4776 : 0 : info->base = gimple_call_arg (call, 0);
4777 : 0 : info->alias_ptr = gimple_call_arg
4778 : 0 : (call, internal_fn_alias_ptr_index (info->ifn));
4779 : 0 : info->offset = gimple_call_arg
4780 : 0 : (call, internal_fn_offset_index (info->ifn));
4781 : 0 : info->offset_vectype = NULL_TREE;
4782 : 0 : info->scale = TREE_INT_CST_LOW (gimple_call_arg
4783 : : (call, internal_fn_scale_index (info->ifn)));
4784 : 0 : info->element_type = TREE_TYPE (vectype);
4785 : 0 : info->memory_type = TREE_TYPE (DR_REF (dr));
4786 : 0 : }
4787 : :
4788 : : /* Return true if a non-affine read or write in STMT_INFO is suitable for a
4789 : : gather load or scatter store with VECTYPE. Describe the operation in *INFO
4790 : : if so. If it is suitable and ELSVALS is nonzero store the supported else
4791 : : values in the vector it points to. */
4792 : :
4793 : : bool
4794 : 364108 : vect_check_gather_scatter (stmt_vec_info stmt_info, tree vectype,
4795 : : loop_vec_info loop_vinfo,
4796 : : gather_scatter_info *info, vec<int> *elsvals)
4797 : : {
4798 : 364108 : HOST_WIDE_INT scale = 1;
4799 : 364108 : poly_int64 pbitpos, pbitsize;
4800 : 364108 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4801 : 364108 : struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4802 : 364108 : tree offtype = NULL_TREE;
4803 : 364108 : tree decl = NULL_TREE, base, off;
4804 : 364108 : tree memory_type = TREE_TYPE (DR_REF (dr));
4805 : 364108 : machine_mode pmode;
4806 : 364108 : int punsignedp, reversep, pvolatilep = 0;
4807 : 364108 : internal_fn ifn;
4808 : 364108 : tree offset_vectype;
4809 : 364108 : bool masked_p = false;
4810 : :
4811 : : /* See whether this is already a call to a gather/scatter internal function.
4812 : : If not, see whether it's a masked load or store. */
4813 : 364108 : gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
4814 : 5663 : if (call && gimple_call_internal_p (call))
4815 : : {
4816 : 5663 : ifn = gimple_call_internal_fn (call);
4817 : 5663 : if (internal_gather_scatter_fn_p (ifn))
4818 : : {
4819 : 0 : vect_describe_gather_scatter_call (stmt_info, info);
4820 : :
4821 : : /* In pattern recog we simply used a ZERO else value that
4822 : : we need to correct here. To that end just re-use the
4823 : : (already succesful) check if we support a gather IFN
4824 : : and have it populate the else values. */
4825 : 0 : if (DR_IS_READ (dr) && internal_fn_mask_index (ifn) >= 0 && elsvals)
4826 : 0 : supports_vec_gather_load_p (TYPE_MODE (vectype), elsvals);
4827 : 0 : return true;
4828 : : }
4829 : 5663 : masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
4830 : : }
4831 : :
4832 : : /* True if we should aim to use internal functions rather than
4833 : : built-in functions. */
4834 : 364108 : bool use_ifn_p = (DR_IS_READ (dr)
4835 : 364108 : ? supports_vec_gather_load_p (TYPE_MODE (vectype),
4836 : : elsvals)
4837 : 364108 : : supports_vec_scatter_store_p (TYPE_MODE (vectype)));
4838 : :
4839 : 364108 : base = DR_REF (dr);
4840 : : /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
4841 : : see if we can use the def stmt of the address. */
4842 : 364108 : if (masked_p
4843 : 5663 : && TREE_CODE (base) == MEM_REF
4844 : 5663 : && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
4845 : 5663 : && integer_zerop (TREE_OPERAND (base, 1))
4846 : 369771 : && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
4847 : : {
4848 : 5663 : gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
4849 : 5663 : if (is_gimple_assign (def_stmt)
4850 : 5663 : && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
4851 : 556 : base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
4852 : : }
4853 : :
4854 : : /* The gather and scatter builtins need address of the form
4855 : : loop_invariant + vector * {1, 2, 4, 8}
4856 : : or
4857 : : loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
4858 : : Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
4859 : : of loop invariants/SSA_NAMEs defined in the loop, with casts,
4860 : : multiplications and additions in it. To get a vector, we need
4861 : : a single SSA_NAME that will be defined in the loop and will
4862 : : contain everything that is not loop invariant and that can be
4863 : : vectorized. The following code attempts to find such a preexistng
4864 : : SSA_NAME OFF and put the loop invariants into a tree BASE
4865 : : that can be gimplified before the loop. */
4866 : 364108 : base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
4867 : : &punsignedp, &reversep, &pvolatilep);
4868 : 364108 : if (reversep)
4869 : : return false;
4870 : :
4871 : : /* PR 107346. Packed structs can have fields at offsets that are not
4872 : : multiples of BITS_PER_UNIT. Do not use gather/scatters in such cases. */
4873 : 364108 : if (!multiple_p (pbitpos, BITS_PER_UNIT))
4874 : : return false;
4875 : :
4876 : : /* We need to be able to form an address to the base which for example
4877 : : isn't possible for hard registers. */
4878 : 364108 : if (may_be_nonaddressable_p (base))
4879 : : return false;
4880 : :
4881 : 364100 : poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
4882 : :
4883 : 364100 : if (TREE_CODE (base) == MEM_REF)
4884 : : {
4885 : 292601 : if (!integer_zerop (TREE_OPERAND (base, 1)))
4886 : : {
4887 : 34014 : if (off == NULL_TREE)
4888 : 33697 : off = wide_int_to_tree (sizetype, mem_ref_offset (base));
4889 : : else
4890 : 317 : off = size_binop (PLUS_EXPR, off,
4891 : : fold_convert (sizetype, TREE_OPERAND (base, 1)));
4892 : : }
4893 : 292601 : base = TREE_OPERAND (base, 0);
4894 : : }
4895 : : else
4896 : 71499 : base = build_fold_addr_expr (base);
4897 : :
4898 : 364100 : if (off == NULL_TREE)
4899 : 238932 : off = size_zero_node;
4900 : :
4901 : : /* BASE must be loop invariant. If it is not invariant, but OFF is, then we
4902 : : * can fix that by swapping BASE and OFF. */
4903 : 364100 : if (!expr_invariant_in_loop_p (loop, base))
4904 : : {
4905 : 273176 : if (!expr_invariant_in_loop_p (loop, off))
4906 : : return false;
4907 : :
4908 : 272855 : std::swap (base, off);
4909 : : }
4910 : :
4911 : 363779 : base = fold_convert (sizetype, base);
4912 : 363779 : base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
4913 : 363779 : int tmp_scale;
4914 : 363779 : tree tmp_offset_vectype;
4915 : :
4916 : : /* OFF at this point may be either a SSA_NAME or some tree expression
4917 : : from get_inner_reference. Try to peel off loop invariants from it
4918 : : into BASE as long as possible. */
4919 : 363779 : STRIP_NOPS (off);
4920 : 939666 : while (offtype == NULL_TREE)
4921 : : {
4922 : 805666 : enum tree_code code;
4923 : 805666 : tree op0, op1, add = NULL_TREE;
4924 : :
4925 : 805666 : if (TREE_CODE (off) == SSA_NAME)
4926 : : {
4927 : 633048 : gimple *def_stmt = SSA_NAME_DEF_STMT (off);
4928 : :
4929 : 633048 : if (expr_invariant_in_loop_p (loop, off))
4930 : 0 : return false;
4931 : :
4932 : 633048 : if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
4933 : : break;
4934 : :
4935 : 505973 : op0 = gimple_assign_rhs1 (def_stmt);
4936 : 505973 : code = gimple_assign_rhs_code (def_stmt);
4937 : 505973 : op1 = gimple_assign_rhs2 (def_stmt);
4938 : : }
4939 : : else
4940 : : {
4941 : 172618 : if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
4942 : : return false;
4943 : 172618 : code = TREE_CODE (off);
4944 : 172618 : extract_ops_from_tree (off, &code, &op0, &op1);
4945 : : }
4946 : 678591 : switch (code)
4947 : : {
4948 : 210413 : case POINTER_PLUS_EXPR:
4949 : 210413 : case PLUS_EXPR:
4950 : 210413 : if (expr_invariant_in_loop_p (loop, op0))
4951 : : {
4952 : 146319 : add = op0;
4953 : 146319 : off = op1;
4954 : 194403 : do_add:
4955 : 194403 : add = fold_convert (sizetype, add);
4956 : 194403 : if (scale != 1)
4957 : 46425 : add = size_binop (MULT_EXPR, add, size_int (scale));
4958 : 194403 : base = size_binop (PLUS_EXPR, base, add);
4959 : 575887 : continue;
4960 : : }
4961 : 64094 : if (expr_invariant_in_loop_p (loop, op1))
4962 : : {
4963 : 47890 : add = op1;
4964 : 47890 : off = op0;
4965 : 47890 : goto do_add;
4966 : : }
4967 : : break;
4968 : 398 : case MINUS_EXPR:
4969 : 398 : if (expr_invariant_in_loop_p (loop, op1))
4970 : : {
4971 : 194 : add = fold_convert (sizetype, op1);
4972 : 194 : add = size_binop (MINUS_EXPR, size_zero_node, add);
4973 : 194 : off = op0;
4974 : 194 : goto do_add;
4975 : : }
4976 : : break;
4977 : 197550 : case MULT_EXPR:
4978 : 197550 : if (scale == 1 && tree_fits_shwi_p (op1))
4979 : : {
4980 : 163508 : int new_scale = tree_to_shwi (op1);
4981 : : /* Only treat this as a scaling operation if the target
4982 : : supports it for at least some offset type. */
4983 : 163508 : if (use_ifn_p
4984 : 0 : && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4985 : : masked_p, vectype, memory_type,
4986 : : signed_char_type_node,
4987 : : new_scale, &tmp_scale,
4988 : : &ifn,
4989 : : &offset_vectype,
4990 : : &tmp_offset_vectype,
4991 : : elsvals)
4992 : 163508 : && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4993 : : masked_p, vectype, memory_type,
4994 : : unsigned_char_type_node,
4995 : : new_scale, &tmp_scale,
4996 : : &ifn,
4997 : : &offset_vectype,
4998 : : &tmp_offset_vectype,
4999 : : elsvals))
5000 : : break;
5001 : 163508 : scale = new_scale;
5002 : 163508 : off = op0;
5003 : 163508 : continue;
5004 : 163508 : }
5005 : : break;
5006 : 0 : case SSA_NAME:
5007 : 0 : off = op0;
5008 : 0 : continue;
5009 : 224314 : CASE_CONVERT:
5010 : 448612 : if (!POINTER_TYPE_P (TREE_TYPE (op0))
5011 : 448612 : && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
5012 : : break;
5013 : :
5014 : : /* Don't include the conversion if the target is happy with
5015 : : the current offset type. */
5016 : 224314 : if (use_ifn_p
5017 : 0 : && TREE_CODE (off) == SSA_NAME
5018 : 0 : && !POINTER_TYPE_P (TREE_TYPE (off))
5019 : 224314 : && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
5020 : : masked_p, vectype, memory_type,
5021 : 0 : TREE_TYPE (off),
5022 : : scale, &tmp_scale,
5023 : : &ifn,
5024 : : &offset_vectype,
5025 : : &tmp_offset_vectype,
5026 : : elsvals))
5027 : : break;
5028 : :
5029 : 224314 : if (TYPE_PRECISION (TREE_TYPE (op0))
5030 : 224314 : == TYPE_PRECISION (TREE_TYPE (off)))
5031 : : {
5032 : 83976 : off = op0;
5033 : 83976 : continue;
5034 : : }
5035 : :
5036 : : /* Include the conversion if it is widening and we're using
5037 : : the IFN path or the target can handle the converted from
5038 : : offset or the current size is not already the same as the
5039 : : data vector element size. */
5040 : 140338 : if ((TYPE_PRECISION (TREE_TYPE (op0))
5041 : 140338 : < TYPE_PRECISION (TREE_TYPE (off)))
5042 : 140338 : && (use_ifn_p
5043 : 139111 : || (DR_IS_READ (dr)
5044 : 84141 : ? (targetm.vectorize.builtin_gather
5045 : 84141 : && targetm.vectorize.builtin_gather (vectype,
5046 : 84141 : TREE_TYPE (op0),
5047 : : scale))
5048 : 54970 : : (targetm.vectorize.builtin_scatter
5049 : 54970 : && targetm.vectorize.builtin_scatter (vectype,
5050 : 54970 : TREE_TYPE (op0),
5051 : : scale)))
5052 : 138423 : || !operand_equal_p (TYPE_SIZE (TREE_TYPE (off)),
5053 : 138423 : TYPE_SIZE (TREE_TYPE (vectype)), 0)))
5054 : : {
5055 : 134000 : off = op0;
5056 : 134000 : offtype = TREE_TYPE (off);
5057 : 134000 : STRIP_NOPS (off);
5058 : 134000 : continue;
5059 : : }
5060 : : break;
5061 : : default:
5062 : : break;
5063 : 0 : }
5064 : : break;
5065 : : }
5066 : :
5067 : : /* If at the end OFF still isn't a SSA_NAME or isn't
5068 : : defined in the loop, punt. */
5069 : 363779 : if (TREE_CODE (off) != SSA_NAME
5070 : 363779 : || expr_invariant_in_loop_p (loop, off))
5071 : 6827 : return false;
5072 : :
5073 : 356952 : if (offtype == NULL_TREE)
5074 : 223336 : offtype = TREE_TYPE (off);
5075 : :
5076 : 356952 : if (use_ifn_p)
5077 : : {
5078 : 0 : if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
5079 : : vectype, memory_type, offtype,
5080 : : scale, &tmp_scale,
5081 : : &ifn, &offset_vectype,
5082 : : &tmp_offset_vectype,
5083 : : elsvals))
5084 : 0 : ifn = IFN_LAST;
5085 : : decl = NULL_TREE;
5086 : : }
5087 : : else
5088 : : {
5089 : 356952 : if (DR_IS_READ (dr))
5090 : : {
5091 : 261287 : if (targetm.vectorize.builtin_gather)
5092 : 261287 : decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
5093 : : }
5094 : : else
5095 : : {
5096 : 95665 : if (targetm.vectorize.builtin_scatter)
5097 : 95665 : decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
5098 : : }
5099 : 356952 : ifn = IFN_LAST;
5100 : : /* The offset vector type will be read from DECL when needed. */
5101 : 356952 : offset_vectype = NULL_TREE;
5102 : : }
5103 : :
5104 : 356952 : gcc_checking_assert (expr_invariant_in_loop_p (loop, base));
5105 : 356952 : gcc_checking_assert (!expr_invariant_in_loop_p (loop, off));
5106 : :
5107 : 356952 : info->ifn = ifn;
5108 : 356952 : info->decl = decl;
5109 : 356952 : info->base = base;
5110 : :
5111 : 713904 : info->alias_ptr = build_int_cst
5112 : 356952 : (reference_alias_ptr_type (DR_REF (dr)),
5113 : 356952 : get_object_alignment (DR_REF (dr)));
5114 : :
5115 : 356952 : info->offset = off;
5116 : 356952 : info->offset_vectype = offset_vectype;
5117 : 356952 : info->scale = scale;
5118 : 356952 : info->element_type = TREE_TYPE (vectype);
5119 : 356952 : info->memory_type = memory_type;
5120 : 356952 : return true;
5121 : : }
5122 : :
5123 : : /* Find the data references in STMT, analyze them with respect to LOOP and
5124 : : append them to DATAREFS. Return false if datarefs in this stmt cannot
5125 : : be handled. */
5126 : :
5127 : : opt_result
5128 : 32306153 : vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
5129 : : vec<data_reference_p> *datarefs,
5130 : : vec<int> *dataref_groups, int group_id)
5131 : : {
5132 : : /* We can ignore clobbers for dataref analysis - they are removed during
5133 : : loop vectorization and BB vectorization checks dependences with a
5134 : : stmt walk. */
5135 : 32306153 : if (gimple_clobber_p (stmt))
5136 : 1093194 : return opt_result::success ();
5137 : :
5138 : 58035697 : if (gimple_has_volatile_ops (stmt))
5139 : 319797 : return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
5140 : : stmt);
5141 : :
5142 : 30893162 : if (stmt_can_throw_internal (cfun, stmt))
5143 : 697994 : return opt_result::failure_at (stmt,
5144 : : "not vectorized:"
5145 : : " statement can throw an exception: %G",
5146 : : stmt);
5147 : :
5148 : 30195168 : auto_vec<data_reference_p, 2> refs;
5149 : 30195168 : opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
5150 : 30195168 : if (!res)
5151 : 3660057 : return res;
5152 : :
5153 : 26535111 : if (refs.is_empty ())
5154 : 15266261 : return opt_result::success ();
5155 : :
5156 : 11268850 : if (refs.length () > 1)
5157 : : {
5158 : 1258447 : while (!refs.is_empty ())
5159 : 839267 : free_data_ref (refs.pop ());
5160 : 419180 : return opt_result::failure_at (stmt,
5161 : : "not vectorized: more than one "
5162 : : "data ref in stmt: %G", stmt);
5163 : : }
5164 : :
5165 : 10849670 : data_reference_p dr = refs.pop ();
5166 : 10849670 : if (gcall *call = dyn_cast <gcall *> (stmt))
5167 : 18352 : if (!gimple_call_internal_p (call)
5168 : 18352 : || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
5169 : 15274 : && gimple_call_internal_fn (call) != IFN_MASK_STORE))
5170 : : {
5171 : 14778 : free_data_ref (dr);
5172 : 14778 : return opt_result::failure_at (stmt,
5173 : : "not vectorized: dr in a call %G", stmt);
5174 : : }
5175 : :
5176 : 10834892 : if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
5177 : 10834892 : && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
5178 : : {
5179 : 64136 : free_data_ref (dr);
5180 : 64136 : return opt_result::failure_at (stmt,
5181 : : "not vectorized:"
5182 : : " statement is an unsupported"
5183 : : " bitfield access %G", stmt);
5184 : : }
5185 : :
5186 : 10770756 : if (DR_BASE_ADDRESS (dr)
5187 : 10678684 : && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
5188 : : {
5189 : 998 : free_data_ref (dr);
5190 : 998 : return opt_result::failure_at (stmt,
5191 : : "not vectorized:"
5192 : : " base addr of dr is a constant\n");
5193 : : }
5194 : :
5195 : : /* Check whether this may be a SIMD lane access and adjust the
5196 : : DR to make it easier for us to handle it. */
5197 : 10769758 : if (loop
5198 : 600662 : && loop->simduid
5199 : 10711 : && (!DR_BASE_ADDRESS (dr)
5200 : 2960 : || !DR_OFFSET (dr)
5201 : 2960 : || !DR_INIT (dr)
5202 : 2960 : || !DR_STEP (dr)))
5203 : : {
5204 : 7751 : struct data_reference *newdr
5205 : 7751 : = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
5206 : 7751 : DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
5207 : 7751 : if (DR_BASE_ADDRESS (newdr)
5208 : 7751 : && DR_OFFSET (newdr)
5209 : 7751 : && DR_INIT (newdr)
5210 : 7751 : && DR_STEP (newdr)
5211 : 7751 : && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
5212 : 15502 : && integer_zerop (DR_STEP (newdr)))
5213 : : {
5214 : 7751 : tree base_address = DR_BASE_ADDRESS (newdr);
5215 : 7751 : tree off = DR_OFFSET (newdr);
5216 : 7751 : tree step = ssize_int (1);
5217 : 7751 : if (integer_zerop (off)
5218 : 7751 : && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
5219 : : {
5220 : 82 : off = TREE_OPERAND (base_address, 1);
5221 : 82 : base_address = TREE_OPERAND (base_address, 0);
5222 : : }
5223 : 7751 : STRIP_NOPS (off);
5224 : 7751 : if (TREE_CODE (off) == MULT_EXPR
5225 : 7751 : && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
5226 : : {
5227 : 7500 : step = TREE_OPERAND (off, 1);
5228 : 7500 : off = TREE_OPERAND (off, 0);
5229 : 7500 : STRIP_NOPS (off);
5230 : : }
5231 : 541 : if (CONVERT_EXPR_P (off)
5232 : 7751 : && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
5233 : 7210 : < TYPE_PRECISION (TREE_TYPE (off))))
5234 : 7210 : off = TREE_OPERAND (off, 0);
5235 : 7751 : if (TREE_CODE (off) == SSA_NAME)
5236 : : {
5237 : 7226 : gimple *def = SSA_NAME_DEF_STMT (off);
5238 : : /* Look through widening conversion. */
5239 : 7226 : if (is_gimple_assign (def)
5240 : 7226 : && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
5241 : : {
5242 : 0 : tree rhs1 = gimple_assign_rhs1 (def);
5243 : 0 : if (TREE_CODE (rhs1) == SSA_NAME
5244 : 0 : && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
5245 : 0 : && (TYPE_PRECISION (TREE_TYPE (off))
5246 : 0 : > TYPE_PRECISION (TREE_TYPE (rhs1))))
5247 : 0 : def = SSA_NAME_DEF_STMT (rhs1);
5248 : : }
5249 : 7226 : if (is_gimple_call (def)
5250 : 7090 : && gimple_call_internal_p (def)
5251 : 14316 : && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
5252 : : {
5253 : 7090 : tree arg = gimple_call_arg (def, 0);
5254 : 7090 : tree reft = TREE_TYPE (DR_REF (newdr));
5255 : 7090 : gcc_assert (TREE_CODE (arg) == SSA_NAME);
5256 : 7090 : arg = SSA_NAME_VAR (arg);
5257 : 7090 : if (arg == loop->simduid
5258 : : /* For now. */
5259 : 7090 : && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
5260 : : {
5261 : 7065 : DR_BASE_ADDRESS (newdr) = base_address;
5262 : 7065 : DR_OFFSET (newdr) = ssize_int (0);
5263 : 7065 : DR_STEP (newdr) = step;
5264 : 7065 : DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
5265 : 7065 : DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
5266 : : /* Mark as simd-lane access. */
5267 : 7065 : tree arg2 = gimple_call_arg (def, 1);
5268 : 7065 : newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
5269 : 7065 : free_data_ref (dr);
5270 : 7065 : datarefs->safe_push (newdr);
5271 : 7065 : if (dataref_groups)
5272 : 0 : dataref_groups->safe_push (group_id);
5273 : 7065 : return opt_result::success ();
5274 : : }
5275 : : }
5276 : : }
5277 : : }
5278 : 686 : free_data_ref (newdr);
5279 : : }
5280 : :
5281 : 10762693 : datarefs->safe_push (dr);
5282 : 10762693 : if (dataref_groups)
5283 : 10169096 : dataref_groups->safe_push (group_id);
5284 : 10762693 : return opt_result::success ();
5285 : 30195168 : }
5286 : :
5287 : : /* Function vect_analyze_data_refs.
5288 : :
5289 : : Find all the data references in the loop or basic block.
5290 : :
5291 : : The general structure of the analysis of data refs in the vectorizer is as
5292 : : follows:
5293 : : 1- vect_analyze_data_refs(loop/bb): call
5294 : : compute_data_dependences_for_loop/bb to find and analyze all data-refs
5295 : : in the loop/bb and their dependences.
5296 : : 2- vect_analyze_dependences(): apply dependence testing using ddrs.
5297 : : 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
5298 : : 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
5299 : :
5300 : : */
5301 : :
5302 : : opt_result
5303 : 2655866 : vect_analyze_data_refs (vec_info *vinfo, bool *fatal)
5304 : : {
5305 : 2655866 : class loop *loop = NULL;
5306 : 2655866 : unsigned int i;
5307 : 2655866 : struct data_reference *dr;
5308 : 2655866 : tree scalar_type;
5309 : :
5310 : 2655866 : DUMP_VECT_SCOPE ("vect_analyze_data_refs");
5311 : :
5312 : 2655866 : if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
5313 : 433977 : loop = LOOP_VINFO_LOOP (loop_vinfo);
5314 : :
5315 : : /* Go through the data-refs, check that the analysis succeeded. Update
5316 : : pointer from stmt_vec_info struct to DR and vectype. */
5317 : :
5318 : 2655866 : vec<data_reference_p> datarefs = vinfo->shared->datarefs;
5319 : 17714344 : FOR_EACH_VEC_ELT (datarefs, i, dr)
5320 : : {
5321 : 15117059 : enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
5322 : :
5323 : 15117059 : gcc_assert (DR_REF (dr));
5324 : 15117059 : stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
5325 : 15117059 : gcc_assert (!stmt_info->dr_aux.dr);
5326 : 15117059 : stmt_info->dr_aux.dr = dr;
5327 : 15117059 : stmt_info->dr_aux.stmt = stmt_info;
5328 : :
5329 : : /* Check that analysis of the data-ref succeeded. */
5330 : 15117059 : if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
5331 : 14993811 : || !DR_STEP (dr))
5332 : : {
5333 : 246496 : bool maybe_gather
5334 : 123248 : = DR_IS_READ (dr)
5335 : 123248 : && !TREE_THIS_VOLATILE (DR_REF (dr));
5336 : 246496 : bool maybe_scatter
5337 : : = DR_IS_WRITE (dr)
5338 : 123248 : && !TREE_THIS_VOLATILE (DR_REF (dr));
5339 : :
5340 : : /* If target supports vector gather loads or scatter stores,
5341 : : see if they can't be used. */
5342 : 123248 : if (is_a <loop_vec_info> (vinfo)
5343 : 123248 : && !nested_in_vect_loop_p (loop, stmt_info))
5344 : : {
5345 : 119873 : if (maybe_gather || maybe_scatter)
5346 : : {
5347 : 119873 : if (maybe_gather)
5348 : : gatherscatter = GATHER;
5349 : : else
5350 : 25497 : gatherscatter = SCATTER;
5351 : : }
5352 : : }
5353 : :
5354 : 25497 : if (gatherscatter == SG_NONE)
5355 : : {
5356 : 3375 : if (dump_enabled_p ())
5357 : 5 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5358 : : "not vectorized: data ref analysis "
5359 : : "failed %G", stmt_info->stmt);
5360 : 3375 : if (is_a <bb_vec_info> (vinfo))
5361 : : {
5362 : : /* In BB vectorization the ref can still participate
5363 : : in dependence analysis, we just can't vectorize it. */
5364 : 3021 : STMT_VINFO_VECTORIZABLE (stmt_info) = false;
5365 : 3021 : continue;
5366 : : }
5367 : 354 : return opt_result::failure_at (stmt_info->stmt,
5368 : : "not vectorized:"
5369 : : " data ref analysis failed: %G",
5370 : : stmt_info->stmt);
5371 : : }
5372 : : }
5373 : :
5374 : : /* See if this was detected as SIMD lane access. */
5375 : 15113684 : if (dr->aux == (void *)-1
5376 : 15113684 : || dr->aux == (void *)-2
5377 : 15104782 : || dr->aux == (void *)-3
5378 : 15103942 : || dr->aux == (void *)-4)
5379 : : {
5380 : 10542 : if (nested_in_vect_loop_p (loop, stmt_info))
5381 : 0 : return opt_result::failure_at (stmt_info->stmt,
5382 : : "not vectorized:"
5383 : : " data ref analysis failed: %G",
5384 : : stmt_info->stmt);
5385 : 10542 : STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
5386 : 10542 : = -(uintptr_t) dr->aux;
5387 : : }
5388 : :
5389 : 15113684 : tree base = get_base_address (DR_REF (dr));
5390 : 15113684 : if (base && VAR_P (base) && DECL_NONALIASED (base))
5391 : : {
5392 : 8272 : if (dump_enabled_p ())
5393 : 186 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5394 : : "not vectorized: base object not addressable "
5395 : : "for stmt: %G", stmt_info->stmt);
5396 : 8272 : if (is_a <bb_vec_info> (vinfo))
5397 : : {
5398 : : /* In BB vectorization the ref can still participate
5399 : : in dependence analysis, we just can't vectorize it. */
5400 : 8272 : STMT_VINFO_VECTORIZABLE (stmt_info) = false;
5401 : 8272 : continue;
5402 : : }
5403 : 0 : return opt_result::failure_at (stmt_info->stmt,
5404 : : "not vectorized: base object not"
5405 : : " addressable for stmt: %G",
5406 : : stmt_info->stmt);
5407 : : }
5408 : :
5409 : 15105412 : if (is_a <loop_vec_info> (vinfo)
5410 : 979264 : && DR_STEP (dr)
5411 : 15964803 : && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
5412 : : {
5413 : 41394 : if (nested_in_vect_loop_p (loop, stmt_info))
5414 : 372 : return opt_result::failure_at (stmt_info->stmt,
5415 : : "not vectorized: "
5416 : : "not suitable for strided load %G",
5417 : : stmt_info->stmt);
5418 : 41022 : STMT_VINFO_STRIDED_P (stmt_info) = true;
5419 : : }
5420 : :
5421 : : /* Update DR field in stmt_vec_info struct. */
5422 : :
5423 : : /* If the dataref is in an inner-loop of the loop that is considered for
5424 : : for vectorization, we also want to analyze the access relative to
5425 : : the outer-loop (DR contains information only relative to the
5426 : : inner-most enclosing loop). We do that by building a reference to the
5427 : : first location accessed by the inner-loop, and analyze it relative to
5428 : : the outer-loop. */
5429 : 15105040 : if (loop && nested_in_vect_loop_p (loop, stmt_info))
5430 : : {
5431 : : /* Build a reference to the first location accessed by the
5432 : : inner loop: *(BASE + INIT + OFFSET). By construction,
5433 : : this address must be invariant in the inner loop, so we
5434 : : can consider it as being used in the outer loop. */
5435 : 11598 : tree base = unshare_expr (DR_BASE_ADDRESS (dr));
5436 : 11598 : tree offset = unshare_expr (DR_OFFSET (dr));
5437 : 11598 : tree init = unshare_expr (DR_INIT (dr));
5438 : 11598 : tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
5439 : : init, offset);
5440 : 11598 : tree init_addr = fold_build_pointer_plus (base, init_offset);
5441 : 11598 : tree init_ref = build_fold_indirect_ref (init_addr);
5442 : :
5443 : 11598 : if (dump_enabled_p ())
5444 : 1181 : dump_printf_loc (MSG_NOTE, vect_location,
5445 : : "analyze in outer loop: %T\n", init_ref);
5446 : :
5447 : 11598 : opt_result res
5448 : 11598 : = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
5449 : 11598 : init_ref, loop, stmt_info->stmt);
5450 : 11598 : if (!res)
5451 : : /* dr_analyze_innermost already explained the failure. */
5452 : 161 : return res;
5453 : :
5454 : 11437 : if (dump_enabled_p ())
5455 : 1177 : dump_printf_loc (MSG_NOTE, vect_location,
5456 : : "\touter base_address: %T\n"
5457 : : "\touter offset from base address: %T\n"
5458 : : "\touter constant offset from base address: %T\n"
5459 : : "\touter step: %T\n"
5460 : : "\touter base alignment: %d\n\n"
5461 : : "\touter base misalignment: %d\n"
5462 : : "\touter offset alignment: %d\n"
5463 : : "\touter step alignment: %d\n",
5464 : : STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
5465 : : STMT_VINFO_DR_OFFSET (stmt_info),
5466 : : STMT_VINFO_DR_INIT (stmt_info),
5467 : : STMT_VINFO_DR_STEP (stmt_info),
5468 : : STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
5469 : : STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
5470 : : STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
5471 : : STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
5472 : : }
5473 : :
5474 : : /* Set vectype for STMT. */
5475 : 15104879 : scalar_type = TREE_TYPE (DR_REF (dr));
5476 : 15104879 : tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
5477 : 15104879 : if (!vectype)
5478 : : {
5479 : 1817588 : if (dump_enabled_p ())
5480 : : {
5481 : 1798 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5482 : : "not vectorized: no vectype for stmt: %G",
5483 : : stmt_info->stmt);
5484 : 1798 : dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
5485 : 1798 : dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
5486 : : scalar_type);
5487 : 1798 : dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
5488 : : }
5489 : :
5490 : 1817588 : if (is_a <bb_vec_info> (vinfo))
5491 : : {
5492 : : /* No vector type is fine, the ref can still participate
5493 : : in dependence analysis, we just can't vectorize it. */
5494 : 1768169 : STMT_VINFO_VECTORIZABLE (stmt_info) = false;
5495 : 1768169 : continue;
5496 : : }
5497 : 49419 : if (fatal)
5498 : 49419 : *fatal = false;
5499 : 49419 : return opt_result::failure_at (stmt_info->stmt,
5500 : : "not vectorized:"
5501 : : " no vectype for stmt: %G"
5502 : : " scalar_type: %T\n",
5503 : : stmt_info->stmt, scalar_type);
5504 : : }
5505 : : else
5506 : : {
5507 : 13287291 : if (dump_enabled_p ())
5508 : 79150 : dump_printf_loc (MSG_NOTE, vect_location,
5509 : : "got vectype for stmt: %G%T\n",
5510 : : stmt_info->stmt, vectype);
5511 : : }
5512 : :
5513 : : /* Leave the BB vectorizer to pick the vector type later, based on
5514 : : the final dataref group size and SLP node size. */
5515 : 13287291 : if (is_a <loop_vec_info> (vinfo))
5516 : 929312 : STMT_VINFO_VECTYPE (stmt_info) = vectype;
5517 : :
5518 : 13287291 : if (gatherscatter != SG_NONE)
5519 : : {
5520 : 114248 : gather_scatter_info gs_info;
5521 : 114248 : if (!vect_check_gather_scatter (stmt_info, vectype,
5522 : : as_a <loop_vec_info> (vinfo),
5523 : : &gs_info)
5524 : 223972 : || !get_vectype_for_scalar_type (vinfo,
5525 : 109724 : TREE_TYPE (gs_info.offset)))
5526 : : {
5527 : 8275 : if (fatal)
5528 : 8275 : *fatal = false;
5529 : 8275 : return opt_result::failure_at
5530 : 8623 : (stmt_info->stmt,
5531 : : (gatherscatter == GATHER)
5532 : : ? "not vectorized: not suitable for gather load %G"
5533 : : : "not vectorized: not suitable for scatter store %G",
5534 : : stmt_info->stmt);
5535 : : }
5536 : 105973 : STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
5537 : : }
5538 : : }
5539 : :
5540 : : /* We used to stop processing and prune the list here. Verify we no
5541 : : longer need to. */
5542 : 4126855 : gcc_assert (i == datarefs.length ());
5543 : :
5544 : 2597285 : return opt_result::success ();
5545 : : }
5546 : :
5547 : :
5548 : : /* Function vect_get_new_vect_var.
5549 : :
5550 : : Returns a name for a new variable. The current naming scheme appends the
5551 : : prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
5552 : : the name of vectorizer generated variables, and appends that to NAME if
5553 : : provided. */
5554 : :
5555 : : tree
5556 : 1915110 : vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
5557 : : {
5558 : 1915110 : const char *prefix;
5559 : 1915110 : tree new_vect_var;
5560 : :
5561 : 1915110 : switch (var_kind)
5562 : : {
5563 : : case vect_simple_var:
5564 : : prefix = "vect";
5565 : : break;
5566 : 22888 : case vect_scalar_var:
5567 : 22888 : prefix = "stmp";
5568 : 22888 : break;
5569 : 20072 : case vect_mask_var:
5570 : 20072 : prefix = "mask";
5571 : 20072 : break;
5572 : 1375784 : case vect_pointer_var:
5573 : 1375784 : prefix = "vectp";
5574 : 1375784 : break;
5575 : 0 : default:
5576 : 0 : gcc_unreachable ();
5577 : : }
5578 : :
5579 : 1915110 : if (name)
5580 : : {
5581 : 1079849 : char* tmp = concat (prefix, "_", name, NULL);
5582 : 1079849 : new_vect_var = create_tmp_reg (type, tmp);
5583 : 1079849 : free (tmp);
5584 : : }
5585 : : else
5586 : 835261 : new_vect_var = create_tmp_reg (type, prefix);
5587 : :
5588 : 1915110 : return new_vect_var;
5589 : : }
5590 : :
5591 : : /* Like vect_get_new_vect_var but return an SSA name. */
5592 : :
5593 : : tree
5594 : 6857 : vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
5595 : : {
5596 : 6857 : const char *prefix;
5597 : 6857 : tree new_vect_var;
5598 : :
5599 : 6857 : switch (var_kind)
5600 : : {
5601 : : case vect_simple_var:
5602 : : prefix = "vect";
5603 : : break;
5604 : 312 : case vect_scalar_var:
5605 : 312 : prefix = "stmp";
5606 : 312 : break;
5607 : 0 : case vect_pointer_var:
5608 : 0 : prefix = "vectp";
5609 : 0 : break;
5610 : 0 : default:
5611 : 0 : gcc_unreachable ();
5612 : : }
5613 : :
5614 : 6857 : if (name)
5615 : : {
5616 : 6380 : char* tmp = concat (prefix, "_", name, NULL);
5617 : 6380 : new_vect_var = make_temp_ssa_name (type, NULL, tmp);
5618 : 6380 : free (tmp);
5619 : : }
5620 : : else
5621 : 477 : new_vect_var = make_temp_ssa_name (type, NULL, prefix);
5622 : :
5623 : 6857 : return new_vect_var;
5624 : : }
5625 : :
5626 : : /* Duplicate points-to info on NAME from DR_INFO. */
5627 : :
5628 : : static void
5629 : 284529 : vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
5630 : : {
5631 : 284529 : duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
5632 : : /* DR_PTR_INFO is for a base SSA name, not including constant or
5633 : : variable offsets in the ref so its alignment info does not apply. */
5634 : 284529 : mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
5635 : 284529 : }
5636 : :
5637 : : /* Function vect_create_addr_base_for_vector_ref.
5638 : :
5639 : : Create an expression that computes the address of the first memory location
5640 : : that will be accessed for a data reference.
5641 : :
5642 : : Input:
5643 : : STMT_INFO: The statement containing the data reference.
5644 : : NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
5645 : : OFFSET: Optional. If supplied, it is be added to the initial address.
5646 : : LOOP: Specify relative to which loop-nest should the address be computed.
5647 : : For example, when the dataref is in an inner-loop nested in an
5648 : : outer-loop that is now being vectorized, LOOP can be either the
5649 : : outer-loop, or the inner-loop. The first memory location accessed
5650 : : by the following dataref ('in' points to short):
5651 : :
5652 : : for (i=0; i<N; i++)
5653 : : for (j=0; j<M; j++)
5654 : : s += in[i+j]
5655 : :
5656 : : is as follows:
5657 : : if LOOP=i_loop: &in (relative to i_loop)
5658 : : if LOOP=j_loop: &in+i*2B (relative to j_loop)
5659 : :
5660 : : Output:
5661 : : 1. Return an SSA_NAME whose value is the address of the memory location of
5662 : : the first vector of the data reference.
5663 : : 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
5664 : : these statement(s) which define the returned SSA_NAME.
5665 : :
5666 : : FORNOW: We are only handling array accesses with step 1. */
5667 : :
5668 : : tree
5669 : 688056 : vect_create_addr_base_for_vector_ref (vec_info *vinfo, stmt_vec_info stmt_info,
5670 : : gimple_seq *new_stmt_list,
5671 : : tree offset)
5672 : : {
5673 : 688056 : dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5674 : 688056 : struct data_reference *dr = dr_info->dr;
5675 : 688056 : const char *base_name;
5676 : 688056 : tree addr_base;
5677 : 688056 : tree dest;
5678 : 688056 : gimple_seq seq = NULL;
5679 : 688056 : tree vect_ptr_type;
5680 : 688056 : loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5681 : 688056 : innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
5682 : :
5683 : 688056 : tree data_ref_base = unshare_expr (drb->base_address);
5684 : 688056 : tree base_offset = unshare_expr (get_dr_vinfo_offset (vinfo, dr_info, true));
5685 : 688056 : tree init = unshare_expr (drb->init);
5686 : :
5687 : 688056 : if (loop_vinfo)
5688 : 127394 : base_name = get_name (data_ref_base);
5689 : : else
5690 : : {
5691 : 560662 : base_offset = ssize_int (0);
5692 : 560662 : init = ssize_int (0);
5693 : 560662 : base_name = get_name (DR_REF (dr));
5694 : : }
5695 : :
5696 : : /* Create base_offset */
5697 : 688056 : base_offset = size_binop (PLUS_EXPR,
5698 : : fold_convert (sizetype, base_offset),
5699 : : fold_convert (sizetype, init));
5700 : :
5701 : 688056 : if (offset)
5702 : : {
5703 : 3148 : offset = fold_convert (sizetype, offset);
5704 : 3148 : base_offset = fold_build2 (PLUS_EXPR, sizetype,
5705 : : base_offset, offset);
5706 : : }
5707 : :
5708 : : /* base + base_offset */
5709 : 688056 : if (loop_vinfo)
5710 : 127394 : addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
5711 : : else
5712 : 1121324 : addr_base = build1 (ADDR_EXPR,
5713 : 560662 : build_pointer_type (TREE_TYPE (DR_REF (dr))),
5714 : : /* Strip zero offset components since we don't need
5715 : : them and they can confuse late diagnostics if
5716 : : we CSE them wrongly. See PR106904 for example. */
5717 : : unshare_expr (strip_zero_offset_components
5718 : : (DR_REF (dr))));
5719 : :
5720 : 688056 : vect_ptr_type = build_pointer_type (TREE_TYPE (DR_REF (dr)));
5721 : 688056 : dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
5722 : 688056 : addr_base = force_gimple_operand (addr_base, &seq, true, dest);
5723 : 688056 : gimple_seq_add_seq (new_stmt_list, seq);
5724 : :
5725 : 688056 : if (DR_PTR_INFO (dr)
5726 : 176878 : && TREE_CODE (addr_base) == SSA_NAME
5727 : : /* We should only duplicate pointer info to newly created SSA names. */
5728 : 864473 : && SSA_NAME_VAR (addr_base) == dest)
5729 : : {
5730 : 147999 : gcc_assert (!SSA_NAME_PTR_INFO (addr_base));
5731 : 147999 : vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
5732 : : }
5733 : :
5734 : 688056 : if (dump_enabled_p ())
5735 : 24656 : dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
5736 : :
5737 : 688056 : return addr_base;
5738 : : }
5739 : :
5740 : :
5741 : : /* Function vect_create_data_ref_ptr.
5742 : :
5743 : : Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
5744 : : location accessed in the loop by STMT_INFO, along with the def-use update
5745 : : chain to appropriately advance the pointer through the loop iterations.
5746 : : Also set aliasing information for the pointer. This pointer is used by
5747 : : the callers to this function to create a memory reference expression for
5748 : : vector load/store access.
5749 : :
5750 : : Input:
5751 : : 1. STMT_INFO: a stmt that references memory. Expected to be of the form
5752 : : GIMPLE_ASSIGN <name, data-ref> or
5753 : : GIMPLE_ASSIGN <data-ref, name>.
5754 : : 2. AGGR_TYPE: the type of the reference, which should be either a vector
5755 : : or an array.
5756 : : 3. AT_LOOP: the loop where the vector memref is to be created.
5757 : : 4. OFFSET (optional): a byte offset to be added to the initial address
5758 : : accessed by the data-ref in STMT_INFO.
5759 : : 5. BSI: location where the new stmts are to be placed if there is no loop
5760 : : 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
5761 : : pointing to the initial address.
5762 : : 8. IV_STEP (optional, defaults to NULL): the amount that should be added
5763 : : to the IV during each iteration of the loop. NULL says to move
5764 : : by one copy of AGGR_TYPE up or down, depending on the step of the
5765 : : data reference.
5766 : :
5767 : : Output:
5768 : : 1. Declare a new ptr to vector_type, and have it point to the base of the
5769 : : data reference (initial addressed accessed by the data reference).
5770 : : For example, for vector of type V8HI, the following code is generated:
5771 : :
5772 : : v8hi *ap;
5773 : : ap = (v8hi *)initial_address;
5774 : :
5775 : : if OFFSET is not supplied:
5776 : : initial_address = &a[init];
5777 : : if OFFSET is supplied:
5778 : : initial_address = &a[init] + OFFSET;
5779 : : if BYTE_OFFSET is supplied:
5780 : : initial_address = &a[init] + BYTE_OFFSET;
5781 : :
5782 : : Return the initial_address in INITIAL_ADDRESS.
5783 : :
5784 : : 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
5785 : : update the pointer in each iteration of the loop.
5786 : :
5787 : : Return the increment stmt that updates the pointer in PTR_INCR.
5788 : :
5789 : : 3. Return the pointer. */
5790 : :
5791 : : tree
5792 : 687728 : vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
5793 : : tree aggr_type, class loop *at_loop, tree offset,
5794 : : tree *initial_address, gimple_stmt_iterator *gsi,
5795 : : gimple **ptr_incr, bool only_init,
5796 : : tree iv_step)
5797 : : {
5798 : 687728 : const char *base_name;
5799 : 687728 : loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5800 : 687728 : class loop *loop = NULL;
5801 : 687728 : bool nested_in_vect_loop = false;
5802 : 687728 : class loop *containing_loop = NULL;
5803 : 687728 : tree aggr_ptr_type;
5804 : 687728 : tree aggr_ptr;
5805 : 687728 : tree new_temp;
5806 : 687728 : gimple_seq new_stmt_list = NULL;
5807 : 687728 : edge pe = NULL;
5808 : 687728 : basic_block new_bb;
5809 : 687728 : tree aggr_ptr_init;
5810 : 687728 : dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5811 : 687728 : struct data_reference *dr = dr_info->dr;
5812 : 687728 : tree aptr;
5813 : 687728 : gimple_stmt_iterator incr_gsi;
5814 : 687728 : bool insert_after;
5815 : 687728 : tree indx_before_incr, indx_after_incr;
5816 : 687728 : gimple *incr;
5817 : 687728 : bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
5818 : :
5819 : 687728 : gcc_assert (iv_step != NULL_TREE
5820 : : || TREE_CODE (aggr_type) == ARRAY_TYPE
5821 : : || TREE_CODE (aggr_type) == VECTOR_TYPE);
5822 : :
5823 : 687728 : if (loop_vinfo)
5824 : : {
5825 : 127066 : loop = LOOP_VINFO_LOOP (loop_vinfo);
5826 : 127066 : nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5827 : 127066 : containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5828 : 127066 : pe = loop_preheader_edge (loop);
5829 : : }
5830 : : else
5831 : : {
5832 : 560662 : gcc_assert (bb_vinfo);
5833 : 560662 : only_init = true;
5834 : 560662 : *ptr_incr = NULL;
5835 : : }
5836 : :
5837 : : /* Create an expression for the first address accessed by this load
5838 : : in LOOP. */
5839 : 687728 : base_name = get_name (DR_BASE_ADDRESS (dr));
5840 : :
5841 : 687728 : if (dump_enabled_p ())
5842 : : {
5843 : 24571 : tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
5844 : 24571 : dump_printf_loc (MSG_NOTE, vect_location,
5845 : : "create %s-pointer variable to type: %T",
5846 : 24571 : get_tree_code_name (TREE_CODE (aggr_type)),
5847 : : aggr_type);
5848 : 24571 : if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
5849 : 13272 : dump_printf (MSG_NOTE, " vectorizing an array ref: ");
5850 : 11299 : else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
5851 : 0 : dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
5852 : 11299 : else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
5853 : 1608 : dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
5854 : : else
5855 : 9691 : dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
5856 : 24571 : dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
5857 : : }
5858 : :
5859 : : /* (1) Create the new aggregate-pointer variable.
5860 : : Vector and array types inherit the alias set of their component
5861 : : type by default so we need to use a ref-all pointer if the data
5862 : : reference does not conflict with the created aggregated data
5863 : : reference because it is not addressable. */
5864 : 687728 : bool need_ref_all = false;
5865 : 687728 : if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5866 : : get_alias_set (DR_REF (dr))))
5867 : : need_ref_all = true;
5868 : : /* Likewise for any of the data references in the stmt group. */
5869 : 586102 : else if (DR_GROUP_SIZE (stmt_info) > 1)
5870 : : {
5871 : 473724 : stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
5872 : 1318282 : do
5873 : : {
5874 : 1318282 : struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
5875 : 1318282 : if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5876 : : get_alias_set (DR_REF (sdr))))
5877 : : {
5878 : : need_ref_all = true;
5879 : : break;
5880 : : }
5881 : 1317151 : sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
5882 : : }
5883 : 1317151 : while (sinfo);
5884 : : }
5885 : 687728 : aggr_ptr_type = build_pointer_type_for_mode (aggr_type, VOIDmode,
5886 : : need_ref_all);
5887 : 687728 : aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
5888 : :
5889 : :
5890 : : /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
5891 : : vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
5892 : : def-use update cycles for the pointer: one relative to the outer-loop
5893 : : (LOOP), which is what steps (3) and (4) below do. The other is relative
5894 : : to the inner-loop (which is the inner-most loop containing the dataref),
5895 : : and this is done be step (5) below.
5896 : :
5897 : : When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
5898 : : inner-most loop, and so steps (3),(4) work the same, and step (5) is
5899 : : redundant. Steps (3),(4) create the following:
5900 : :
5901 : : vp0 = &base_addr;
5902 : : LOOP: vp1 = phi(vp0,vp2)
5903 : : ...
5904 : : ...
5905 : : vp2 = vp1 + step
5906 : : goto LOOP
5907 : :
5908 : : If there is an inner-loop nested in loop, then step (5) will also be
5909 : : applied, and an additional update in the inner-loop will be created:
5910 : :
5911 : : vp0 = &base_addr;
5912 : : LOOP: vp1 = phi(vp0,vp2)
5913 : : ...
5914 : : inner: vp3 = phi(vp1,vp4)
5915 : : vp4 = vp3 + inner_step
5916 : : if () goto inner
5917 : : ...
5918 : : vp2 = vp1 + step
5919 : : if () goto LOOP */
5920 : :
5921 : : /* (2) Calculate the initial address of the aggregate-pointer, and set
5922 : : the aggregate-pointer to point to it before the loop. */
5923 : :
5924 : : /* Create: (&(base[init_val]+offset) in the loop preheader. */
5925 : :
5926 : 687728 : new_temp = vect_create_addr_base_for_vector_ref (vinfo,
5927 : : stmt_info, &new_stmt_list,
5928 : : offset);
5929 : 687728 : if (new_stmt_list)
5930 : : {
5931 : 170099 : if (pe)
5932 : : {
5933 : 54636 : new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
5934 : 54636 : gcc_assert (!new_bb);
5935 : : }
5936 : : else
5937 : 115463 : gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
5938 : : }
5939 : :
5940 : 687728 : *initial_address = new_temp;
5941 : 687728 : aggr_ptr_init = new_temp;
5942 : :
5943 : : /* (3) Handle the updating of the aggregate-pointer inside the loop.
5944 : : This is needed when ONLY_INIT is false, and also when AT_LOOP is the
5945 : : inner-loop nested in LOOP (during outer-loop vectorization). */
5946 : :
5947 : : /* No update in loop is required. */
5948 : 687728 : if (only_init && (!loop_vinfo || at_loop == loop))
5949 : : aptr = aggr_ptr_init;
5950 : : else
5951 : : {
5952 : : /* Accesses to invariant addresses should be handled specially
5953 : : by the caller. */
5954 : 127058 : tree step = vect_dr_behavior (vinfo, dr_info)->step;
5955 : 127058 : gcc_assert (!integer_zerop (step));
5956 : :
5957 : 127058 : if (iv_step == NULL_TREE)
5958 : : {
5959 : : /* The step of the aggregate pointer is the type size,
5960 : : negated for downward accesses. */
5961 : 0 : iv_step = TYPE_SIZE_UNIT (aggr_type);
5962 : 0 : if (tree_int_cst_sgn (step) == -1)
5963 : 0 : iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
5964 : : }
5965 : :
5966 : 127058 : standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5967 : :
5968 : 127058 : create_iv (aggr_ptr_init, PLUS_EXPR,
5969 : : iv_step, aggr_ptr, loop, &incr_gsi, insert_after,
5970 : : &indx_before_incr, &indx_after_incr);
5971 : 127058 : incr = gsi_stmt (incr_gsi);
5972 : :
5973 : : /* Copy the points-to information if it exists. */
5974 : 127058 : if (DR_PTR_INFO (dr))
5975 : : {
5976 : 68190 : vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5977 : 68190 : vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5978 : : }
5979 : 127058 : if (ptr_incr)
5980 : 127058 : *ptr_incr = incr;
5981 : :
5982 : 127058 : aptr = indx_before_incr;
5983 : : }
5984 : :
5985 : 687728 : if (!nested_in_vect_loop || only_init)
5986 : : return aptr;
5987 : :
5988 : :
5989 : : /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
5990 : : nested in LOOP, if exists. */
5991 : :
5992 : 333 : gcc_assert (nested_in_vect_loop);
5993 : 333 : if (!only_init)
5994 : : {
5995 : 333 : standard_iv_increment_position (containing_loop, &incr_gsi,
5996 : : &insert_after);
5997 : 333 : create_iv (aptr, PLUS_EXPR, DR_STEP (dr),
5998 : : aggr_ptr, containing_loop, &incr_gsi, insert_after,
5999 : : &indx_before_incr, &indx_after_incr);
6000 : 333 : incr = gsi_stmt (incr_gsi);
6001 : :
6002 : : /* Copy the points-to information if it exists. */
6003 : 333 : if (DR_PTR_INFO (dr))
6004 : : {
6005 : 75 : vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
6006 : 75 : vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
6007 : : }
6008 : 333 : if (ptr_incr)
6009 : 333 : *ptr_incr = incr;
6010 : :
6011 : 333 : return indx_before_incr;
6012 : : }
6013 : : else
6014 : : gcc_unreachable ();
6015 : : }
6016 : :
6017 : :
6018 : : /* Function bump_vector_ptr
6019 : :
6020 : : Increment a pointer (to a vector type) by vector-size. If requested,
6021 : : i.e. if PTR-INCR is given, then also connect the new increment stmt
6022 : : to the existing def-use update-chain of the pointer, by modifying
6023 : : the PTR_INCR as illustrated below:
6024 : :
6025 : : The pointer def-use update-chain before this function:
6026 : : DATAREF_PTR = phi (p_0, p_2)
6027 : : ....
6028 : : PTR_INCR: p_2 = DATAREF_PTR + step
6029 : :
6030 : : The pointer def-use update-chain after this function:
6031 : : DATAREF_PTR = phi (p_0, p_2)
6032 : : ....
6033 : : NEW_DATAREF_PTR = DATAREF_PTR + BUMP
6034 : : ....
6035 : : PTR_INCR: p_2 = NEW_DATAREF_PTR + step
6036 : :
6037 : : Input:
6038 : : DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
6039 : : in the loop.
6040 : : PTR_INCR - optional. The stmt that updates the pointer in each iteration of
6041 : : the loop. The increment amount across iterations is expected
6042 : : to be vector_size.
6043 : : BSI - location where the new update stmt is to be placed.
6044 : : STMT_INFO - the original scalar memory-access stmt that is being vectorized.
6045 : : UPDATE - The offset by which to bump the pointer.
6046 : :
6047 : : Output: Return NEW_DATAREF_PTR as illustrated above.
6048 : :
6049 : : */
6050 : :
6051 : : tree
6052 : 232766 : bump_vector_ptr (vec_info *vinfo,
6053 : : tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
6054 : : stmt_vec_info stmt_info, tree update)
6055 : : {
6056 : 232766 : struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
6057 : 232766 : gimple *incr_stmt;
6058 : 232766 : ssa_op_iter iter;
6059 : 232766 : use_operand_p use_p;
6060 : 232766 : tree new_dataref_ptr;
6061 : :
6062 : 232766 : if (TREE_CODE (dataref_ptr) == SSA_NAME)
6063 : 106944 : new_dataref_ptr = copy_ssa_name (dataref_ptr);
6064 : 125822 : else if (is_gimple_min_invariant (dataref_ptr))
6065 : : /* When possible avoid emitting a separate increment stmt that will
6066 : : force the addressed object addressable. */
6067 : 251644 : return build1 (ADDR_EXPR, TREE_TYPE (dataref_ptr),
6068 : 125822 : fold_build2 (MEM_REF,
6069 : : TREE_TYPE (TREE_TYPE (dataref_ptr)),
6070 : : dataref_ptr,
6071 : 125822 : fold_convert (ptr_type_node, update)));
6072 : : else
6073 : 0 : new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
6074 : 106944 : incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
6075 : : dataref_ptr, update);
6076 : 106944 : vect_finish_stmt_generation (vinfo, stmt_info, incr_stmt, gsi);
6077 : : /* Fold the increment, avoiding excessive chains use-def chains of
6078 : : those, leading to compile-time issues for passes until the next
6079 : : forwprop pass which would do this as well. */
6080 : 106944 : gimple_stmt_iterator fold_gsi = gsi_for_stmt (incr_stmt);
6081 : 106944 : if (fold_stmt (&fold_gsi, follow_all_ssa_edges))
6082 : : {
6083 : 68618 : incr_stmt = gsi_stmt (fold_gsi);
6084 : 68618 : update_stmt (incr_stmt);
6085 : : }
6086 : :
6087 : : /* Copy the points-to information if it exists. */
6088 : 106944 : if (DR_PTR_INFO (dr))
6089 : : {
6090 : 68649 : duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
6091 : 68649 : mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
6092 : : }
6093 : :
6094 : 106944 : if (!ptr_incr)
6095 : : return new_dataref_ptr;
6096 : :
6097 : : /* Update the vector-pointer's cross-iteration increment. */
6098 : 110470 : FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
6099 : : {
6100 : 55235 : tree use = USE_FROM_PTR (use_p);
6101 : :
6102 : 55235 : if (use == dataref_ptr)
6103 : 55235 : SET_USE (use_p, new_dataref_ptr);
6104 : : else
6105 : 0 : gcc_assert (operand_equal_p (use, update, 0));
6106 : : }
6107 : :
6108 : : return new_dataref_ptr;
6109 : : }
6110 : :
6111 : :
6112 : : /* Copy memory reference info such as base/clique from the SRC reference
6113 : : to the DEST MEM_REF. */
6114 : :
6115 : : void
6116 : 932968 : vect_copy_ref_info (tree dest, tree src)
6117 : : {
6118 : 932968 : if (TREE_CODE (dest) != MEM_REF)
6119 : : return;
6120 : :
6121 : : tree src_base = src;
6122 : 1876820 : while (handled_component_p (src_base))
6123 : 948523 : src_base = TREE_OPERAND (src_base, 0);
6124 : 928297 : if (TREE_CODE (src_base) != MEM_REF
6125 : 928297 : && TREE_CODE (src_base) != TARGET_MEM_REF)
6126 : : return;
6127 : :
6128 : 502936 : MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
6129 : 502936 : MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
6130 : : }
6131 : :
6132 : :
6133 : : /* Function vect_create_destination_var.
6134 : :
6135 : : Create a new temporary of type VECTYPE. */
6136 : :
6137 : : tree
6138 : 522743 : vect_create_destination_var (tree scalar_dest, tree vectype)
6139 : : {
6140 : 522743 : tree vec_dest;
6141 : 522743 : const char *name;
6142 : 522743 : char *new_name;
6143 : 522743 : tree type;
6144 : 522743 : enum vect_var_kind kind;
6145 : :
6146 : 522743 : kind = vectype
6147 : 1022598 : ? VECTOR_BOOLEAN_TYPE_P (vectype)
6148 : 499855 : ? vect_mask_var
6149 : : : vect_simple_var
6150 : : : vect_scalar_var;
6151 : 22888 : type = vectype ? vectype : TREE_TYPE (scalar_dest);
6152 : :
6153 : 522743 : gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
6154 : :
6155 : 522743 : name = get_name (scalar_dest);
6156 : 522743 : if (name)
6157 : 183148 : new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
6158 : : else
6159 : 339595 : new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
6160 : 522743 : vec_dest = vect_get_new_vect_var (type, kind, new_name);
6161 : 522743 : free (new_name);
6162 : :
6163 : 522743 : return vec_dest;
6164 : : }
6165 : :
6166 : : /* Function vect_grouped_store_supported.
6167 : :
6168 : : Returns TRUE if interleave high and interleave low permutations
6169 : : are supported, and FALSE otherwise. */
6170 : :
6171 : : bool
6172 : 2527 : vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
6173 : : {
6174 : 2527 : machine_mode mode = TYPE_MODE (vectype);
6175 : :
6176 : : /* vect_permute_store_chain requires the group size to be equal to 3 or
6177 : : be a power of two. */
6178 : 2527 : if (count != 3 && exact_log2 (count) == -1)
6179 : : {
6180 : 552 : if (dump_enabled_p ())
6181 : 11 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6182 : : "the size of the group of accesses"
6183 : : " is not a power of 2 or not eqaul to 3\n");
6184 : 552 : return false;
6185 : : }
6186 : :
6187 : : /* Check that the permutation is supported. */
6188 : 1975 : if (VECTOR_MODE_P (mode))
6189 : : {
6190 : 1975 : unsigned int i;
6191 : 1975 : if (count == 3)
6192 : : {
6193 : 921 : unsigned int j0 = 0, j1 = 0, j2 = 0;
6194 : 921 : unsigned int i, j;
6195 : :
6196 : 921 : unsigned int nelt;
6197 : 1842 : if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
6198 : : {
6199 : : if (dump_enabled_p ())
6200 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6201 : : "cannot handle groups of 3 stores for"
6202 : : " variable-length vectors\n");
6203 : : return false;
6204 : : }
6205 : :
6206 : 921 : vec_perm_builder sel (nelt, nelt, 1);
6207 : 921 : sel.quick_grow (nelt);
6208 : 921 : vec_perm_indices indices;
6209 : 3459 : for (j = 0; j < 3; j++)
6210 : : {
6211 : 2613 : int nelt0 = ((3 - j) * nelt) % 3;
6212 : 2613 : int nelt1 = ((3 - j) * nelt + 1) % 3;
6213 : 2613 : int nelt2 = ((3 - j) * nelt + 2) % 3;
6214 : 9291 : for (i = 0; i < nelt; i++)
6215 : : {
6216 : 6678 : if (3 * i + nelt0 < nelt)
6217 : 2264 : sel[3 * i + nelt0] = j0++;
6218 : 6678 : if (3 * i + nelt1 < nelt)
6219 : 2225 : sel[3 * i + nelt1] = nelt + j1++;
6220 : 6678 : if (3 * i + nelt2 < nelt)
6221 : 2189 : sel[3 * i + nelt2] = 0;
6222 : : }
6223 : 2613 : indices.new_vector (sel, 2, nelt);
6224 : 2613 : if (!can_vec_perm_const_p (mode, mode, indices))
6225 : : {
6226 : 66 : if (dump_enabled_p ())
6227 : 37 : dump_printf (MSG_MISSED_OPTIMIZATION,
6228 : : "permutation op not supported by target.\n");
6229 : 66 : return false;
6230 : : }
6231 : :
6232 : 8649 : for (i = 0; i < nelt; i++)
6233 : : {
6234 : 6102 : if (3 * i + nelt0 < nelt)
6235 : 2040 : sel[3 * i + nelt0] = 3 * i + nelt0;
6236 : 6102 : if (3 * i + nelt1 < nelt)
6237 : 2031 : sel[3 * i + nelt1] = 3 * i + nelt1;
6238 : 6102 : if (3 * i + nelt2 < nelt)
6239 : 2031 : sel[3 * i + nelt2] = nelt + j2++;
6240 : : }
6241 : 2547 : indices.new_vector (sel, 2, nelt);
6242 : 2547 : if (!can_vec_perm_const_p (mode, mode, indices))
6243 : : {
6244 : 9 : if (dump_enabled_p ())
6245 : 9 : dump_printf (MSG_MISSED_OPTIMIZATION,
6246 : : "permutation op not supported by target.\n");
6247 : 9 : return false;
6248 : : }
6249 : : }
6250 : : return true;
6251 : 921 : }
6252 : : else
6253 : : {
6254 : : /* If length is not equal to 3 then only power of 2 is supported. */
6255 : 1054 : gcc_assert (pow2p_hwi (count));
6256 : 2108 : poly_uint64 nelt = GET_MODE_NUNITS (mode);
6257 : :
6258 : : /* The encoding has 2 interleaved stepped patterns. */
6259 : 2108 : if(!multiple_p (nelt, 2))
6260 : 1006 : return false;
6261 : 1054 : vec_perm_builder sel (nelt, 2, 3);
6262 : 1054 : sel.quick_grow (6);
6263 : 5270 : for (i = 0; i < 3; i++)
6264 : : {
6265 : 3162 : sel[i * 2] = i;
6266 : 3162 : sel[i * 2 + 1] = i + nelt;
6267 : : }
6268 : 1054 : vec_perm_indices indices (sel, 2, nelt);
6269 : 1054 : if (can_vec_perm_const_p (mode, mode, indices))
6270 : : {
6271 : 7042 : for (i = 0; i < 6; i++)
6272 : 6036 : sel[i] += exact_div (nelt, 2);
6273 : 1006 : indices.new_vector (sel, 2, nelt);
6274 : 1006 : if (can_vec_perm_const_p (mode, mode, indices))
6275 : 1006 : return true;
6276 : : }
6277 : 1054 : }
6278 : : }
6279 : :
6280 : 48 : if (dump_enabled_p ())
6281 : 3 : dump_printf (MSG_MISSED_OPTIMIZATION,
6282 : : "permutation op not supported by target.\n");
6283 : : return false;
6284 : : }
6285 : :
6286 : : /* Return FN if vec_{mask_,mask_len_}store_lanes is available for COUNT vectors
6287 : : of type VECTYPE. MASKED_P says whether the masked form is needed. */
6288 : :
6289 : : internal_fn
6290 : 32517 : vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
6291 : : bool masked_p)
6292 : : {
6293 : 32517 : if (vect_lanes_optab_supported_p ("vec_mask_len_store_lanes",
6294 : : vec_mask_len_store_lanes_optab, vectype,
6295 : : count))
6296 : : return IFN_MASK_LEN_STORE_LANES;
6297 : 32517 : else if (masked_p)
6298 : : {
6299 : 159 : if (vect_lanes_optab_supported_p ("vec_mask_store_lanes",
6300 : : vec_mask_store_lanes_optab, vectype,
6301 : : count))
6302 : : return IFN_MASK_STORE_LANES;
6303 : : }
6304 : : else
6305 : : {
6306 : 32358 : if (vect_lanes_optab_supported_p ("vec_store_lanes",
6307 : : vec_store_lanes_optab, vectype, count))
6308 : : return IFN_STORE_LANES;
6309 : : }
6310 : : return IFN_LAST;
6311 : : }
6312 : :
6313 : :
6314 : : /* Function vect_setup_realignment
6315 : :
6316 : : This function is called when vectorizing an unaligned load using
6317 : : the dr_explicit_realign[_optimized] scheme.
6318 : : This function generates the following code at the loop prolog:
6319 : :
6320 : : p = initial_addr;
6321 : : x msq_init = *(floor(p)); # prolog load
6322 : : realignment_token = call target_builtin;
6323 : : loop:
6324 : : x msq = phi (msq_init, ---)
6325 : :
6326 : : The stmts marked with x are generated only for the case of
6327 : : dr_explicit_realign_optimized.
6328 : :
6329 : : The code above sets up a new (vector) pointer, pointing to the first
6330 : : location accessed by STMT_INFO, and a "floor-aligned" load using that
6331 : : pointer. It also generates code to compute the "realignment-token"
6332 : : (if the relevant target hook was defined), and creates a phi-node at the
6333 : : loop-header bb whose arguments are the result of the prolog-load (created
6334 : : by this function) and the result of a load that takes place in the loop
6335 : : (to be created by the caller to this function).
6336 : :
6337 : : For the case of dr_explicit_realign_optimized:
6338 : : The caller to this function uses the phi-result (msq) to create the
6339 : : realignment code inside the loop, and sets up the missing phi argument,
6340 : : as follows:
6341 : : loop:
6342 : : msq = phi (msq_init, lsq)
6343 : : lsq = *(floor(p')); # load in loop
6344 : : result = realign_load (msq, lsq, realignment_token);
6345 : :
6346 : : For the case of dr_explicit_realign:
6347 : : loop:
6348 : : msq = *(floor(p)); # load in loop
6349 : : p' = p + (VS-1);
6350 : : lsq = *(floor(p')); # load in loop
6351 : : result = realign_load (msq, lsq, realignment_token);
6352 : :
6353 : : Input:
6354 : : STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
6355 : : a memory location that may be unaligned.
6356 : : BSI - place where new code is to be inserted.
6357 : : ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
6358 : : is used.
6359 : :
6360 : : Output:
6361 : : REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
6362 : : target hook, if defined.
6363 : : Return value - the result of the loop-header phi node. */
6364 : :
tree
vect_setup_realignment (vec_info *vinfo, stmt_vec_info stmt_info, tree vectype,
			gimple_stmt_iterator *gsi, tree *realignment_token,
			enum dr_alignment_support alignment_support_scheme,
			tree init_addr,
			class loop **at_loop)
{
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
  dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
  struct data_reference *dr = dr_info->dr;
  class loop *loop = NULL;
  edge pe = NULL;
  tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
  tree vec_dest;
  gimple *inc;
  tree ptr;
  tree data_ref;
  basic_block new_bb;
  tree msq_init = NULL_TREE;
  tree new_temp;
  gphi *phi_stmt;
  tree msq = NULL_TREE;
  gimple_seq stmts = NULL;
  bool compute_in_loop = false;
  bool nested_in_vect_loop = false;
  class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
  class loop *loop_for_initial_load = NULL;

  if (loop_vinfo)
    {
      loop = LOOP_VINFO_LOOP (loop_vinfo);
      nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
    }

  gcc_assert (alignment_support_scheme == dr_explicit_realign
	      || alignment_support_scheme == dr_explicit_realign_optimized);

  /* We need to generate three things:
     1. the misalignment computation
     2. the extra vector load (for the optimized realignment scheme).
     3. the phi node for the two vectors from which the realignment is
      done (for the optimized realignment scheme).  */

  /* 1. Determine where to generate the misalignment computation.

     If INIT_ADDR is NULL_TREE, this indicates that the misalignment
     calculation will be generated by this function, outside the loop (in the
     preheader).  Otherwise, INIT_ADDR had already been computed for us by the
     caller, inside the loop.

     Background: If the misalignment remains fixed throughout the iterations of
     the loop, then both realignment schemes are applicable, and also the
     misalignment computation can be done outside LOOP.  This is because we are
     vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
     are a multiple of VS (the Vector Size), and therefore the misalignment in
     different vectorized LOOP iterations is always the same.
     The problem arises only if the memory access is in an inner-loop nested
     inside LOOP, which is now being vectorized using outer-loop vectorization.
     This is the only case when the misalignment of the memory access may not
     remain fixed throughout the iterations of the inner-loop (as explained in
     detail in vect_supportable_dr_alignment).  In this case, not only is the
     optimized realignment scheme not applicable, but also the misalignment
     computation (and generation of the realignment token that is passed to
     REALIGN_LOAD) have to be done inside the loop.

     In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
     or not, which in turn determines if the misalignment is computed inside
     the inner-loop, or outside LOOP.  */

  if (init_addr != NULL_TREE || !loop_vinfo)
    {
      compute_in_loop = true;
      /* Only the unoptimized scheme supports in-loop computation.  */
      gcc_assert (alignment_support_scheme == dr_explicit_realign);
    }


  /* 2. Determine where to generate the extra vector load.

     For the optimized realignment scheme, instead of generating two vector
     loads in each iteration, we generate a single extra vector load in the
     preheader of the loop, and in each iteration reuse the result of the
     vector load from the previous iteration.  In case the memory access is in
     an inner-loop nested inside LOOP, which is now being vectorized using
     outer-loop vectorization, we need to determine whether this initial vector
     load should be generated at the preheader of the inner-loop, or can be
     generated at the preheader of LOOP.  If the memory access has no evolution
     in LOOP, it can be generated in the preheader of LOOP.  Otherwise, it has
     to be generated inside LOOP (in the preheader of the inner-loop).  */

  if (nested_in_vect_loop)
    {
      tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
      bool invariant_in_outerloop =
	(tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
      loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
    }
  else
    loop_for_initial_load = loop;
  if (at_loop)
    *at_loop = loop_for_initial_load;

  /* Pick the virtual use for the prolog load: prefer the preheader-edge
     argument of the loop header's virtual PHI, falling back to the VUSE
     at the insertion point.  */
  tree vuse = NULL_TREE;
  if (loop_for_initial_load)
    {
      pe = loop_preheader_edge (loop_for_initial_load);
      if (gphi *vphi = get_virtual_phi (loop_for_initial_load->header))
	vuse = PHI_ARG_DEF_FROM_EDGE (vphi, pe);
    }
  if (!vuse)
    vuse = gimple_vuse (gsi_stmt (*gsi));

  /* 3. For the case of the optimized realignment, create the first vector
      load at the loop preheader.  */

  if (alignment_support_scheme == dr_explicit_realign_optimized)
    {
      /* Create msq_init = *(floor(p1)) in the loop preheader  */
      gassign *new_stmt;

      gcc_assert (!compute_in_loop);
      vec_dest = vect_create_destination_var (scalar_dest, vectype);
      ptr = vect_create_data_ref_ptr (vinfo, stmt_info, vectype,
				      loop_for_initial_load, NULL_TREE,
				      &init_addr, NULL, &inc, true);
      if (TREE_CODE (ptr) == SSA_NAME)
	new_temp = copy_ssa_name (ptr);
      else
	new_temp = make_ssa_name (TREE_TYPE (ptr));
      poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
      tree type = TREE_TYPE (ptr);
      /* Round the pointer down to the target alignment boundary:
	 new_temp = ptr & -align.  */
      new_stmt = gimple_build_assign
		   (new_temp, BIT_AND_EXPR, ptr,
		    fold_build2 (MINUS_EXPR, type,
				 build_int_cst (type, 0),
				 build_int_cst (type, align)));
      new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
      gcc_assert (!new_bb);
      data_ref
	= build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
		  build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
      vect_copy_ref_info (data_ref, DR_REF (dr));
      new_stmt = gimple_build_assign (vec_dest, data_ref);
      new_temp = make_ssa_name (vec_dest, new_stmt);
      gimple_assign_set_lhs (new_stmt, new_temp);
      gimple_set_vuse (new_stmt, vuse);
      if (pe)
	{
	  new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
	  /* Edge insertion must not have needed to split the edge.  */
	  gcc_assert (!new_bb);
	}
      else
	gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);

      msq_init = gimple_assign_lhs (new_stmt);
    }

  /* 4. Create realignment token using a target builtin, if available.
      It is done either inside the containing loop, or before LOOP (as
      determined above).  */

  if (targetm.vectorize.builtin_mask_for_load)
    {
      gcall *new_stmt;
      tree builtin_decl;

      /* Compute INIT_ADDR - the initial addressed accessed by this memref.  */
      if (!init_addr)
	{
	  /* Generate the INIT_ADDR computation outside LOOP.  */
	  init_addr = vect_create_addr_base_for_vector_ref (vinfo,
							    stmt_info, &stmts,
							    NULL_TREE);
	  if (loop)
	    {
	      pe = loop_preheader_edge (loop);
	      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
	      gcc_assert (!new_bb);
	    }
	  else
	    gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
	}

      builtin_decl = targetm.vectorize.builtin_mask_for_load ();
      new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
      vec_dest =
	vect_create_destination_var (scalar_dest,
				     gimple_call_return_type (new_stmt));
      new_temp = make_ssa_name (vec_dest, new_stmt);
      gimple_call_set_lhs (new_stmt, new_temp);

      if (compute_in_loop)
	gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
      else
	{
	  /* Generate the misalignment computation outside LOOP.  */
	  pe = loop_preheader_edge (loop);
	  new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
	  gcc_assert (!new_bb);
	}

      *realignment_token = gimple_call_lhs (new_stmt);

      /* The result of the CALL_EXPR to this builtin is determined from
	 the value of the parameter and no global variables are touched
	 which makes the builtin a "const" function.  Requiring the
	 builtin to have the "const" attribute makes it unnecessary
	 to call mark_call_clobbered.  */
      gcc_assert (TREE_READONLY (builtin_decl));
    }

  /* For the unoptimized scheme there is no prolog load and hence no PHI;
     MSQ is still NULL_TREE here.  */
  if (alignment_support_scheme == dr_explicit_realign)
    return msq;

  gcc_assert (!compute_in_loop);
  gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);


  /* 5. Create msq = phi <msq_init, lsq> in loop  */

  pe = loop_preheader_edge (containing_loop);
  vec_dest = vect_create_destination_var (scalar_dest, vectype);
  msq = make_ssa_name (vec_dest);
  phi_stmt = create_phi_node (msq, containing_loop->header);
  /* The latch argument (lsq) is filled in later by the caller.  */
  add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);

  return msq;
}
6592 : :
6593 : :
6594 : : /* Function vect_grouped_load_supported.
6595 : :
6596 : : COUNT is the size of the load group (the number of statements plus the
6597 : : number of gaps). SINGLE_ELEMENT_P is true if there is actually
6598 : : only one statement, with a gap of COUNT - 1.
6599 : :
6600 : : Returns true if a suitable permute exists. */
6601 : :
bool
vect_grouped_load_supported (tree vectype, bool single_element_p,
			     unsigned HOST_WIDE_INT count)
{
  machine_mode mode = TYPE_MODE (vectype);

  /* If this is single-element interleaving with an element distance
     that leaves unused vector loads around punt - we at least create
     very sub-optimal code in that case (and blow up memory,
     see PR65518).  */
  if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "single-element interleaving not supported "
			 "for not adjacent vector loads\n");
      return false;
    }

  /* vect_permute_load_chain requires the group size to be equal to 3 or
     be a power of two.  */
  if (count != 3 && exact_log2 (count) == -1)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "the size of the group of accesses"
			 " is not a power of 2 or not equal to 3\n");
      return false;
    }

  /* Check that the permutation is supported.  */
  if (VECTOR_MODE_P (mode))
    {
      unsigned int i, j;
      if (count == 3)
	{
	  unsigned int nelt;
	  if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "cannot handle groups of 3 loads for"
				 " variable-length vectors\n");
	      return false;
	    }

	  /* For each of the three output vectors (K = 0, 1, 2), check the
	     two permutations used to gather every third element starting
	     at K from the loaded vectors.  */
	  vec_perm_builder sel (nelt, nelt, 1);
	  sel.quick_grow (nelt);
	  vec_perm_indices indices;
	  unsigned int k;
	  for (k = 0; k < 3; k++)
	    {
	      /* First permutation: pick elements 3*i + K from the first
		 two vectors; remaining lanes are don't-care (index 0).  */
	      for (i = 0; i < nelt; i++)
		if (3 * i + k < 2 * nelt)
		  sel[i] = 3 * i + k;
		else
		  sel[i] = 0;
	      indices.new_vector (sel, 2, nelt);
	      if (!can_vec_perm_const_p (mode, mode, indices))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "shuffle of 3 loads is not supported by"
				     " target\n");
		  return false;
		}
	      /* Second permutation: keep the lanes already selected and
		 fill the rest from the third vector.  */
	      for (i = 0, j = 0; i < nelt; i++)
		if (3 * i + k < 2 * nelt)
		  sel[i] = i;
		else
		  sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
	      indices.new_vector (sel, 2, nelt);
	      if (!can_vec_perm_const_p (mode, mode, indices))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "shuffle of 3 loads is not supported by"
				     " target\n");
		  return false;
		}
	    }
	  return true;
	}
      else
	{
	  /* If length is not equal to 3 then only power of 2 is supported.  */
	  gcc_assert (pow2p_hwi (count));
	  poly_uint64 nelt = GET_MODE_NUNITS (mode);

	  /* The encoding has a single stepped pattern.  */
	  /* Extract-even mask: { 0, 2, 4, ... }.  */
	  vec_perm_builder sel (nelt, 1, 3);
	  sel.quick_grow (3);
	  for (i = 0; i < 3; i++)
	    sel[i] = i * 2;
	  vec_perm_indices indices (sel, 2, nelt);
	  if (can_vec_perm_const_p (mode, mode, indices))
	    {
	      /* Extract-odd mask: { 1, 3, 5, ... }.  */
	      for (i = 0; i < 3; i++)
		sel[i] = i * 2 + 1;
	      indices.new_vector (sel, 2, nelt);
	      if (can_vec_perm_const_p (mode, mode, indices))
		return true;
	    }
	}
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
		     "extract even/odd not supported by target\n");
  return false;
}
6713 : :
6714 : : /* Return FN if vec_{masked_,mask_len_}load_lanes is available for COUNT vectors
6715 : : of type VECTYPE. MASKED_P says whether the masked form is needed.
6716 : : If it is available and ELSVALS is nonzero store the possible else values
6717 : : in the vector it points to. */
6718 : :
6719 : : internal_fn
6720 : 131356 : vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
6721 : : bool masked_p, vec<int> *elsvals)
6722 : : {
6723 : 131356 : if (vect_lanes_optab_supported_p ("vec_mask_len_load_lanes",
6724 : : vec_mask_len_load_lanes_optab, vectype,
6725 : : count, elsvals))
6726 : : return IFN_MASK_LEN_LOAD_LANES;
6727 : 131356 : else if (masked_p)
6728 : : {
6729 : 30 : if (vect_lanes_optab_supported_p ("vec_mask_load_lanes",
6730 : : vec_mask_load_lanes_optab, vectype,
6731 : : count, elsvals))
6732 : : return IFN_MASK_LOAD_LANES;
6733 : : }
6734 : : else
6735 : : {
6736 : 131326 : if (vect_lanes_optab_supported_p ("vec_load_lanes", vec_load_lanes_optab,
6737 : : vectype, count, elsvals))
6738 : : return IFN_LOAD_LANES;
6739 : : }
6740 : : return IFN_LAST;
6741 : : }
6742 : :
6743 : : /* Function vect_force_dr_alignment_p.
6744 : :
6745 : : Returns whether the alignment of a DECL can be forced to be aligned
6746 : : on ALIGNMENT bit boundary. */
6747 : :
6748 : : bool
6749 : 659755 : vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
6750 : : {
6751 : 659755 : if (!VAR_P (decl))
6752 : : return false;
6753 : :
6754 : 209046 : if (decl_in_symtab_p (decl)
6755 : 209046 : && (!symtab_node::get (decl)
6756 : 21663 : || !symtab_node::get (decl)->can_increase_alignment_p ()))
6757 : 12920 : return false;
6758 : :
6759 : 196126 : if (TREE_STATIC (decl))
6760 : 8743 : return (known_le (alignment,
6761 : 8743 : (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
6762 : : else
6763 : 187383 : return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
6764 : : }
6765 : :
6766 : : /* Return whether the data reference DR_INFO is supported with respect to its
6767 : : alignment.
6768 : : If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6769 : : it is aligned, i.e., check if it is possible to vectorize it with different
6770 : : alignment. If IS_GATHER_SCATTER is true we are dealing with a
6771 : : gather/scatter. */
6772 : :
enum dr_alignment_support
vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
			       tree vectype, int misalignment,
			       bool is_gather_scatter)
{
  data_reference *dr = dr_info->dr;
  stmt_vec_info stmt_info = dr_info->stmt;
  machine_mode mode = TYPE_MODE (vectype);
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
  class loop *vect_loop = NULL;
  bool nested_in_vect_loop = false;

  /* A known-zero misalignment is trivially aligned; accesses that must
     not fault when read speculatively cannot tolerate misalignment.  */
  if (misalignment == 0)
    return dr_aligned;
  else if (dr_safe_speculative_read_required (stmt_info))
    return dr_unaligned_unsupported;

  if (loop_vinfo)
    {
      vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
      nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
    }

  /* Possibly unaligned access.  */

  /* We can choose between using the implicit realignment scheme (generating
     a misaligned_move stmt) and the explicit realignment scheme (generating
     aligned loads with a REALIGN_LOAD).  There are two variants to the
     explicit realignment scheme: optimized, and unoptimized.
     We can optimize the realignment only if the step between consecutive
     vector loads is equal to the vector size.  Since the vector memory
     accesses advance in steps of VS (Vector Size) in the vectorized loop, it
     is guaranteed that the misalignment amount remains the same throughout the
     execution of the vectorized loop.  Therefore, we can create the
     "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
     at the loop preheader.

     However, in the case of outer-loop vectorization, when vectorizing a
     memory access in the inner-loop nested within the LOOP that is now being
     vectorized, while it is guaranteed that the misalignment of the
     vectorized memory access will remain the same in different outer-loop
     iterations, it is *not* guaranteed that is will remain the same throughout
     the execution of the inner-loop.  This is because the inner-loop advances
     with the original scalar step (and not in steps of VS).  If the inner-loop
     step happens to be a multiple of VS, then the misalignment remains fixed
     and we can use the optimized realignment scheme.  For example:

      for (i=0; i<N; i++)
        for (j=0; j<M; j++)
          s += a[i+j];

     When vectorizing the i-loop in the above example, the step between
     consecutive vector loads is 1, and so the misalignment does not remain
     fixed across the execution of the inner-loop, and the realignment cannot
     be optimized (as illustrated in the following pseudo vectorized loop):

      for (i=0; i<N; i+=4)
        for (j=0; j<M; j++){
          vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
                         // when j is {0,1,2,3,4,5,6,7,...} respectively.
                         // (assuming that we start from an aligned address).
          }

     We therefore have to use the unoptimized realignment scheme:

      for (i=0; i<N; i+=4)
          for (j=k; j<M; j+=4)
          vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
                           // that the misalignment of the initial address is
                           // 0).

     The loop can then be vectorized as follows:

      for (k=0; k<4; k++){
        rt = get_realignment_token (&vp[k]);
        for (i=0; i<N; i+=4){
          v1 = vp[i+k];
          for (j=k; j<M; j+=4){
            v2 = vp[i+j+VS-1];
            va = REALIGN_LOAD <v1,v2,rt>;
            vs += va;
            v1 = v2;
          }
        }
    } */

  if (DR_IS_READ (dr) && !is_gather_scatter)
    {
      /* The explicit realignment schemes apply only when the target has
	 a vec_realign_load pattern and either no mask-for-load builtin
	 or a usable one.  */
      if (can_implement_p (vec_realign_load_optab, mode)
	  && (!targetm.vectorize.builtin_mask_for_load
	      || targetm.vectorize.builtin_mask_for_load ()))
	{
	  /* If we are doing SLP then the accesses need not have the
	     same alignment, instead it depends on the SLP group size.  */
	  if (loop_vinfo
	      && STMT_SLP_TYPE (stmt_info)
	      && STMT_VINFO_GROUPED_ACCESS (stmt_info)
	      && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
			      * (DR_GROUP_SIZE
				 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
			      TYPE_VECTOR_SUBPARTS (vectype)))
	    ;
	  else if (!loop_vinfo
		   || (nested_in_vect_loop
		       && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
				    GET_MODE_SIZE (TYPE_MODE (vectype)))))
	    return dr_explicit_realign;
	  else
	    return dr_explicit_realign_optimized;
	}
    }

  bool is_packed = not_size_aligned (DR_REF (dr));
  /* For gather/scatter with unknown misalignment, derive it from the
     object alignment relative to the element size.  */
  if (misalignment == DR_MISALIGNMENT_UNKNOWN
      && is_gather_scatter)
    misalignment = (get_object_alignment (DR_REF (dr))
		    % (GET_MODE_BITSIZE (GET_MODE_INNER (mode))))
		   / BITS_PER_UNIT;
  /* Finally ask the target whether it can handle the misaligned access.  */
  if (targetm.vectorize.support_vector_misalignment (mode, misalignment,
						     is_packed,
						     is_gather_scatter))
    return dr_unaligned_supported;

  /* Unsupported.  */
  return dr_unaligned_unsupported;
}
|