Branch data Line data Source code
1 : : /* Data References Analysis and Manipulation Utilities for Vectorization.
2 : : Copyright (C) 2003-2025 Free Software Foundation, Inc.
3 : : Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 : : and Ira Rosen <irar@il.ibm.com>
5 : :
6 : : This file is part of GCC.
7 : :
8 : : GCC is free software; you can redistribute it and/or modify it under
9 : : the terms of the GNU General Public License as published by the Free
10 : : Software Foundation; either version 3, or (at your option) any later
11 : : version.
12 : :
13 : : GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 : : WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 : : FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 : : for more details.
17 : :
18 : : You should have received a copy of the GNU General Public License
19 : : along with GCC; see the file COPYING3. If not see
20 : : <http://www.gnu.org/licenses/>. */
21 : :
22 : : #include "config.h"
23 : : #include "system.h"
24 : : #include "coretypes.h"
25 : : #include "backend.h"
26 : : #include "target.h"
27 : : #include "rtl.h"
28 : : #include "tree.h"
29 : : #include "gimple.h"
30 : : #include "predict.h"
31 : : #include "memmodel.h"
32 : : #include "tm_p.h"
33 : : #include "ssa.h"
34 : : #include "optabs-tree.h"
35 : : #include "cgraph.h"
36 : : #include "dumpfile.h"
37 : : #include "pretty-print.h"
38 : : #include "alias.h"
39 : : #include "fold-const.h"
40 : : #include "stor-layout.h"
41 : : #include "tree-eh.h"
42 : : #include "gimplify.h"
43 : : #include "gimple-iterator.h"
44 : : #include "gimplify-me.h"
45 : : #include "tree-ssa-loop-ivopts.h"
46 : : #include "tree-ssa-loop-manip.h"
47 : : #include "tree-ssa-loop.h"
48 : : #include "cfgloop.h"
49 : : #include "tree-scalar-evolution.h"
50 : : #include "tree-vectorizer.h"
51 : : #include "expr.h"
52 : : #include "builtins.h"
53 : : #include "tree-cfg.h"
54 : : #include "tree-hash-traits.h"
55 : : #include "vec-perm-indices.h"
56 : : #include "internal-fn.h"
57 : : #include "gimple-fold.h"
58 : : #include "optabs-query.h"
59 : :
60 : : /* Return true if load- or store-lanes optab OPTAB is implemented for
61 : : COUNT vectors of type VECTYPE. NAME is the name of OPTAB.
62 : :
63 : : If it is implemented and ELSVALS is nonzero, store the possible else
64 : : values in the vector it points to. */
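 : :
 : : /* For example, with 128-bit V4SI vectors and COUNT == 3 this asks whether
 : : the target can access three interleaved vectors in a single operation
 : : (structure load/stores such as AArch64 LD3/ST3); if neither an array
 : : mode nor a wide-enough integer mode exists for 3 * 128 bits, the optab
 : : cannot be used. */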
65 : :
66 : : static bool
67 : 230550 : vect_lanes_optab_supported_p (const char *name, convert_optab optab,
68 : : tree vectype, unsigned HOST_WIDE_INT count,
69 : : vec<int> *elsvals = nullptr)
70 : : {
71 : 230550 : machine_mode mode, array_mode;
72 : 230550 : bool limit_p;
73 : :
74 : 230550 : mode = TYPE_MODE (vectype);
75 : 230550 : if (!targetm.array_mode (mode, count).exists (&array_mode))
76 : : {
77 : 461100 : poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
78 : 230550 : limit_p = !targetm.array_mode_supported_p (mode, count);
79 : 230550 : if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
80 : : {
81 : 211910 : if (dump_enabled_p ())
82 : 13080 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
83 : : "no array mode for %s[%wu]\n",
84 : 13080 : GET_MODE_NAME (mode), count);
85 : 211910 : return false;
86 : : }
87 : : }
88 : :
89 : 18640 : enum insn_code icode;
90 : 18640 : if ((icode = convert_optab_handler (optab, array_mode, mode))
91 : : == CODE_FOR_nothing)
92 : : {
93 : 18640 : if (dump_enabled_p ())
94 : 2502 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
95 : : "cannot use %s<%s><%s>\n", name,
96 : 2502 : GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
97 : 18640 : return false;
98 : : }
99 : :
100 : 0 : if (dump_enabled_p ())
101 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
102 : 0 : "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
103 : 0 : GET_MODE_NAME (mode));
104 : :
105 : 0 : if (elsvals)
106 : 0 : get_supported_else_vals (icode,
107 : 0 : internal_fn_else_index (IFN_MASK_LEN_LOAD_LANES),
108 : : *elsvals);
109 : :
110 : : return true;
111 : : }
112 : :
113 : : /* Helper function to identify a simd clone call. If this is a call to a
114 : : function with simd clones then return the corresponding cgraph_node,
115 : : otherwise return NULL. */
116 : :
117 : : static cgraph_node*
118 : 718951 : simd_clone_call_p (gimple *stmt)
119 : : {
120 : 798567 : gcall *call = dyn_cast <gcall *> (stmt);
121 : 81953 : if (!call)
122 : : return NULL;
123 : :
124 : 81953 : tree fndecl = NULL_TREE;
125 : 81953 : if (gimple_call_internal_p (call, IFN_MASK_CALL))
126 : 349 : fndecl = TREE_OPERAND (gimple_call_arg (stmt, 0), 0);
127 : : else
128 : 81604 : fndecl = gimple_call_fndecl (stmt);
129 : :
130 : 81953 : if (fndecl == NULL_TREE)
131 : : return NULL;
132 : :
133 : 35549 : cgraph_node *node = cgraph_node::get (fndecl);
134 : 35549 : if (node && node->simd_clones != NULL)
135 : : return node;
136 : :
137 : : return NULL;
138 : : }
139 : :
140 : :
141 : :
142 : : /* Return the smallest scalar part of STMT_INFO.
143 : : This is used to determine the vectype of the stmt. We generally set the
144 : : vectype according to the type of the result (lhs). For stmts whose
145 : : result-type is different than the type of the arguments (e.g., demotion,
146 : : promotion), vectype will be reset appropriately (later). Note that we have
147 : : to visit the smallest datatype in this function, because that determines the
148 : : VF. If the smallest datatype in the loop is present only as the rhs of a
149 : : promotion operation - we'd miss it.
150 : : Such a case, where a variable of this datatype does not appear in the lhs
151 : : anywhere in the loop, can only occur if it's an invariant: e.g.:
152 : : 'int_x = (int) short_inv', which we'd expect to have been optimized away by
153 : : invariant motion. However, we cannot rely on invariant motion to always
154 : : take invariants out of the loop, and so in the case of promotion we also
155 : : have to check the rhs.
156 : : LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
157 : : types. */
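 : :
 : : /* For example, for a loop containing
 : : short_x = ...;
 : : int_y = (int) short_x;
 : : the smallest scalar type is 'short', so with 16-byte vectors the VF is
 : : driven by 8 shorts per vector rather than 4 ints. */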
158 : :
159 : : tree
160 : 5878527 : vect_get_smallest_scalar_type (stmt_vec_info stmt_info, tree scalar_type)
161 : : {
162 : 5878527 : HOST_WIDE_INT lhs, rhs;
163 : :
164 : : /* During the analysis phase, this function is called on arbitrary
165 : : statements that might not have scalar results. */
166 : 5878527 : if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
167 : : return scalar_type;
168 : :
169 : 5878527 : lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
170 : :
171 : 5878527 : gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
172 : 5878527 : if (assign)
173 : : {
174 : 5159576 : scalar_type = TREE_TYPE (gimple_assign_lhs (assign));
175 : 5159576 : if (gimple_assign_cast_p (assign)
176 : 4726844 : || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
177 : 4726074 : || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
178 : 4726074 : || gimple_assign_rhs_code (assign) == SAD_EXPR
179 : 4725732 : || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
180 : 4720930 : || gimple_assign_rhs_code (assign) == WIDEN_MULT_PLUS_EXPR
181 : 4720930 : || gimple_assign_rhs_code (assign) == WIDEN_MULT_MINUS_EXPR
182 : 4720930 : || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
183 : 9880506 : || gimple_assign_rhs_code (assign) == FLOAT_EXPR)
184 : : {
185 : 458921 : tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
186 : :
187 : 458921 : rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
188 : 458921 : if (rhs < lhs)
189 : 5878527 : scalar_type = rhs_type;
190 : : }
191 : : }
192 : 718951 : else if (cgraph_node *node = simd_clone_call_p (stmt_info->stmt))
193 : : {
194 : 2337 : auto clone = node->simd_clones->simdclone;
195 : 7528 : for (unsigned int i = 0; i < clone->nargs; ++i)
196 : : {
197 : 5191 : if (clone->args[i].arg_type == SIMD_CLONE_ARG_TYPE_VECTOR)
198 : : {
199 : 3028 : tree arg_scalar_type = TREE_TYPE (clone->args[i].vector_type);
200 : 3028 : rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (arg_scalar_type));
201 : 3028 : if (rhs < lhs)
202 : : {
203 : 5191 : scalar_type = arg_scalar_type;
204 : 5191 : lhs = rhs;
205 : : }
206 : : }
207 : : }
208 : : }
209 : 716614 : else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
210 : : {
211 : 79616 : unsigned int i = 0;
212 : 79616 : if (gimple_call_internal_p (call))
213 : : {
214 : 43157 : internal_fn ifn = gimple_call_internal_fn (call);
215 : 43157 : if (internal_load_fn_p (ifn))
216 : : /* For loads the LHS type does the trick. */
217 : : i = ~0U;
218 : 36874 : else if (internal_store_fn_p (ifn))
219 : : {
220 : : /* For stores use the type of the stored value. */
221 : 3283 : i = internal_fn_stored_value_index (ifn);
222 : 3283 : scalar_type = TREE_TYPE (gimple_call_arg (call, i));
223 : 3283 : i = ~0U;
224 : : }
225 : 33591 : else if (internal_fn_mask_index (ifn) == 0)
226 : 8820 : i = 1;
227 : : }
228 : 79616 : if (i < gimple_call_num_args (call))
229 : : {
230 : 65559 : tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
231 : 65559 : if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
232 : : {
233 : 65559 : rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
234 : 65559 : if (rhs < lhs)
235 : 5878527 : scalar_type = rhs_type;
236 : : }
237 : : }
238 : : }
239 : :
240 : : return scalar_type;
241 : : }
242 : :
243 : :
244 : : /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
245 : : tested at run-time. Return TRUE if DDR was successfully inserted.
246 : : Return false if versioning is not supported. */
247 : :
248 : : static opt_result
249 : 72137 : vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
250 : : {
251 : 72137 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
252 : :
253 : 72137 : if ((unsigned) param_vect_max_version_for_alias_checks == 0)
254 : 54 : return opt_result::failure_at (vect_location,
255 : : "will not create alias checks, as"
256 : : " --param vect-max-version-for-alias-checks"
257 : : " == 0\n");
258 : :
259 : 72083 : opt_result res
260 : 72083 : = runtime_alias_check_p (ddr, loop,
261 : 72083 : optimize_loop_nest_for_speed_p (loop));
262 : 72083 : if (!res)
263 : 128 : return res;
264 : :
265 : 71955 : LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
266 : 71955 : return opt_result::success ();
267 : : }
268 : :
269 : : /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
270 : :
271 : : static void
272 : 1201 : vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
273 : : {
274 : 1201 : const vec<tree> &checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
275 : 1852 : for (unsigned int i = 0; i < checks.length(); ++i)
276 : 657 : if (checks[i] == value)
277 : : return;
278 : :
279 : 1195 : if (dump_enabled_p ())
280 : 425 : dump_printf_loc (MSG_NOTE, vect_location,
281 : : "need run-time check that %T is nonzero\n",
282 : : value);
283 : 1195 : LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
284 : : }
285 : :
286 : : /* Return true if we know that the order of vectorized DR_INFO_A and
287 : : vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
288 : : DR_INFO_B. At least one of the accesses is a write. */
289 : :
290 : : static bool
291 : 101119 : vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
292 : : {
293 : 101119 : stmt_vec_info stmtinfo_a = dr_info_a->stmt;
294 : 101119 : stmt_vec_info stmtinfo_b = dr_info_b->stmt;
295 : :
296 : : /* Single statements are always kept in their original order. */
297 : 101119 : if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
298 : 162149 : && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
299 : : return true;
300 : :
301 : : /* If there is a loop invariant read involved we might vectorize it in
302 : : the prologue, breaking scalar order with respect to the in-loop store. */
303 : 20607 : if ((DR_IS_READ (dr_info_a->dr) && integer_zerop (DR_STEP (dr_info_a->dr)))
304 : 62309 : || (DR_IS_READ (dr_info_b->dr) && integer_zerop (DR_STEP (dr_info_b->dr))))
305 : 1305 : return false;
306 : :
307 : : /* STMT_A and STMT_B belong to overlapping groups. All loads are
308 : : emitted at the position of the first scalar load.
309 : : Stores in a group are emitted at the position of the last scalar store.
310 : : Compute that position and check whether the resulting order matches
311 : : the current one. */
312 : 41390 : stmt_vec_info il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
313 : 41390 : if (il_a)
314 : : {
315 : 39777 : if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
316 : 158726 : for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
317 : 137997 : s = DR_GROUP_NEXT_ELEMENT (s))
318 : 137997 : il_a = get_later_stmt (il_a, s);
319 : : else /* DR_IS_READ */
320 : 78848 : for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
321 : 59800 : s = DR_GROUP_NEXT_ELEMENT (s))
322 : 59800 : if (get_later_stmt (il_a, s) == il_a)
323 : 1850 : il_a = s;
324 : : }
325 : : else
326 : : il_a = stmtinfo_a;
327 : 41390 : stmt_vec_info il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
328 : 41390 : if (il_b)
329 : : {
330 : 37541 : if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
331 : 201291 : for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
332 : 172173 : s = DR_GROUP_NEXT_ELEMENT (s))
333 : 172173 : il_b = get_later_stmt (il_b, s);
334 : : else /* DR_IS_READ */
335 : 39111 : for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
336 : 30688 : s = DR_GROUP_NEXT_ELEMENT (s))
337 : 30688 : if (get_later_stmt (il_b, s) == il_b)
338 : 207 : il_b = s;
339 : : }
340 : : else
341 : : il_b = stmtinfo_b;
342 : 41390 : bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
343 : 41390 : return (get_later_stmt (il_a, il_b) == il_a) == a_after_b;
344 : : }
345 : :
346 : : /* A subroutine of vect_analyze_data_ref_dependence. Handle
347 : : DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
348 : : a DDR_COULD_BE_INDEPENDENT_P relation DDR that has a known set of dependence
349 : : reflect a guaranteed dependence.
350 : :
351 : : Return true if this function does all the work necessary to avoid
352 : : an alias or false if the caller should use the dependence distances
353 : : to limit the vectorization factor in the usual way. LOOP_DEPTH is
354 : : the depth of the loop described by LOOP_VINFO and the other arguments
355 : : are as for vect_analyze_data_ref_dependence. */
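 : :
 : : /* For instance (an illustrative case), for
 : : a->x[i] += b->x[i];
 : : where A and B may or may not point to the same object, the recorded
 : : distances assume that they do; when possible we prefer a runtime alias
 : : check over clamping the VF. */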
356 : :
357 : : static bool
358 : 7142 : vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
359 : : loop_vec_info loop_vinfo,
360 : : int loop_depth, unsigned int *max_vf)
361 : : {
362 : 7142 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
363 : 28586 : for (lambda_vector &dist_v : DDR_DIST_VECTS (ddr))
364 : : {
365 : 14134 : int dist = dist_v[loop_depth];
366 : 14134 : if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
367 : : {
368 : : /* If the user asserted safelen >= DIST consecutive iterations
369 : : can be executed concurrently, assume independence.
370 : :
371 : : ??? An alternative would be to add the alias check even
372 : : in this case, and vectorize the fallback loop with the
373 : : maximum VF set to safelen. However, if the user has
374 : : explicitly given a length, it's less likely that that
375 : : would be a win. */
376 : 7006 : if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
377 : : {
378 : 32 : if ((unsigned int) loop->safelen < *max_vf)
379 : 2 : *max_vf = loop->safelen;
380 : 32 : LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
381 : 32 : continue;
382 : : }
383 : :
384 : : /* For dependence distances of 2 or more, we have the option
385 : : of limiting VF or checking for an alias at runtime.
386 : : Prefer to check at runtime if we can, to avoid limiting
387 : : the VF unnecessarily when the bases are in fact independent.
388 : :
389 : : Note that the alias checks will be removed if the VF ends up
390 : : being small enough. */
391 : 6974 : dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
392 : 6974 : dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
393 : 6974 : return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
394 : 6974 : && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
395 : 13956 : && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
396 : : }
397 : : }
398 : : return true;
399 : : }
400 : :
401 : :
402 : : /* Function vect_analyze_data_ref_dependence.
403 : :
404 : : FIXME: I needed to change the sense of the returned flag.
405 : :
406 : : Return FALSE if there (might) exist a dependence between a memory-reference
407 : : DRA and a memory-reference DRB. When versioning for alias may check a
408 : : dependence at run-time, return TRUE. Adjust *MAX_VF according to
409 : : the data dependence. */
410 : :
411 : : static opt_result
412 : 1771302 : vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
413 : : loop_vec_info loop_vinfo,
414 : : unsigned int *max_vf)
415 : : {
416 : 1771302 : unsigned int i;
417 : 1771302 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
418 : 1771302 : struct data_reference *dra = DDR_A (ddr);
419 : 1771302 : struct data_reference *drb = DDR_B (ddr);
420 : 1771302 : dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
421 : 1771302 : dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
422 : 1771302 : stmt_vec_info stmtinfo_a = dr_info_a->stmt;
423 : 1771302 : stmt_vec_info stmtinfo_b = dr_info_b->stmt;
424 : 1771302 : lambda_vector dist_v;
425 : 1771302 : unsigned int loop_depth;
426 : :
427 : : /* If user asserted safelen consecutive iterations can be
428 : : executed concurrently, assume independence. */
429 : 1851567 : auto apply_safelen = [&]()
430 : : {
431 : 80265 : if (loop->safelen >= 2)
432 : : {
433 : 7454 : if ((unsigned int) loop->safelen < *max_vf)
434 : 1895 : *max_vf = loop->safelen;
435 : 7454 : LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
436 : 7454 : return true;
437 : : }
438 : : return false;
439 : 1771302 : };
440 : :
441 : : /* In loop analysis all data references should be vectorizable. */
442 : 1771302 : if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
443 : 1771302 : || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
444 : 0 : gcc_unreachable ();
445 : :
446 : : /* Independent data accesses. */
447 : 1771302 : if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
448 : 1633359 : return opt_result::success ();
449 : :
450 : 137943 : if (dra == drb
451 : 137943 : || (DR_IS_READ (dra) && DR_IS_READ (drb)))
452 : 0 : return opt_result::success ();
453 : :
454 : : /* We do not have to consider dependences between accesses that belong
455 : : to the same group, unless the stride could be smaller than the
456 : : group size. */
457 : 137943 : if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
458 : 47955 : && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
459 : 47955 : == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
460 : 142774 : && !STMT_VINFO_STRIDED_P (stmtinfo_a))
461 : 137 : return opt_result::success ();
462 : :
463 : : /* Even if we have an anti-dependence then, as the vectorized loop covers at
464 : : least two scalar iterations, there is always also a true dependence.
465 : : As the vectorizer does not re-order loads and stores we can ignore
466 : : the anti-dependence if TBAA can disambiguate both DRs similar to the
467 : : case with known negative distance anti-dependences (positive
468 : : distance anti-dependences would violate TBAA constraints). */
469 : 90292 : if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
470 : 47514 : || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
471 : 242135 : && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
472 : : get_alias_set (DR_REF (drb))))
473 : 3982 : return opt_result::success ();
474 : :
475 : 133824 : if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
476 : 126589 : || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
477 : : {
478 : 9038 : if (apply_safelen ())
479 : 1390 : return opt_result::success ();
480 : :
481 : 7648 : return opt_result::failure_at
482 : 7648 : (stmtinfo_a->stmt,
483 : : "possible alias involving gather/scatter between %T and %T\n",
484 : : DR_REF (dra), DR_REF (drb));
485 : : }
486 : :
487 : : /* Unknown data dependence. */
488 : 124786 : if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
489 : : {
490 : 70728 : if (apply_safelen ())
491 : 6064 : return opt_result::success ();
492 : :
493 : 64664 : if (dump_enabled_p ())
494 : 7006 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
495 : : "versioning for alias required: "
496 : : "can't determine dependence between %T and %T\n",
497 : : DR_REF (dra), DR_REF (drb));
498 : :
499 : : /* Add to list of ddrs that need to be tested at run-time. */
500 : 64664 : return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
501 : : }
502 : :
503 : : /* Known data dependence. */
504 : 54058 : if (DDR_NUM_DIST_VECTS (ddr) == 0)
505 : : {
506 : 499 : if (apply_safelen ())
507 : 0 : return opt_result::success ();
508 : :
509 : 499 : if (dump_enabled_p ())
510 : 114 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
511 : : "versioning for alias required: "
512 : : "bad dist vector for %T and %T\n",
513 : : DR_REF (dra), DR_REF (drb));
514 : : /* Add to list of ddrs that need to be tested at run-time. */
515 : 499 : return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
516 : : }
517 : :
518 : 53559 : loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
519 : :
520 : 53559 : if (DDR_COULD_BE_INDEPENDENT_P (ddr)
521 : 53559 : && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
522 : : loop_depth, max_vf))
523 : 7134 : return opt_result::success ();
524 : :
525 : 87075 : FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
526 : : {
527 : 46445 : int dist = dist_v[loop_depth];
528 : :
529 : 46445 : if (dump_enabled_p ())
530 : 3596 : dump_printf_loc (MSG_NOTE, vect_location,
531 : : "dependence distance = %d.\n", dist);
532 : :
533 : 46445 : if (dist == 0)
534 : : {
535 : 37744 : if (dump_enabled_p ())
536 : 3188 : dump_printf_loc (MSG_NOTE, vect_location,
537 : : "dependence distance == 0 between %T and %T\n",
538 : : DR_REF (dra), DR_REF (drb));
539 : :
540 : : /* When we perform grouped accesses and perform implicit CSE
541 : : by detecting equal accesses and doing disambiguation with
542 : : runtime alias tests like for
543 : : .. = a[i];
544 : : .. = a[i+1];
545 : : a[i] = ..;
546 : : a[i+1] = ..;
547 : : *p = ..;
548 : : .. = a[i];
549 : : .. = a[i+1];
550 : : where we will end up loading { a[i], a[i+1] } once, make
551 : : sure that inserting group loads before the first load and
552 : : stores after the last store will do the right thing.
553 : : Similar for groups like
554 : : a[i] = ...;
555 : : ... = a[i];
556 : : a[i+1] = ...;
557 : : where loads from the group interleave with the store. */
558 : 37744 : if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
559 : 0 : return opt_result::failure_at (stmtinfo_a->stmt,
560 : : "READ_WRITE dependence"
561 : : " in interleaving.\n");
562 : :
563 : 37744 : if (loop->safelen < 2)
564 : : {
565 : 34005 : tree indicator = dr_zero_step_indicator (dra);
566 : 34005 : if (!indicator || integer_zerop (indicator))
567 : 0 : return opt_result::failure_at (stmtinfo_a->stmt,
568 : : "access also has a zero step\n");
569 : 34005 : else if (TREE_CODE (indicator) != INTEGER_CST)
570 : 1201 : vect_check_nonzero_value (loop_vinfo, indicator);
571 : : }
572 : 37744 : continue;
573 : 37744 : }
574 : :
575 : 8701 : if (dist > 0 && DDR_REVERSED_P (ddr))
576 : : {
577 : : /* If DDR_REVERSED_P the order of the data-refs in DDR was
578 : : reversed (to make distance vector positive), and the actual
579 : : distance is negative. */
580 : 2505 : if (dump_enabled_p ())
581 : 105 : dump_printf_loc (MSG_NOTE, vect_location,
582 : : "dependence distance negative.\n");
583 : : /* When doing outer loop vectorization, we need to check if there is
584 : : a backward dependence at the inner loop level if the dependence
585 : : at the outer loop is reversed. See PR81740. */
586 : 2505 : if (nested_in_vect_loop_p (loop, stmtinfo_a)
587 : 2493 : || nested_in_vect_loop_p (loop, stmtinfo_b))
588 : : {
589 : 12 : unsigned inner_depth = index_in_loop_nest (loop->inner->num,
590 : 12 : DDR_LOOP_NEST (ddr));
591 : 12 : if (dist_v[inner_depth] < 0)
592 : 9 : return opt_result::failure_at (stmtinfo_a->stmt,
593 : : "not vectorized, dependence "
594 : : "between data-refs %T and %T\n",
595 : : DR_REF (dra), DR_REF (drb));
596 : : }
597 : : /* Record a negative dependence distance to later limit the
598 : : amount of stmt copying / unrolling we can perform.
599 : : Only need to handle read-after-write dependence. */
600 : 2496 : if (DR_IS_READ (drb)
601 : 76 : && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
602 : 12 : || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
603 : 76 : STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
604 : 2496 : continue;
605 : 2496 : }
606 : :
607 : 6196 : unsigned int abs_dist = abs (dist);
608 : 6196 : if (abs_dist >= 2 && abs_dist < *max_vf)
609 : : {
610 : : /* The dependence distance requires reduction of the maximal
611 : : vectorization factor. */
612 : 301 : *max_vf = abs_dist;
613 : 301 : if (dump_enabled_p ())
614 : 30 : dump_printf_loc (MSG_NOTE, vect_location,
615 : : "adjusting maximal vectorization factor to %i\n",
616 : : *max_vf);
617 : : }
618 : :
619 : 6196 : if (abs_dist >= *max_vf)
620 : : {
621 : : /* Dependence distance does not create dependence, as far as
622 : : vectorization is concerned, in this case. */
623 : 410 : if (dump_enabled_p ())
624 : 57 : dump_printf_loc (MSG_NOTE, vect_location,
625 : : "dependence distance >= VF.\n");
626 : 410 : continue;
627 : : }
628 : :
629 : 5786 : return opt_result::failure_at (stmtinfo_a->stmt,
630 : : "not vectorized, possible dependence "
631 : : "between data-refs %T and %T\n",
632 : : DR_REF (dra), DR_REF (drb));
633 : : }
634 : :
635 : 40630 : return opt_result::success ();
636 : : }
637 : :
638 : : /* Function vect_analyze_early_break_dependences.
639 : :
640 : : Examine all the data references in the loop and make sure that, if we have
641 : : multiple exits, we are able to safely move stores such that they become
642 : : safe for vectorization. The function also calculates the place to move
643 : : the instructions to and computes what the new vUSE chain should be.
644 : :
645 : : This works in tandem with the CFG that will be produced by
646 : : slpeel_tree_duplicate_loop_to_edge_cfg later on.
647 : :
648 : : This function tries to validate whether an early break vectorization
649 : : is possible for the current instruction sequence. Returns True i
650 : : is possible for the current instruction sequence. Returns True if
651 : :
652 : : Requirements:
653 : : - Any memory access must be to a fixed size buffer.
654 : : - There must not be any loads and stores to the same object.
655 : : - Multiple loads are allowed as long as they don't alias.
656 : :
657 : : NOTE:
658 : : This implementation is very conservative. Any overlapping loads/stores
659 : : that take place before the early break statement get rejected, aside from
660 : : WAR dependencies.
661 : :
662 : : i.e.:
663 : :
664 : : a[i] = 8
665 : : c = a[i]
666 : : if (b[i])
667 : : ...
668 : :
669 : : is not allowed, but
670 : :
671 : : c = a[i]
672 : : a[i] = 8
673 : : if (b[i])
674 : : ...
675 : :
676 : : is, which is the common case. */
677 : :
678 : : static opt_result
679 : 102441 : vect_analyze_early_break_dependences (loop_vec_info loop_vinfo)
680 : : {
681 : 102441 : DUMP_VECT_SCOPE ("vect_analyze_early_break_dependences");
682 : :
683 : : /* List of all load data references found during traversal. */
684 : 102441 : auto_vec<data_reference *> bases;
685 : 102441 : basic_block dest_bb = NULL;
686 : :
687 : 102441 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
688 : 102441 : class loop *loop_nest = loop_outer (loop);
689 : :
690 : 102441 : if (dump_enabled_p ())
691 : 1178 : dump_printf_loc (MSG_NOTE, vect_location,
692 : : "loop contains multiple exits, analyzing"
693 : : " statement dependencies.\n");
694 : :
695 : 102441 : if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
696 : 4368 : if (dump_enabled_p ())
697 : 159 : dump_printf_loc (MSG_NOTE, vect_location,
698 : : "alternate exit has been chosen as main exit.\n");
699 : :
700 : : /* Since we don't support general control flow, the location we'll move the
701 : : side-effects to is always the latch connected exit. When we support
702 : : general control flow we can do better but for now this is fine. Move
703 : : side-effects to the in-loop destination of the last early exit. For the
704 : : PEELED case we move the side-effects to the latch block as this is
705 : : guaranteed to be the last block to be executed when a vector iteration
706 : : finished. */
707 : 102441 : if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
708 : 4368 : dest_bb = loop->latch;
709 : : else
710 : 98073 : dest_bb = single_pred (loop->latch);
711 : :
712 : : /* We start looking from dest_bb; for the non-PEELED case we don't want to
713 : : move any stores already present, but we do want to read and validate the
714 : : loads. */
715 : 102441 : basic_block bb = dest_bb;
716 : :
717 : : /* We move stores across all loads to the beginning of dest_bb, so
718 : : the first block processed below doesn't need dependence checking. */
719 : 102441 : bool check_deps = false;
720 : :
721 : 378893 : do
722 : : {
723 : 240667 : gimple_stmt_iterator gsi = gsi_last_bb (bb);
724 : :
725 : : /* Now analyze all the remaining statements and try to determine which
726 : : instructions are allowed/needed to be moved. */
727 : 1676405 : while (!gsi_end_p (gsi))
728 : : {
729 : 1442328 : gimple *stmt = gsi_stmt (gsi);
730 : 1442328 : gsi_prev (&gsi);
731 : 1442328 : if (is_gimple_debug (stmt))
732 : 1249304 : continue;
733 : :
734 : 839970 : stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (stmt);
735 : 839970 : auto dr_ref = STMT_VINFO_DATA_REF (stmt_vinfo);
736 : 839970 : if (!dr_ref)
737 : 638681 : continue;
738 : :
739 : : /* We know everything below dest_bb is safe since we know we
740 : : had a full vector iteration when reaching it. Either by
741 : : the loop entry / IV exit test being last or because this
742 : : is the loop latch itself. */
743 : 201289 : if (!check_deps)
744 : 8265 : continue;
745 : :
746 : : /* Check if vector accesses to the object will be within bounds.
747 : : The access must be to a fixed-size object, or we assume the loop will
748 : : be versioned or its niters bounded by VF so accesses stay within range.
749 : : We only need to check the reads since writes are moved to a safe place
750 : : where, if we get there, we know they are safe to perform. */
751 : 193024 : if (DR_IS_READ (dr_ref)
752 : 193024 : && !ref_within_array_bound (stmt, DR_REF (dr_ref)))
753 : : {
754 : 114822 : if (STMT_VINFO_GATHER_SCATTER_P (stmt_vinfo)
755 : 112842 : || STMT_VINFO_STRIDED_P (stmt_vinfo))
756 : : {
757 : 6563 : const char *msg
758 : : = "early break not supported: cannot peel "
759 : : "for alignment, vectorization would read out of "
760 : : "bounds at %G";
761 : 6563 : return opt_result::failure_at (stmt, msg, stmt);
762 : : }
763 : :
764 : 108259 : dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_vinfo);
765 : 108259 : dr_info->need_peeling_for_alignment = true;
766 : :
767 : 108259 : if (dump_enabled_p ())
768 : 233 : dump_printf_loc (MSG_NOTE, vect_location,
769 : : "marking DR (read) as needing peeling for "
770 : : "alignment at %G", stmt);
771 : : }
772 : :
773 : 186461 : if (DR_IS_READ (dr_ref))
774 : 173466 : bases.safe_push (dr_ref);
775 : 12995 : else if (DR_IS_WRITE (dr_ref))
776 : : {
777 : : /* We are moving writes down in the CFG. To be sure that this
778 : : is valid after vectorization we have to check all the loads
779 : : we are sinking the stores past to see if any of them may
780 : : alias or are the same object.
781 : :
782 : : Same objects will not be an issue because unless the store
783 : : is marked volatile the value can be forwarded. If the
784 : : store is marked volatile we don't vectorize the loop
785 : : anyway.
786 : :
787 : : That leaves the check for aliasing. We don't really need
788 : : to care about the stores aliasing with each other since the
789 : : stores are moved in order so the effects are still observed
790 : : correctly. This leaves the check for WAR dependencies
791 : : which we would be introducing here if the DR can alias.
792 : : The check is quadratic in loads/stores but I have not found
793 : : a better API to do this. I believe all loads and stores
794 : : must be checked. We also must check them when we
795 : : encountered the store, since we don't care about loads past
796 : : the store. */
797 : :
798 : 43748 : for (auto dr_read : bases)
799 : 14360 : if (dr_may_alias_p (dr_ref, dr_read, loop_nest))
800 : : {
801 : 27 : if (dump_enabled_p ())
802 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION,
803 : : vect_location,
804 : : "early breaks not supported: "
805 : : "overlapping loads and stores "
806 : : "found before the break "
807 : : "statement.\n");
808 : :
809 : 27 : return opt_result::failure_at (stmt,
810 : : "can't safely apply code motion to dependencies"
811 : : " to vectorize the early exit. %G may alias with"
812 : : " %G\n", stmt, dr_read->stmt);
813 : : }
814 : : }
815 : :
816 : 372868 : if (gimple_vdef (stmt))
817 : : {
818 : 12968 : if (dump_enabled_p ())
819 : 198 : dump_printf_loc (MSG_NOTE, vect_location,
820 : : "==> recording stmt %G", stmt);
821 : :
822 : 12968 : LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).safe_push (stmt);
823 : : }
824 : 533366 : else if (gimple_vuse (stmt))
825 : : {
826 : 173466 : LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo).safe_insert (0, stmt);
827 : 173466 : if (dump_enabled_p ())
828 : 1764 : dump_printf_loc (MSG_NOTE, vect_location,
829 : : "marked statement for vUSE update: %G", stmt);
830 : : }
831 : : }
832 : :
833 : 234077 : if (!single_pred_p (bb))
834 : : {
835 : 95851 : gcc_assert (bb == loop->header);
836 : 95851 : break;
837 : : }
838 : :
839 : : /* If we possibly sink through a virtual PHI make sure to elide that. */
840 : 138226 : if (gphi *vphi = get_virtual_phi (bb))
841 : 62 : LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).safe_push (vphi);
842 : :
843 : : /* All earlier blocks need dependence checking. */
844 : 138226 : check_deps = true;
845 : 138226 : bb = single_pred (bb);
846 : 138226 : }
847 : : while (1);
848 : :
849 : : /* We don't allow outer -> inner loop transitions which should have been
850 : : trapped already during loop form analysis. */
851 : 95851 : gcc_assert (dest_bb->loop_father == loop);
852 : :
853 : : /* Check that the destination block we picked has only one predecessor. To
854 : : relax this we would have to take special care when moving the statements.
855 : : We don't currently support such control flow; this check also simplifies
856 : : how we handle labels that may be present anywhere in the IL, by ensuring
857 : : that the labels aren't significant for the CFG. */
858 : 95851 : if (!single_pred (dest_bb))
859 : 0 : return opt_result::failure_at (vect_location,
860 : : "chosen loop exit block (BB %d) does not have a "
861 : : "single predecessor which is currently not "
862 : : "supported for early break vectorization.\n",
863 : : dest_bb->index);
864 : :
865 : 95851 : LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo) = dest_bb;
866 : :
867 : 95851 : if (!LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo).is_empty ())
868 : : {
869 : : /* All uses shall be updated to that of the first load. Entries are
870 : : stored in reverse order. */
871 : 89331 : tree vuse = gimple_vuse (LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo).last ());
872 : 261333 : for (auto g : LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo))
873 : : {
874 : 172002 : if (dump_enabled_p ())
875 : 1764 : dump_printf_loc (MSG_NOTE, vect_location,
876 : : "will update use: %T, mem_ref: %G", vuse, g);
877 : : }
878 : : }
879 : :
880 : 95851 : if (dump_enabled_p ())
881 : 1178 : dump_printf_loc (MSG_NOTE, vect_location,
882 : : "recorded statements to be moved to BB %d\n",
883 : 1178 : LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo)->index);
884 : :
885 : 95851 : return opt_result::success ();
886 : 102441 : }
887 : :
888 : : /* Function vect_analyze_data_ref_dependences.
889 : :
890 : : Examine all the data references in the loop, and make sure there do not
891 : : exist any data dependences between them. Set *MAX_VF according to
892 : : the maximum vectorization factor the data dependences allow. */
893 : :
894 : : opt_result
895 : 281165 : vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
896 : : unsigned int *max_vf)
897 : : {
898 : 281165 : unsigned int i;
899 : 281165 : struct data_dependence_relation *ddr;
900 : :
901 : 281165 : DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
902 : :
903 : 281165 : if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
904 : : {
905 : 137375 : LOOP_VINFO_DDRS (loop_vinfo)
906 : 137375 : .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
907 : 137375 : * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
908 : : /* We do not need read-read dependences. */
909 : 274750 : bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
910 : : &LOOP_VINFO_DDRS (loop_vinfo),
911 : 137375 : LOOP_VINFO_LOOP_NEST (loop_vinfo),
912 : : false);
913 : 137375 : gcc_assert (res);
914 : : }
915 : :
916 : 281165 : LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
917 : :
918 : : /* For epilogues we either have no aliases or alias versioning
919 : : was applied to the original loop. Therefore we may just get max_vf
920 : : using the VF of the original loop. */
921 : 281165 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
922 : 24439 : *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
923 : : else
924 : 2014411 : FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
925 : : {
926 : 1771302 : opt_result res
927 : 1771302 : = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
928 : 1771302 : if (!res)
929 : 13617 : return res;
930 : : }
931 : :
932 : : /* If we have early break statements in the loop, check to see if they
933 : : are of a form we can vectorize. */
934 : 267548 : if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
935 : 102441 : return vect_analyze_early_break_dependences (loop_vinfo);
936 : :
937 : 165107 : return opt_result::success ();
938 : : }
939 : :
940 : :
941 : : /* Function vect_slp_analyze_data_ref_dependence.
942 : :
943 : : Return TRUE if there (might) exist a dependence between a memory-reference
944 : : DRA and a memory-reference DRB for VINFO. When versioning for alias
945 : : may check a dependence at run-time, return FALSE. Unlike the loop
946 : : variant, there is no *MAX_VF to adjust here. */
947 : :
948 : : static bool
949 : 6122114 : vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
950 : : struct data_dependence_relation *ddr)
951 : : {
952 : 6122114 : struct data_reference *dra = DDR_A (ddr);
953 : 6122114 : struct data_reference *drb = DDR_B (ddr);
954 : 6122114 : dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
955 : 6122114 : dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
956 : :
957 : : /* We need to check dependences of statements marked as unvectorizable
958 : : as well; they can still prohibit vectorization. */
959 : :
960 : : /* Independent data accesses. */
961 : 6122114 : if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
962 : : return false;
963 : :
964 : 1055416 : if (dra == drb)
965 : : return false;
966 : :
967 : : /* Read-read is OK. */
968 : 6111 : if (DR_IS_READ (dra) && DR_IS_READ (drb))
969 : : return false;
970 : :
971 : : /* If dra and drb are part of the same interleaving chain consider
972 : : them independent. */
973 : 6111 : if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
974 : 6111 : && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
975 : 6111 : == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
976 : : return false;
977 : :
978 : : /* Unknown data dependence. */
979 : 6111 : if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
980 : : {
981 : 6111 : if (dump_enabled_p ())
982 : 6 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
983 : : "can't determine dependence between %T and %T\n",
984 : : DR_REF (dra), DR_REF (drb));
985 : : }
986 : 0 : else if (dump_enabled_p ())
987 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
988 : : "determined dependence between %T and %T\n",
989 : : DR_REF (dra), DR_REF (drb));
990 : :
991 : : return true;
992 : : }
993 : :
994 : :
995 : : /* Analyze dependences involved in the transform of a store SLP NODE. */
996 : :
997 : : static bool
998 : 652688 : vect_slp_analyze_store_dependences (vec_info *vinfo, slp_tree node)
999 : : {
1000 : : /* This walks over all stmts involved in the SLP store done
1001 : : in NODE verifying we can sink them up to the last stmt in the
1002 : : group. */
1003 : 652688 : stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
1004 : 652688 : gcc_assert (DR_IS_WRITE (STMT_VINFO_DATA_REF (last_access_info)));
1005 : :
1006 : 2343174 : for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
1007 : : {
1008 : 1696599 : stmt_vec_info access_info
1009 : 1696599 : = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
1010 : 1696599 : if (access_info == last_access_info)
1011 : 647294 : continue;
1012 : 1049305 : data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
1013 : 1049305 : ao_ref ref;
1014 : 1049305 : bool ref_initialized_p = false;
1015 : 1049305 : for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
1016 : 9900992 : gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
1017 : : {
1018 : 8857800 : gimple *stmt = gsi_stmt (gsi);
1019 : 15538798 : if (! gimple_vuse (stmt))
1020 : 2659456 : continue;
1021 : :
1022 : : /* If we couldn't record a (single) data reference for this
1023 : : stmt we have to resort to the alias oracle. */
1024 : 6198344 : stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
1025 : 6198344 : data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
1026 : 6198344 : if (!dr_b)
1027 : : {
1028 : : /* We are moving a store - this means
1029 : : we cannot use TBAA for disambiguation. */
1030 : 77026 : if (!ref_initialized_p)
1031 : 77026 : ao_ref_init (&ref, DR_REF (dr_a));
1032 : 77026 : if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
1033 : 77026 : || ref_maybe_used_by_stmt_p (stmt, &ref, false))
1034 : 6113 : return false;
1035 : 76985 : continue;
1036 : : }
1037 : :
1038 : 6121318 : gcc_assert (!gimple_visited_p (stmt));
1039 : :
1040 : 6121318 : ddr_p ddr = initialize_data_dependence_relation (dr_a,
1041 : : dr_b, vNULL);
1042 : 6121318 : bool dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
1043 : 6121318 : free_dependence_relation (ddr);
1044 : 6121318 : if (dependent)
1045 : : return false;
1046 : : }
1047 : : }
1048 : : return true;
1049 : : }
1050 : :
1051 : : /* Analyze dependences involved in the transform of a load SLP NODE. STORES
1052 : : contains the vector of scalar stores of this instance if we are
1053 : : disambiguating the loads. */
1054 : :
1055 : : static bool
1056 : 161614 : vect_slp_analyze_load_dependences (vec_info *vinfo, slp_tree node,
1057 : : vec<stmt_vec_info> stores,
1058 : : stmt_vec_info last_store_info)
1059 : : {
1060 : : /* This walks over all stmts involved in the SLP load done
1061 : : in NODE verifying we can hoist them up to the first stmt in the
1062 : : group. */
1063 : 161614 : stmt_vec_info first_access_info = vect_find_first_scalar_stmt_in_slp (node);
1064 : 161614 : gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (first_access_info)));
1065 : :
1066 : 555519 : for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
1067 : : {
1068 : 393954 : if (! SLP_TREE_SCALAR_STMTS (node)[k])
1069 : 165638 : continue;
1070 : 393954 : stmt_vec_info access_info
1071 : 393954 : = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (node)[k]);
1072 : 393954 : if (access_info == first_access_info)
1073 : 165638 : continue;
1074 : 228316 : data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
1075 : 228316 : ao_ref ref;
1076 : 228316 : bool ref_initialized_p = false;
1077 : 228316 : hash_set<stmt_vec_info> grp_visited;
1078 : 228316 : for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
1079 : 4683852 : gsi_stmt (gsi) != first_access_info->stmt; gsi_prev (&gsi))
1080 : : {
1081 : 2227817 : gimple *stmt = gsi_stmt (gsi);
1082 : 3582188 : if (! gimple_vdef (stmt))
1083 : 2142192 : continue;
1084 : :
1085 : 346091 : stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
1086 : :
1087 : : /* If we run into a store of this same instance (we've just
1088 : : marked those) then delay dependence checking until we run
1089 : : into the last store because this is where it will have
1090 : : been sunk to (and we verified that we can do that already). */
1091 : 346091 : if (gimple_visited_p (stmt))
1092 : : {
1093 : 260466 : if (stmt_info != last_store_info)
1094 : 260464 : continue;
1095 : :
1096 : 10 : for (stmt_vec_info &store_info : stores)
1097 : : {
1098 : 4 : data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
1099 : 4 : ddr_p ddr = initialize_data_dependence_relation
1100 : 4 : (dr_a, store_dr, vNULL);
1101 : 4 : bool dependent
1102 : 4 : = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
1103 : 4 : free_dependence_relation (ddr);
1104 : 4 : if (dependent)
1105 : 49 : return false;
1106 : : }
1107 : 2 : continue;
1108 : 2 : }
1109 : :
1110 : 174643 : auto check_hoist = [&] (stmt_vec_info stmt_info) -> bool
1111 : : {
1112 : : /* We are hoisting a load - this means we can use TBAA for
1113 : : disambiguation. */
1114 : 89018 : if (!ref_initialized_p)
1115 : 89018 : ao_ref_init (&ref, DR_REF (dr_a));
1116 : 89018 : if (stmt_may_clobber_ref_p_1 (stmt_info->stmt, &ref, true))
1117 : : {
1118 : : /* If we couldn't record a (single) data reference for this
1119 : : stmt we have to give up now. */
1120 : 802 : data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
1121 : 802 : if (!dr_b)
1122 : : return false;
1123 : 792 : ddr_p ddr = initialize_data_dependence_relation (dr_a,
1124 : : dr_b, vNULL);
1125 : 792 : bool dependent
1126 : 792 : = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
1127 : 792 : free_dependence_relation (ddr);
1128 : 792 : if (dependent)
1129 : : return false;
1130 : : }
1131 : : /* No dependence. */
1132 : : return true;
1133 : 85625 : };
1134 : 85625 : if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1135 : : {
1136 : : /* When we run into a store group we have to honor
1137 : : that earlier stores might be moved here. We don't
1138 : : know exactly which and where to since we lack a
1139 : : back-mapping from DR to SLP node, so assume all
1140 : : earlier stores are sunk here. It's enough to
1141 : : consider the last stmt of a group for this.
1142 : : ??? Both this and the fact that we disregard that
1143 : : the conflicting instance might be removed later
1144 : : is overly conservative. */
1145 : 66586 : if (!grp_visited.add (DR_GROUP_FIRST_ELEMENT (stmt_info)))
1146 : 12590 : for (auto store_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
1147 : 156069 : store_info != NULL;
1148 : 143479 : store_info = DR_GROUP_NEXT_ELEMENT (store_info))
1149 : 143518 : if ((store_info == stmt_info
1150 : 130937 : || get_later_stmt (store_info, stmt_info) == stmt_info)
1151 : 200916 : && !check_hoist (store_info))
1152 : : return false;
1153 : : }
1154 : : else
1155 : : {
1156 : 19039 : if (!check_hoist (stmt_info))
1157 : : return false;
1158 : : }
1159 : : }
1160 : 228316 : }
1161 : : return true;
1162 : : }
1163 : :
1164 : :
1165 : : /* Function vect_slp_analyze_instance_dependence.
1166 : :
1167 : : Examine all the data references in SLP instance INSTANCE, and make sure
1168 : : there do not exist any data dependences that would prevent sinking its
1169 : : stores or hoisting its loads to the vectorized stmt insert locations. */
1170 : :
1171 : : bool
1172 : 763759 : vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance)
1173 : : {
1174 : 763759 : DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
1175 : :
1176 : : /* The stores of this instance are at the root of the SLP tree. */
1177 : 763759 : slp_tree store = NULL;
1178 : 763759 : if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store)
1179 : 652688 : store = SLP_INSTANCE_TREE (instance);
1180 : :
1181 : : /* Verify we can sink stores to the vectorized stmt insert location. */
1182 : 652688 : stmt_vec_info last_store_info = NULL;
1183 : 652688 : if (store)
1184 : : {
1185 : 652688 : if (! vect_slp_analyze_store_dependences (vinfo, store))
1186 : : return false;
1187 : :
1188 : : /* Mark stores in this instance and remember the last one. */
1189 : 646575 : last_store_info = vect_find_last_scalar_stmt_in_slp (store);
1190 : 2336311 : for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
1191 : 1689736 : gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
1192 : : }
1193 : :
1194 : 757646 : bool res = true;
1195 : :
1196 : : /* Verify we can sink loads to the vectorized stmt insert location,
1197 : : special-casing stores of this instance. */
1198 : 1185669 : for (slp_tree &load : SLP_INSTANCE_LOADS (instance))
1199 : 161614 : if (! vect_slp_analyze_load_dependences (vinfo, load,
1200 : : store
1201 : : ? SLP_TREE_SCALAR_STMTS (store)
1202 : : : vNULL, last_store_info))
1203 : : {
1204 : : res = false;
1205 : : break;
1206 : : }
1207 : :
1208 : : /* Unset the visited flag. */
1209 : 757646 : if (store)
1210 : 3100070 : for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
1211 : 1689736 : gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
1212 : :
1213 : : return res;
1214 : : }
1215 : :
1216 : : /* Return the misalignment of DR_INFO accessed in VECTYPE with OFFSET
1217 : : applied. */
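 : :
 : : /* For example, if the first element of the DR group has misalignment 8
 : : against a 16-byte target alignment and this DR's DR_INIT is 4 bytes
 : : further on, the result is (8 + 4 + OFFSET) modulo 16, i.e. 12 for a
 : : zero OFFSET. */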
1218 : :
1219 : : int
1220 : 6455432 : dr_misalignment (dr_vec_info *dr_info, tree vectype, poly_int64 offset)
1221 : : {
1222 : 6455432 : HOST_WIDE_INT diff = 0;
1223 : : /* Alignment is only analyzed for the first element of a DR group,
1224 : : use that but adjust misalignment by the offset of the access. */
1225 : 6455432 : if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt))
1226 : : {
1227 : 3252378 : dr_vec_info *first_dr
1228 : 3252378 : = STMT_VINFO_DR_INFO (DR_GROUP_FIRST_ELEMENT (dr_info->stmt));
1229 : : /* vect_analyze_data_ref_accesses guarantees that DR_INIT are
1230 : : INTEGER_CSTs and the first element in the group has the lowest
1231 : : address. */
1232 : 3252378 : diff = (TREE_INT_CST_LOW (DR_INIT (dr_info->dr))
1233 : 3252378 : - TREE_INT_CST_LOW (DR_INIT (first_dr->dr)));
1234 : 3252378 : gcc_assert (diff >= 0);
1235 : : dr_info = first_dr;
1236 : : }
1237 : :
1238 : 6455432 : int misalign = dr_info->misalignment;
1239 : 6455432 : gcc_assert (misalign != DR_MISALIGNMENT_UNINITIALIZED);
1240 : 6455432 : if (misalign == DR_MISALIGNMENT_UNKNOWN)
1241 : : return misalign;
1242 : :
1243 : : /* If the access is only aligned for a vector type with smaller alignment
1244 : : requirement the access has unknown misalignment. */
1245 : 4325387 : if (maybe_lt (dr_info->target_alignment * BITS_PER_UNIT,
1246 : 4325387 : targetm.vectorize.preferred_vector_alignment (vectype)))
1247 : : return DR_MISALIGNMENT_UNKNOWN;
1248 : :
1249 : : /* Apply the offset from the DR group start and the externally supplied
1250 : : offset which can for example result from a negative stride access. */
1251 : 4325382 : poly_int64 misalignment = misalign + diff + offset;
1252 : :
1253 : : /* Below we reject compile-time non-constant target alignments, but if
1254 : : our misalignment is zero, then we are known to already be aligned
1255 : : w.r.t. any such possible target alignment. */
1256 : 4325382 : if (known_eq (misalignment, 0))
1257 : : return 0;
1258 : :
1259 : 932011 : unsigned HOST_WIDE_INT target_alignment_c;
1260 : 932011 : if (!dr_info->target_alignment.is_constant (&target_alignment_c)
1261 : 932011 : || !known_misalignment (misalignment, target_alignment_c, &misalign))
1262 : : return DR_MISALIGNMENT_UNKNOWN;
1263 : 932011 : return misalign;
1264 : : }
1265 : :
1266 : : /* Record the base alignment guarantee given by DRB, which occurs
1267 : : in STMT_INFO. */
1268 : :
1269 : : static void
1270 : 4164989 : vect_record_base_alignment (vec_info *vinfo, stmt_vec_info stmt_info,
1271 : : innermost_loop_behavior *drb)
1272 : : {
1273 : 4164989 : bool existed;
1274 : 4164989 : std::pair<stmt_vec_info, innermost_loop_behavior *> &entry
1275 : 4164989 : = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
1276 : 4164989 : if (!existed || entry.second->base_alignment < drb->base_alignment)
1277 : : {
1278 : 1295709 : entry = std::make_pair (stmt_info, drb);
1279 : 1295709 : if (dump_enabled_p ())
1280 : 34764 : dump_printf_loc (MSG_NOTE, vect_location,
1281 : : "recording new base alignment for %T\n"
1282 : : " alignment: %d\n"
1283 : : " misalignment: %d\n"
1284 : : " based on: %G",
1285 : : drb->base_address,
1286 : : drb->base_alignment,
1287 : : drb->base_misalignment,
1288 : : stmt_info->stmt);
1289 : : }
1290 : 4164989 : }
1291 : :
1292 : : /* If the region we're going to vectorize is reached, all unconditional
1293 : : data references occur at least once. We can therefore pool the base
1294 : : alignment guarantees from each unconditional reference. Do this by
1295 : : going through all the data references in VINFO and checking whether
1296 : : the containing statement makes the reference unconditionally. If so,
1297 : : record the alignment of the base address in VINFO so that it can be
1298 : : used for all other references with the same base. */
1299 : :
1300 : : void
1301 : 917228 : vect_record_base_alignments (vec_info *vinfo)
1302 : : {
1303 : 917228 : loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1304 : 321862 : class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
1305 : 13658857 : for (data_reference *dr : vinfo->shared->datarefs)
1306 : : {
1307 : 11003399 : dr_vec_info *dr_info = vinfo->lookup_dr (dr);
1308 : 11003399 : stmt_vec_info stmt_info = dr_info->stmt;
1309 : 11003399 : if (!DR_IS_CONDITIONAL_IN_STMT (dr)
1310 : 10996299 : && STMT_VINFO_VECTORIZABLE (stmt_info)
1311 : 4173660 : && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1312 : : {
1313 : 4163504 : vect_record_base_alignment (vinfo, stmt_info, &DR_INNERMOST (dr));
1314 : :
1315 : : /* If DR is nested in the loop that is being vectorized, we can also
1316 : : record the alignment of the base wrt the outer loop. */
1317 : 11728215 : if (loop && nested_in_vect_loop_p (loop, stmt_info))
1318 : 1485 : vect_record_base_alignment
1319 : 1485 : (vinfo, stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
1320 : : }
1321 : : }
1322 : 917228 : }
1323 : :
1324 : : /* Function vect_compute_data_ref_alignment
1325 : :
1326 : : Compute the misalignment of the data reference DR_INFO when vectorizing
1327 : : with VECTYPE.
1328 : :
1329 : : RESULT is non-NULL iff VINFO is a loop_vec_info. In that case, *RESULT will
1330 : : be set appropriately on failure (but is otherwise left unchanged).
1331 : :
1332 : : Output:
1333 : : 1. initialized misalignment info for DR_INFO
1334 : :
1335 : : FOR NOW: No analysis is actually performed. Misalignment is calculated
1336 : : only for trivial cases. TODO. */
1337 : :
1338 : : static void
1339 : 1396010 : vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info,
1340 : : tree vectype, opt_result *result = nullptr)
1341 : : {
1342 : 1396010 : stmt_vec_info stmt_info = dr_info->stmt;
1343 : 1396010 : vec_base_alignments *base_alignments = &vinfo->base_alignments;
1344 : 1396010 : loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1345 : 1396010 : class loop *loop = NULL;
1346 : 1396010 : tree ref = DR_REF (dr_info->dr);
1347 : :
1348 : 1396010 : if (dump_enabled_p ())
1349 : 54243 : dump_printf_loc (MSG_NOTE, vect_location,
1350 : : "vect_compute_data_ref_alignment:\n");
1351 : :
1352 : 1396010 : if (loop_vinfo)
1353 : 604791 : loop = LOOP_VINFO_LOOP (loop_vinfo);
1354 : :
1355 : : /* Initialize misalignment to unknown. */
1356 : 1396010 : SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1357 : :
1358 : 1396010 : if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1359 : : return;
1360 : :
1361 : 1384077 : innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
1362 : 1384077 : bool step_preserves_misalignment_p;
1363 : :
1364 : 1384077 : poly_uint64 vector_alignment
1365 : 1384077 : = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
1366 : : BITS_PER_UNIT);
1367 : :
1368 : : /* If this DR needs peeling for alignment for correctness, we must
1369 : : ensure the target alignment is a constant power-of-two multiple of the
1370 : : amount read per vector iteration (overriding the above hook where
1371 : : necessary). */
1372 : 1384077 : if (dr_info->need_peeling_for_alignment)
1373 : : {
1374 : : /* Vector size in bytes. */
1375 : 107416 : poly_uint64 safe_align = tree_to_poly_uint64 (TYPE_SIZE_UNIT (vectype));
1376 : :
1377 : : /* We can only peel for loops, of course. */
1378 : 107416 : gcc_checking_assert (loop_vinfo);
1379 : :
1380 : : /* Calculate the number of vectors read per vector iteration. If
1381 : : it is a power of two, multiply through to get the required
1382 : : alignment in bytes. Otherwise, fail analysis since alignment
1383 : : peeling wouldn't work in such a case. */
1384 : 107416 : poly_uint64 num_scalars = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1385 : 107416 : if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1386 : 17279 : num_scalars *= DR_GROUP_SIZE (stmt_info);
1387 : :
1388 : 107416 : auto num_vectors = vect_get_num_vectors (num_scalars, vectype);
1389 : 207579 : if (!pow2p_hwi (num_vectors))
1390 : : {
1391 : 7253 : *result = opt_result::failure_at (vect_location,
1392 : : "non-power-of-two num vectors %u "
1393 : : "for DR needing peeling for "
1394 : : "alignment at %G",
1395 : : num_vectors, stmt_info->stmt);
1396 : 7253 : return;
1397 : : }
1398 : :
1399 : 100163 : safe_align *= num_vectors;
1400 : 100163 : if (maybe_gt (safe_align, 4096U))
1401 : : {
1402 : 0 : pretty_printer pp;
1403 : 0 : pp_wide_integer (&pp, safe_align);
1404 : 0 : *result = opt_result::failure_at (vect_location,
1405 : : "alignment required for correctness"
1406 : : " (%s) may exceed page size",
1407 : : pp_formatted_text (&pp));
1408 : 0 : return;
1409 : 0 : }
1410 : :
1411 : 100163 : unsigned HOST_WIDE_INT multiple;
1412 : 100163 : if (!constant_multiple_p (vector_alignment, safe_align, &multiple)
1413 : 182682 : || !pow2p_hwi (multiple))
1414 : : {
1415 : 17644 : if (dump_enabled_p ())
1416 : : {
1417 : 107 : dump_printf_loc (MSG_NOTE, vect_location,
1418 : : "forcing alignment for DR from preferred (");
1419 : 107 : dump_dec (MSG_NOTE, vector_alignment);
1420 : 107 : dump_printf (MSG_NOTE, ") to safe align (");
1421 : 107 : dump_dec (MSG_NOTE, safe_align);
1422 : 107 : dump_printf (MSG_NOTE, ") for stmt: %G", stmt_info->stmt);
1423 : : }
1424 : 17644 : vector_alignment = safe_align;
1425 : : }
1426 : : }
1427 : :
1428 : 1376824 : SET_DR_TARGET_ALIGNMENT (dr_info, vector_alignment);
1429 : :
1430 : : /* If the main loop has peeled for alignment we have no way of knowing
1431 : : whether the data accesses in the epilogues are aligned. We can't at
1432 : : compile time answer the question whether we have entered the main loop or
1433 : : not. Fixes PR 92351. */
1434 : 1376824 : if (loop_vinfo)
1435 : : {
1436 : 585605 : loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1437 : 585605 : if (orig_loop_vinfo
1438 : 57847 : && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
1439 : : return;
1440 : : }
1441 : :
1442 : 1376369 : unsigned HOST_WIDE_INT vect_align_c;
1443 : 1376369 : if (!vector_alignment.is_constant (&vect_align_c))
1444 : : return;
1445 : :
1446 : : /* No step for BB vectorization. */
1447 : 1376369 : if (!loop)
1448 : : {
1449 : 791219 : gcc_assert (integer_zerop (drb->step));
1450 : : step_preserves_misalignment_p = true;
1451 : : }
1452 : :
1453 : : else
1454 : : {
1455 : : /* We can only use base and misalignment information relative to
1456 : : an innermost loop if the misalignment stays the same throughout the
1457 : : execution of the loop. As above, this is the case if the stride of
1458 : :      the dataref is evenly divisible by the alignment.  */
1459 : 585150 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1460 : 585150 : step_preserves_misalignment_p
1461 : 585150 : = multiple_p (drb->step_alignment * vf, vect_align_c);
1462 : :
1463 : 585150 : if (!step_preserves_misalignment_p && dump_enabled_p ())
1464 : 293 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1465 : : "step doesn't divide the vector alignment.\n");
1466 : :
1467 : : /* In case the dataref is in an inner-loop of the loop that is being
1468 : : vectorized (LOOP), we use the base and misalignment information
1469 : : relative to the outer-loop (LOOP). This is ok only if the
1470 : : misalignment stays the same throughout the execution of the
1471 : : inner-loop, which is why we have to check that the stride of the
1472 : :	 dataref in the inner-loop is evenly divisible by the vector alignment.  */
1473 : 585150 : if (step_preserves_misalignment_p
1474 : 585150 : && nested_in_vect_loop_p (loop, stmt_info))
1475 : : {
1476 : 1484 : step_preserves_misalignment_p
1477 : 1484 : = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
1478 : :
1479 : 1484 : if (dump_enabled_p ())
1480 : : {
1481 : 517 : if (step_preserves_misalignment_p)
1482 : 371 : dump_printf_loc (MSG_NOTE, vect_location,
1483 : : "inner step divides the vector alignment.\n");
1484 : : else
1485 : 146 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1486 : : "inner step doesn't divide the vector"
1487 : : " alignment.\n");
1488 : : }
1489 : : }
1490 : : }
1491 : :
1492 : 1376369 : unsigned int base_alignment = drb->base_alignment;
1493 : 1376369 : unsigned int base_misalignment = drb->base_misalignment;
1494 : :
1495 : : /* Calculate the maximum of the pooled base address alignment and the
1496 : : alignment that we can compute for DR itself. */
1497 : 1376369 : std::pair<stmt_vec_info, innermost_loop_behavior *> *entry
1498 : 1376369 : = base_alignments->get (drb->base_address);
1499 : 1376369 : if (entry
1500 : 1372637 : && base_alignment < (*entry).second->base_alignment
1501 : 1377257 : && (loop_vinfo
1502 : 734 : || (dominated_by_p (CDI_DOMINATORS, gimple_bb (stmt_info->stmt),
1503 : 734 : gimple_bb (entry->first->stmt))
1504 : 613 : && (gimple_bb (stmt_info->stmt) != gimple_bb (entry->first->stmt)
1505 : 453 : || (entry->first->dr_aux.group <= dr_info->group)))))
1506 : : {
1507 : 750 : base_alignment = entry->second->base_alignment;
1508 : 750 : base_misalignment = entry->second->base_misalignment;
1509 : : }
1510 : :
1511 : 1376369 : if (drb->offset_alignment < vect_align_c
1512 : 1309828 : || !step_preserves_misalignment_p
1513 : : /* We need to know whether the step wrt the vectorized loop is
1514 : : negative when computing the starting misalignment below. */
1515 : 1305184 : || TREE_CODE (drb->step) != INTEGER_CST)
1516 : : {
1517 : 94434 : if (dump_enabled_p ())
1518 : 4070 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1519 : : "Unknown alignment for access: %T\n", ref);
1520 : 94434 : return;
1521 : : }
1522 : :
1523 : 1281935 : if (base_alignment < vect_align_c)
1524 : : {
1525 : 585819 : unsigned int max_alignment;
1526 : 585819 : tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1527 : 585819 : if (max_alignment < vect_align_c
1528 : 585819 : || !vect_can_force_dr_alignment_p (base,
1529 : 585623 : vect_align_c * BITS_PER_UNIT))
1530 : : {
1531 : 402344 : if (dump_enabled_p ())
1532 : 15463 : dump_printf_loc (MSG_NOTE, vect_location,
1533 : : "can't force alignment of ref: %T\n", ref);
1534 : 402344 : return;
1535 : : }
1536 : :
1537 : : /* Force the alignment of the decl.
1538 : : NOTE: This is the only change to the code we make during
1539 : : the analysis phase, before deciding to vectorize the loop. */
1540 : 183475 : if (dump_enabled_p ())
1541 : 7857 : dump_printf_loc (MSG_NOTE, vect_location,
1542 : : "force alignment of %T\n", ref);
1543 : :
1544 : 183475 : dr_info->base_decl = base;
1545 : 183475 : dr_info->base_misaligned = true;
1546 : 183475 : base_misalignment = 0;
1547 : : }
1548 : 879591 : poly_int64 misalignment
1549 : 879591 : = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1550 : :
1551 : 879591 : unsigned int const_misalignment;
1552 : 879591 : if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1553 : : {
1554 : : if (dump_enabled_p ())
1555 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1556 : : "Non-constant misalignment for access: %T\n", ref);
1557 : : return;
1558 : : }
1559 : :
1560 : 879591 : SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1561 : :
1562 : 879591 : if (dump_enabled_p ())
1563 : 33193 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1564 : : "misalign = %d bytes of ref %T\n",
1565 : : const_misalignment, ref);
1566 : :
1567 : : return;
1568 : : }
1569 : :
1570 : : /* Return whether DR_INFO, which is related to DR_PEEL_INFO in
1571 : : that it only differs in DR_INIT, is aligned if DR_PEEL_INFO
1572 : : is made aligned via peeling. */
1573 : :
1574 : : static bool
1575 : 3104158 : vect_dr_aligned_if_related_peeled_dr_is (dr_vec_info *dr_info,
1576 : : dr_vec_info *dr_peel_info)
1577 : : {
1578 : 3104158 : if (multiple_p (DR_TARGET_ALIGNMENT (dr_peel_info),
1579 : 3104627 : DR_TARGET_ALIGNMENT (dr_info)))
1580 : : {
1581 : 3103689 : poly_offset_int diff
1582 : 3103689 : = (wi::to_poly_offset (DR_INIT (dr_peel_info->dr))
1583 : 3103689 : - wi::to_poly_offset (DR_INIT (dr_info->dr)));
1584 : 3103689 : if (known_eq (diff, 0)
1585 : 3103689 : || multiple_p (diff, DR_TARGET_ALIGNMENT (dr_info)))
1586 : 1405180 : return true;
1587 : : }
1588 : : return false;
1589 : : }
1590 : :
1591 : : /* Return whether DR_INFO is aligned if DR_PEEL_INFO is made
1592 : : aligned via peeling. */
1593 : :
1594 : : static bool
1595 : 140121 : vect_dr_aligned_if_peeled_dr_is (dr_vec_info *dr_info,
1596 : : dr_vec_info *dr_peel_info)
1597 : : {
1598 : 140121 : if (!operand_equal_p (DR_BASE_ADDRESS (dr_info->dr),
1599 : 140121 : DR_BASE_ADDRESS (dr_peel_info->dr), 0)
1600 : 46515 : || !operand_equal_p (DR_OFFSET (dr_info->dr),
1601 : 46515 : DR_OFFSET (dr_peel_info->dr), 0)
1602 : 185836 : || !operand_equal_p (DR_STEP (dr_info->dr),
1603 : 45715 : DR_STEP (dr_peel_info->dr), 0))
1604 : 94749 : return false;
1605 : :
1606 : 45372 : return vect_dr_aligned_if_related_peeled_dr_is (dr_info, dr_peel_info);
1607 : : }
1608 : :
1609 : : /* Compute the value for dr_info->misalign so that the access appears
1610 : : aligned. This is used by peeling to compensate for dr_misalignment
1611 : : applying the offset for negative step. */
1612 : :
1613 : : int
1614 : 13644 : vect_dr_misalign_for_aligned_access (dr_vec_info *dr_info)
1615 : : {
1616 : 13644 : if (tree_int_cst_sgn (DR_STEP (dr_info->dr)) >= 0)
1617 : : return 0;
1618 : :
1619 : 124 : tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1620 : 124 : poly_int64 misalignment
1621 : 124 : = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1622 : 124 : * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1623 : :
1624 : 124 : unsigned HOST_WIDE_INT target_alignment_c;
1625 : 124 : int misalign;
1626 : 124 : if (!dr_info->target_alignment.is_constant (&target_alignment_c)
1627 : 124 : || !known_misalignment (misalignment, target_alignment_c, &misalign))
1628 : : return DR_MISALIGNMENT_UNKNOWN;
1629 : 124 : return misalign;
1630 : : }
1631 : :
1632 : : /* Function vect_update_misalignment_for_peel.
1633 : : Sets DR_INFO's misalignment
1634 : : - to 0 if it has the same alignment as DR_PEEL_INFO,
1635 : :    - to the misalignment computed using NPEEL if DR_INFO's misalignment is known,
1636 : : - to -1 (unknown) otherwise.
1637 : :
1638 : : DR_INFO - the data reference whose misalignment is to be adjusted.
1639 : : DR_PEEL_INFO - the data reference whose misalignment is being made
1640 : : zero in the vector loop by the peel.
1641 : : NPEEL - the number of iterations in the peel loop if the misalignment
1642 : : of DR_PEEL_INFO is known at compile time. */
1643 : :
1644 : : static void
1645 : 1606 : vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1646 : : dr_vec_info *dr_peel_info, int npeel)
1647 : : {
1648 : :   /* If dr_info is aligned whenever dr_peel_info is, then mark it so.  */
1649 : 1606 : if (vect_dr_aligned_if_peeled_dr_is (dr_info, dr_peel_info))
1650 : : {
1651 : 403 : SET_DR_MISALIGNMENT (dr_info,
1652 : : vect_dr_misalign_for_aligned_access (dr_peel_info));
1653 : 403 : return;
1654 : : }
1655 : :
1656 : 1203 : unsigned HOST_WIDE_INT alignment;
1657 : 1203 : if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1658 : 1203 : && known_alignment_for_access_p (dr_info,
1659 : 1203 : STMT_VINFO_VECTYPE (dr_info->stmt))
1660 : 958 : && known_alignment_for_access_p (dr_peel_info,
1661 : 958 : STMT_VINFO_VECTYPE (dr_peel_info->stmt)))
1662 : : {
1663 : 180 : int misal = dr_info->misalignment;
1664 : 180 : misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1665 : 180 : misal &= alignment - 1;
1666 : 180 : set_dr_misalignment (dr_info, misal);
1667 : 180 : return;
1668 : : }
1669 : :
1670 : 1023 : if (dump_enabled_p ())
1671 : 25 : dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1672 : : "to unknown (-1).\n");
1673 : 1023 : SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1674 : : }
1675 : :
1676 : : /* Return true if alignment is relevant for DR_INFO. */
1677 : :
1678 : : static bool
1679 : 1284690 : vect_relevant_for_alignment_p (dr_vec_info *dr_info)
1680 : : {
1681 : 1284690 : stmt_vec_info stmt_info = dr_info->stmt;
1682 : :
1683 : 1284690 : if (!STMT_VINFO_RELEVANT_P (stmt_info))
1684 : : return false;
1685 : :
1686 : : /* For interleaving, only the alignment of the first access matters. */
1687 : 1283954 : if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1688 : 1468531 : && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1689 : : return false;
1690 : :
1691 : : /* Scatter-gather and invariant accesses continue to address individual
1692 : : scalars, so vector-level alignment is irrelevant. */
1693 : 1195505 : if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1694 : 1195505 : || integer_zerop (DR_STEP (dr_info->dr)))
1695 : 27803 : return false;
1696 : :
1697 : : /* Strided accesses perform only component accesses, alignment is
1698 : : irrelevant for them. */
1699 : 1167702 : if (STMT_VINFO_STRIDED_P (stmt_info)
1700 : 1167702 : && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1701 : : return false;
1702 : :
1703 : : return true;
1704 : : }
1705 : :
1706 : : /* Given a memory reference EXP return whether its alignment is less
1707 : : than its size. */
1708 : :
1709 : : static bool
1710 : 1172179 : not_size_aligned (tree exp)
1711 : : {
1712 : 1172179 : if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1713 : : return true;
1714 : :
1715 : 1172179 : return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1716 : 1172179 : > get_object_alignment (exp));
1717 : : }
1718 : :
1719 : : /* Function vector_alignment_reachable_p
1720 : :
1721 : : Return true if vector alignment for DR_INFO is reachable by peeling
1722 : : a few loop iterations. Return false otherwise. */
1723 : :
1724 : : static bool
1725 : 426641 : vector_alignment_reachable_p (dr_vec_info *dr_info)
1726 : : {
1727 : 426641 : stmt_vec_info stmt_info = dr_info->stmt;
1728 : 426641 : tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1729 : :
1730 : 426641 : if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1731 : : {
1732 : :       /* For interleaved access we peel only if the number of iterations in
1733 : :	 the prolog loop ({VF - misalignment}) is a multiple of the
1734 : :	 number of interleaved accesses.  */
1735 : 51799 : int elem_size, mis_in_elements;
1736 : :
1737 : : /* FORNOW: handle only known alignment. */
1738 : 51799 : if (!known_alignment_for_access_p (dr_info, vectype))
1739 : 41375 : return false;
1740 : :
1741 : 30719 : poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1742 : 61438 : poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1743 : 30719 : elem_size = vector_element_size (vector_size, nelements);
1744 : 30719 : mis_in_elements = dr_misalignment (dr_info, vectype) / elem_size;
1745 : :
1746 : 41143 : if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1747 : : return false;
1748 : : }
1749 : :
1750 : :   /* If misalignment is known at compile time then allow peeling
1751 : : only if natural alignment is reachable through peeling. */
1752 : 385266 : if (known_alignment_for_access_p (dr_info, vectype)
1753 : 602394 : && !aligned_access_p (dr_info, vectype))
1754 : : {
1755 : 11134 : HOST_WIDE_INT elmsize =
1756 : 11134 : int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1757 : 11134 : if (dump_enabled_p ())
1758 : : {
1759 : 731 : dump_printf_loc (MSG_NOTE, vect_location,
1760 : : "data size = %wd. misalignment = %d.\n", elmsize,
1761 : 731 : dr_misalignment (dr_info, vectype));
1762 : : }
1763 : 11134 : if (dr_misalignment (dr_info, vectype) % elmsize)
1764 : : {
1765 : 34 : if (dump_enabled_p ())
1766 : 7 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1767 : : "data size does not divide the misalignment.\n");
1768 : 34 : return false;
1769 : : }
1770 : : }
1771 : :
1772 : 385232 : if (!known_alignment_for_access_p (dr_info, vectype))
1773 : : {
1774 : 168138 : tree type = TREE_TYPE (DR_REF (dr_info->dr));
1775 : 168138 : bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1776 : 168138 : if (dump_enabled_p ())
1777 : 14009 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1778 : : "Unknown misalignment, %snaturally aligned\n",
1779 : : is_packed ? "not " : "");
1780 : 168138 : return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1781 : : }
1782 : :
1783 : : return true;
1784 : : }
1785 : :
1786 : :
1787 : : /* Calculate the cost of the memory access represented by DR_INFO. */
1788 : :
1789 : : static void
1790 : 513116 : vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
1791 : : dr_alignment_support alignment_support_scheme,
1792 : : int misalignment,
1793 : : unsigned int *inside_cost,
1794 : : unsigned int *outside_cost,
1795 : : stmt_vector_for_cost *body_cost_vec,
1796 : : stmt_vector_for_cost *prologue_cost_vec)
1797 : : {
1798 : 513116 : stmt_vec_info stmt_info = dr_info->stmt;
1799 : 513116 : loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
1800 : 513116 : int ncopies;
1801 : :
1802 : 513116 : if (PURE_SLP_STMT (stmt_info))
1803 : : ncopies = 1;
1804 : : else
1805 : 15218 : ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1806 : :
1807 : 513116 : if (DR_IS_READ (dr_info->dr))
1808 : 357952 : vect_get_load_cost (vinfo, stmt_info, NULL, ncopies,
1809 : : alignment_support_scheme, misalignment, true,
1810 : : inside_cost, outside_cost, prologue_cost_vec,
1811 : : body_cost_vec, false);
1812 : : else
1813 : 155164 :     vect_get_store_cost (vinfo, stmt_info, NULL, ncopies,
1814 : : alignment_support_scheme, misalignment, inside_cost,
1815 : : body_cost_vec);
1816 : :
1817 : 513116 : if (dump_enabled_p ())
1818 : 26598 : dump_printf_loc (MSG_NOTE, vect_location,
1819 : : "vect_get_data_access_cost: inside_cost = %d, "
1820 : : "outside_cost = %d.\n", *inside_cost, *outside_cost);
1821 : 513116 : }
1822 : :
1823 : :
1824 : : typedef struct _vect_peel_info
1825 : : {
1826 : : dr_vec_info *dr_info;
1827 : : int npeel;
1828 : : unsigned int count;
1829 : : } *vect_peel_info;
1830 : :
1831 : : typedef struct _vect_peel_extended_info
1832 : : {
1833 : : vec_info *vinfo;
1834 : : struct _vect_peel_info peel_info;
1835 : : unsigned int inside_cost;
1836 : : unsigned int outside_cost;
1837 : : } *vect_peel_extended_info;
1838 : :
1839 : :
1840 : : /* Peeling hashtable helpers. */
1841 : :
1842 : : struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1843 : : {
1844 : : static inline hashval_t hash (const _vect_peel_info *);
1845 : : static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1846 : : };
1847 : :
1848 : : inline hashval_t
1849 : 656834 : peel_info_hasher::hash (const _vect_peel_info *peel_info)
1850 : : {
1851 : 656834 : return (hashval_t) peel_info->npeel;
1852 : : }
1853 : :
1854 : : inline bool
1855 : 376916 : peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1856 : : {
1857 : 376916 : return (a->npeel == b->npeel);
1858 : : }
1859 : :
1860 : :
1861 : : /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1862 : :
1863 : : static void
1864 : 280694 : vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1865 : : loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1866 : : int npeel, bool supportable_if_not_aligned)
1867 : : {
1868 : 280694 : struct _vect_peel_info elem, *slot;
1869 : 280694 : _vect_peel_info **new_slot;
1870 : :
1871 : 280694 : elem.npeel = npeel;
1872 : 280694 : slot = peeling_htab->find (&elem);
1873 : 280694 : if (slot)
1874 : 121379 : slot->count++;
1875 : : else
1876 : : {
1877 : 159315 : slot = XNEW (struct _vect_peel_info);
1878 : 159315 : slot->npeel = npeel;
1879 : 159315 : slot->dr_info = dr_info;
1880 : 159315 : slot->count = 1;
1881 : 159315 : new_slot = peeling_htab->find_slot (slot, INSERT);
1882 : 159315 : *new_slot = slot;
1883 : : }
1884 : :
1885 : : /* If this DR is not supported with unknown misalignment then bias
1886 : : this slot when the cost model is disabled. */
1887 : 280694 : if (!supportable_if_not_aligned
1888 : 280694 : && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1889 : 65 : slot->count += VECT_MAX_COST;
1890 : 280694 : }
1891 : :
1892 : :
1893 : : /* Traverse peeling hash table to find peeling option that aligns maximum
1894 : : number of data accesses. */
1895 : :
1896 : : int
1897 : 38128 : vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1898 : : _vect_peel_extended_info *max)
1899 : : {
1900 : 38128 : vect_peel_info elem = *slot;
1901 : :
1902 : 38128 : if (elem->count > max->peel_info.count
1903 : 23605 : || (elem->count == max->peel_info.count
1904 : 18617 : && max->peel_info.npeel > elem->npeel))
1905 : : {
1906 : 14535 : max->peel_info.npeel = elem->npeel;
1907 : 14535 : max->peel_info.count = elem->count;
1908 : 14535 : max->peel_info.dr_info = elem->dr_info;
1909 : : }
1910 : :
1911 : 38128 : return 1;
1912 : : }
1913 : :
1914 : : /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1915 : : data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1916 : : npeel is computed at runtime but DR0_INFO's misalignment will be zero
1917 : : after peeling. */
1918 : :
1919 : : static void
1920 : 285920 : vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1921 : : dr_vec_info *dr0_info,
1922 : : unsigned int *inside_cost,
1923 : : unsigned int *outside_cost,
1924 : : stmt_vector_for_cost *body_cost_vec,
1925 : : stmt_vector_for_cost *prologue_cost_vec,
1926 : : unsigned int npeel)
1927 : : {
1928 : 285920 : vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1929 : :
1930 : 285920 : bool dr0_alignment_known_p
1931 : : = (dr0_info
1932 : 522488 : && known_alignment_for_access_p (dr0_info,
1933 : 236568 : STMT_VINFO_VECTYPE (dr0_info->stmt)));
1934 : :
1935 : 1399833 : for (data_reference *dr : datarefs)
1936 : : {
1937 : 542073 : dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1938 : 542073 : if (!vect_relevant_for_alignment_p (dr_info))
1939 : 28957 : continue;
1940 : :
1941 : 513116 : tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
1942 : 513116 : dr_alignment_support alignment_support_scheme;
1943 : 513116 : int misalignment;
1944 : 513116 : unsigned HOST_WIDE_INT alignment;
1945 : :
1946 : 513116 : bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1947 : 513116 : size_zero_node) < 0;
1948 : 513116 : poly_int64 off = 0;
1949 : 513116 : if (negative)
1950 : 18913 : off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1951 : 18913 : * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1952 : :
1953 : 513116 : if (npeel == 0)
1954 : 259468 : misalignment = dr_misalignment (dr_info, vectype, off);
1955 : 253648 : else if (dr_info == dr0_info
1956 : 253648 : || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
1957 : : misalignment = 0;
1958 : 80754 : else if (!dr0_alignment_known_p
1959 : 6551 : || !known_alignment_for_access_p (dr_info, vectype)
1960 : 87305 : || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
1961 : : misalignment = DR_MISALIGNMENT_UNKNOWN;
1962 : : else
1963 : : {
1964 : 5645 : misalignment = dr_misalignment (dr_info, vectype, off);
1965 : 5645 : misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1966 : 5645 : misalignment &= alignment - 1;
1967 : : }
1968 : 513116 : alignment_support_scheme
1969 : 513116 : = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
1970 : : misalignment);
1971 : :
1972 : 513116 : vect_get_data_access_cost (loop_vinfo, dr_info,
1973 : : alignment_support_scheme, misalignment,
1974 : : inside_cost, outside_cost,
1975 : : body_cost_vec, prologue_cost_vec);
1976 : : }
1977 : 285920 : }
1978 : :
1979 : : /* Traverse peeling hash table and calculate cost for each peeling option.
1980 : : Find the one with the lowest cost. */
1981 : :
1982 : : int
1983 : 108097 : vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1984 : : _vect_peel_extended_info *min)
1985 : : {
1986 : 108097 : vect_peel_info elem = *slot;
1987 : 108097 : int dummy;
1988 : 108097 : unsigned int inside_cost = 0, outside_cost = 0;
1989 : 108097 : loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (min->vinfo);
1990 : 108097 : stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1991 : : epilogue_cost_vec;
1992 : :
1993 : 108097 : prologue_cost_vec.create (2);
1994 : 108097 : body_cost_vec.create (2);
1995 : 108097 : epilogue_cost_vec.create (2);
1996 : :
1997 : 108097 : vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1998 : : &outside_cost, &body_cost_vec,
1999 : 108097 : &prologue_cost_vec, elem->npeel);
2000 : :
2001 : 108097 : body_cost_vec.release ();
2002 : :
2003 : 216194 : outside_cost += vect_get_known_peeling_cost
2004 : 108097 : (loop_vinfo, elem->npeel, &dummy,
2005 : : &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2006 : : &prologue_cost_vec, &epilogue_cost_vec);
2007 : :
2008 : : /* Prologue and epilogue costs are added to the target model later.
2009 : : These costs depend only on the scalar iteration cost, the
2010 : : number of peeling iterations finally chosen, and the number of
2011 : : misaligned statements. So discard the information found here. */
2012 : 108097 : prologue_cost_vec.release ();
2013 : 108097 : epilogue_cost_vec.release ();
2014 : :
2015 : 108097 : if (inside_cost < min->inside_cost
2016 : 1859 : || (inside_cost == min->inside_cost
2017 : 1791 : && outside_cost < min->outside_cost))
2018 : : {
2019 : 106280 : min->inside_cost = inside_cost;
2020 : 106280 : min->outside_cost = outside_cost;
2021 : 106280 : min->peel_info.dr_info = elem->dr_info;
2022 : 106280 : min->peel_info.npeel = elem->npeel;
2023 : 106280 : min->peel_info.count = elem->count;
2024 : : }
2025 : :
2026 : 108097 : return 1;
2027 : : }
2028 : :
2029 : :
2030 : : /* Choose best peeling option by traversing peeling hash table and either
2031 : : choosing an option with the lowest cost (if cost model is enabled) or the
2032 : : option that aligns as many accesses as possible. */
2033 : :
2034 : : static struct _vect_peel_extended_info
2035 : 120277 : vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
2036 : : loop_vec_info loop_vinfo)
2037 : : {
2038 : 120277 : struct _vect_peel_extended_info res;
2039 : :
2040 : 120277 : res.peel_info.dr_info = NULL;
2041 : 120277 : res.vinfo = loop_vinfo;
2042 : :
2043 : 120277 : if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
2044 : : {
2045 : 105806 : res.inside_cost = INT_MAX;
2046 : 105806 : res.outside_cost = INT_MAX;
2047 : 105806 : peeling_htab->traverse <_vect_peel_extended_info *,
2048 : 213903 : vect_peeling_hash_get_lowest_cost> (&res);
2049 : : }
2050 : : else
2051 : : {
2052 : 14471 : res.peel_info.count = 0;
2053 : 14471 : peeling_htab->traverse <_vect_peel_extended_info *,
2054 : 52599 : vect_peeling_hash_get_most_frequent> (&res);
2055 : 14471 : res.inside_cost = 0;
2056 : 14471 : res.outside_cost = 0;
2057 : : }
2058 : :
2059 : 120277 : return res;
2060 : : }
2061 : :
2062 : : /* Return true if the new peeling NPEEL is supported. */
2063 : :
2064 : : static bool
2065 : 44231 : vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
2066 : : unsigned npeel)
2067 : : {
2068 : 44231 : vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2069 : 44231 : enum dr_alignment_support supportable_dr_alignment;
2070 : :
2071 : 44231 : bool dr0_alignment_known_p
2072 : 88462 : = known_alignment_for_access_p (dr0_info,
2073 : 44231 : STMT_VINFO_VECTYPE (dr0_info->stmt));
2074 : :
2075 : : /* Ensure that all data refs can be vectorized after the peel. */
2076 : 172857 : for (data_reference *dr : datarefs)
2077 : : {
2078 : 56397 : if (dr == dr0_info->dr)
2079 : 35472 : continue;
2080 : :
2081 : 20925 : dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2082 : 20925 : if (!vect_relevant_for_alignment_p (dr_info)
2083 : 20925 : || vect_dr_aligned_if_peeled_dr_is (dr_info, dr0_info))
2084 : 1322 : continue;
2085 : :
2086 : 19603 : tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2087 : 19603 : int misalignment;
2088 : 19603 : unsigned HOST_WIDE_INT alignment;
2089 : 19603 : if (!dr0_alignment_known_p
2090 : 661 : || !known_alignment_for_access_p (dr_info, vectype)
2091 : 20264 : || !DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment))
2092 : : misalignment = DR_MISALIGNMENT_UNKNOWN;
2093 : : else
2094 : : {
2095 : 647 : misalignment = dr_misalignment (dr_info, vectype);
2096 : 647 : misalignment += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
2097 : 647 : misalignment &= alignment - 1;
2098 : : }
2099 : 19603 : supportable_dr_alignment
2100 : 19603 : = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2101 : : misalignment);
2102 : 19603 : if (supportable_dr_alignment == dr_unaligned_unsupported)
2103 : 44231 : return false;
2104 : : }
2105 : :
2106 : : return true;
2107 : : }
2108 : :
2109 : : /* Compare two data-references DRA and DRB to group them into chunks
2110 : : with related alignment. */
2111 : :
2112 : : static int
2113 : 3140020 : dr_align_group_sort_cmp (const void *dra_, const void *drb_)
2114 : : {
2115 : 3140020 : data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2116 : 3140020 : data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2117 : 3140020 : int cmp;
2118 : :
2119 : : /* Stabilize sort. */
2120 : 3140020 : if (dra == drb)
2121 : : return 0;
2122 : :
2123 : : /* Ordering of DRs according to base. */
2124 : 3140020 : cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2125 : : DR_BASE_ADDRESS (drb));
2126 : 3140020 : if (cmp != 0)
2127 : : return cmp;
2128 : :
2129 : : /* And according to DR_OFFSET. */
2130 : 1543602 : cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2131 : 1543602 : if (cmp != 0)
2132 : : return cmp;
2133 : :
2134 : :   /* And according to DR_STEP.  */
2135 : 1534058 : cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2136 : 1534058 : if (cmp != 0)
2137 : : return cmp;
2138 : :
2139 : :   /* Then sort by DR_INIT.  In case of identical DRs sort by stmt UID.  */
2140 : 1529879 : cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2141 : 1529879 : if (cmp == 0)
2142 : 200716 : return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2143 : : return cmp;
2144 : : }
2145 : :
2146 : : /* Function vect_enhance_data_refs_alignment
2147 : :
2148 : : This pass will use loop versioning and loop peeling in order to enhance
2149 : : the alignment of data references in the loop.
2150 : :
2151 : : FOR NOW: we assume that whatever versioning/peeling takes place, only the
2152 : : original loop is to be vectorized. Any other loops that are created by
2153 : :    the transformations performed in this pass are not supposed to be
2154 : : vectorized. This restriction will be relaxed.
2155 : :
2156 : : This pass will require a cost model to guide it whether to apply peeling
2157 : : or versioning or a combination of the two. For example, the scheme that
2158 : :    Intel uses when given a loop with several memory accesses is as follows:
2159 : :    choose one memory access ('p') whose alignment you want to force by doing
2160 : : peeling. Then, either (1) generate a loop in which 'p' is aligned and all
2161 : : other accesses are not necessarily aligned, or (2) use loop versioning to
2162 : : generate one loop in which all accesses are aligned, and another loop in
2163 : : which only 'p' is necessarily aligned.
2164 : :
2165 : : ("Automatic Intra-Register Vectorization for the Intel Architecture",
2166 : :    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
2167 : : Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
2168 : :
2169 : : Devising a cost model is the most critical aspect of this work. It will
2170 : : guide us on which access to peel for, whether to use loop versioning, how
2171 : : many versions to create, etc. The cost model will probably consist of
2172 : : generic considerations as well as target specific considerations (on
2173 : : powerpc for example, misaligned stores are more painful than misaligned
2174 : : loads).
2175 : :
2176 : : Here are the general steps involved in alignment enhancements:
2177 : :
2178 : : -- original loop, before alignment analysis:
2179 : : for (i=0; i<N; i++){
2180 : : x = q[i]; # DR_MISALIGNMENT(q) = unknown
2181 : : p[i] = y; # DR_MISALIGNMENT(p) = unknown
2182 : : }
2183 : :
2184 : : -- After vect_compute_data_refs_alignment:
2185 : : for (i=0; i<N; i++){
2186 : : x = q[i]; # DR_MISALIGNMENT(q) = 3
2187 : : p[i] = y; # DR_MISALIGNMENT(p) = unknown
2188 : : }
2189 : :
2190 : : -- Possibility 1: we do loop versioning:
2191 : : if (p is aligned) {
2192 : : for (i=0; i<N; i++){ # loop 1A
2193 : : x = q[i]; # DR_MISALIGNMENT(q) = 3
2194 : : p[i] = y; # DR_MISALIGNMENT(p) = 0
2195 : : }
2196 : : }
2197 : : else {
2198 : : for (i=0; i<N; i++){ # loop 1B
2199 : : x = q[i]; # DR_MISALIGNMENT(q) = 3
2200 : : p[i] = y; # DR_MISALIGNMENT(p) = unaligned
2201 : : }
2202 : : }
2203 : :
2204 : : -- Possibility 2: we do loop peeling:
2205 : : for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
2206 : : x = q[i];
2207 : : p[i] = y;
2208 : : }
2209 : : for (i = 3; i < N; i++){ # loop 2A
2210 : : x = q[i]; # DR_MISALIGNMENT(q) = 0
2211 : : p[i] = y; # DR_MISALIGNMENT(p) = unknown
2212 : : }
2213 : :
2214 : : -- Possibility 3: combination of loop peeling and versioning:
2215 : : for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
2216 : : x = q[i];
2217 : : p[i] = y;
2218 : : }
2219 : : if (p is aligned) {
2220 : : for (i = 3; i<N; i++){ # loop 3A
2221 : : x = q[i]; # DR_MISALIGNMENT(q) = 0
2222 : : p[i] = y; # DR_MISALIGNMENT(p) = 0
2223 : : }
2224 : : }
2225 : : else {
2226 : : for (i = 3; i<N; i++){ # loop 3B
2227 : : x = q[i]; # DR_MISALIGNMENT(q) = 0
2228 : : p[i] = y; # DR_MISALIGNMENT(p) = unaligned
2229 : : }
2230 : : }
2231 : :
2232 : : These loops are later passed to loop_transform to be vectorized. The
2233 : : vectorizer will use the alignment information to guide the transformation
2234 : : (whether to generate regular loads/stores, or with special handling for
2235 : : misalignment). */
2236 : :
2237 : : opt_result
2238 : 277138 : vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
2239 : : {
2240 : 277138 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2241 : 277138 : dr_vec_info *first_store = NULL;
2242 : 277138 : dr_vec_info *dr0_info = NULL;
2243 : 277138 : struct data_reference *dr;
2244 : 277138 : unsigned int i;
2245 : 277138 : bool do_peeling = false;
2246 : 277138 : bool do_versioning = false;
2247 : 277138 : unsigned int npeel = 0;
2248 : 277138 : bool one_misalignment_known = false;
2249 : 277138 : bool one_misalignment_unknown = false;
2250 : 277138 : bool one_dr_unsupportable = false;
2251 : 277138 : dr_vec_info *unsupportable_dr_info = NULL;
2252 : 277138 : unsigned int dr0_same_align_drs = 0, first_store_same_align_drs = 0;
2253 : 277138 : hash_table<peel_info_hasher> peeling_htab (1);
2254 : :
2255 : 277138 : DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
2256 : :
2257 : : /* Reset data so we can safely be called multiple times. */
2258 : 277138 : LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2259 : 277138 : LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
2260 : :
2261 : 277138 : if (LOOP_VINFO_DATAREFS (loop_vinfo).is_empty ())
2262 : 12650 : return opt_result::success ();
2263 : :
2264 : : /* Sort the vector of datarefs so DRs that have the same or dependent
2265 : : alignment are next to each other. */
2266 : 264488 : auto_vec<data_reference_p> datarefs
2267 : 264488 : = LOOP_VINFO_DATAREFS (loop_vinfo).copy ();
2268 : 264488 : datarefs.qsort (dr_align_group_sort_cmp);
2269 : :
2270 : : /* Compute the number of DRs that become aligned when we peel
2271 : : a dataref so it becomes aligned. */
2272 : 528976 : auto_vec<unsigned> n_same_align_refs (datarefs.length ());
2273 : 264488 : n_same_align_refs.quick_grow_cleared (datarefs.length ());
2274 : 264488 : unsigned i0;
2275 : 539713 : for (i0 = 0; i0 < datarefs.length (); ++i0)
2276 : 272261 : if (DR_BASE_ADDRESS (datarefs[i0]))
2277 : : break;
2278 : 1680654 : for (i = i0 + 1; i <= datarefs.length (); ++i)
2279 : : {
2280 : 575839 : if (i == datarefs.length ()
2281 : 314315 : || !operand_equal_p (DR_BASE_ADDRESS (datarefs[i0]),
2282 : 314315 : DR_BASE_ADDRESS (datarefs[i]), 0)
2283 : 151255 : || !operand_equal_p (DR_OFFSET (datarefs[i0]),
2284 : 151255 : DR_OFFSET (datarefs[i]), 0)
2285 : 726080 : || !operand_equal_p (DR_STEP (datarefs[i0]),
2286 : 150241 : DR_STEP (datarefs[i]), 0))
2287 : : {
2288 : : /* The subgroup [i0, i-1] now only differs in DR_INIT and
2289 : : possibly DR_TARGET_ALIGNMENT. Still the whole subgroup
2290 : : will get known misalignment if we align one of the refs
2291 : : with the largest DR_TARGET_ALIGNMENT. */
2292 : 1001872 : for (unsigned j = i0; j < i; ++j)
2293 : : {
2294 : 575839 : dr_vec_info *dr_infoj = loop_vinfo->lookup_dr (datarefs[j]);
2295 : 4210464 : for (unsigned k = i0; k < i; ++k)
2296 : : {
2297 : 3634625 : if (k == j)
2298 : 575839 : continue;
2299 : 3058786 : dr_vec_info *dr_infok = loop_vinfo->lookup_dr (datarefs[k]);
2300 : 3058786 : if (vect_dr_aligned_if_related_peeled_dr_is (dr_infok,
2301 : : dr_infoj))
2302 : 1366619 : n_same_align_refs[j]++;
2303 : : }
2304 : : }
2305 : : i0 = i;
2306 : : }
2307 : : }
2308 : :
2309 : : /* While cost model enhancements are expected in the future, the high level
2310 : : view of the code at this time is as follows:
2311 : :
2312 : : A) If there is a misaligned access then see if peeling to align
2313 : : this access can make all data references satisfy
2314 : : vect_supportable_dr_alignment. If so, update data structures
2315 : : as needed and return true.
2316 : :
2317 : : B) If peeling wasn't possible and there is a data reference with an
2318 : : unknown misalignment that does not satisfy vect_supportable_dr_alignment
2319 : : then see if loop versioning checks can be used to make all data
2320 : : references satisfy vect_supportable_dr_alignment. If so, update
2321 : : data structures as needed and return true.
2322 : :
2323 : : C) If neither peeling nor versioning were successful then return false if
2324 : : any data reference does not satisfy vect_supportable_dr_alignment.
2325 : :
2326 : : D) Return true (all data references satisfy vect_supportable_dr_alignment).
2327 : :
2328 : : Note, Possibility 3 above (which is peeling and versioning together) is not
2329 : : being done at this time. */
2330 : :
2331 : : /* (1) Peeling to force alignment. */
2332 : :
2333 : : /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
2334 : : Considerations:
2335 : : + How many accesses will become aligned due to the peeling
2336 : : - How many accesses will become unaligned due to the peeling,
2337 : : and the cost of misaligned accesses.
2338 : : - The cost of peeling (the extra runtime checks, the increase
2339 : : in code size). */
2340 : :
2341 : 744862 : FOR_EACH_VEC_ELT (datarefs, i, dr)
2342 : : {
2343 : 509214 : dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2344 : 509214 : if (!vect_relevant_for_alignment_p (dr_info))
2345 : 82573 : continue;
2346 : :
2347 : 426641 : stmt_vec_info stmt_info = dr_info->stmt;
2348 : 426641 : tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2349 : 426641 : do_peeling = vector_alignment_reachable_p (dr_info);
2350 : 426641 : if (do_peeling)
2351 : : {
2352 : 383263 : if (known_alignment_for_access_p (dr_info, vectype))
2353 : : {
2354 : 217094 : unsigned int npeel_tmp = 0;
2355 : 217094 : bool negative = tree_int_cst_compare (DR_STEP (dr),
2356 : 217094 : size_zero_node) < 0;
2357 : :
2358 : : /* If known_alignment_for_access_p then we have set
2359 : :	     DR_MISALIGNMENT which is only done if we know it at compile
2360 : : time, so it is safe to assume target alignment is constant.
2361 : : */
2362 : 217094 : unsigned int target_align =
2363 : 217094 : DR_TARGET_ALIGNMENT (dr_info).to_constant ();
2364 : 217094 : unsigned HOST_WIDE_INT dr_size = vect_get_scalar_dr_size (dr_info);
2365 : 217094 : poly_int64 off = 0;
2366 : 217094 : if (negative)
2367 : 1932 : off = (TYPE_VECTOR_SUBPARTS (vectype) - 1) * -dr_size;
2368 : 217094 : unsigned int mis = dr_misalignment (dr_info, vectype, off);
2369 : 217094 : mis = negative ? mis : -mis;
2370 : 217094 : if (mis != 0)
2371 : 10336 : npeel_tmp = (mis & (target_align - 1)) / dr_size;
2372 : :
2373 : : /* For multiple types, it is possible that the bigger type access
2374 : : will have more than one peeling option. E.g., a loop with two
2375 : : types: one of size (vector size / 4), and the other one of
2376 : :	     size (vector size / 8).  The vectorization factor will be 8.  If both
2377 : : accesses are misaligned by 3, the first one needs one scalar
2378 : : iteration to be aligned, and the second one needs 5. But the
2379 : : first one will be aligned also by peeling 5 scalar
2380 : : iterations, and in that case both accesses will be aligned.
2381 : : Hence, except for the immediate peeling amount, we also want
2382 : : to try to add full vector size, while we don't exceed
2383 : : vectorization factor.
2384 : : We do this automatically for cost model, since we calculate
2385 : : cost for every peeling option. */
2386 : 217094 : poly_uint64 nscalars = npeel_tmp;
2387 : 217094 : if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
2388 : : {
2389 : 41109 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2390 : 41109 : unsigned group_size = 1;
2391 : 41109 : if (STMT_SLP_TYPE (stmt_info)
2392 : 41109 : && STMT_VINFO_GROUPED_ACCESS (stmt_info))
2393 : 1638 : group_size = DR_GROUP_SIZE (stmt_info);
2394 : 41109 : nscalars = vf * group_size;
2395 : : }
2396 : :
2397 : : /* Save info about DR in the hash table. Also include peeling
2398 : : amounts according to the explanation above. Indicate
2399 : : the alignment status when the ref is not aligned.
2400 : : ??? Rather than using unknown alignment here we should
2401 : : prune all entries from the peeling hashtable which cause
2402 : : DRs to be not supported. */
2403 : 217094 : bool supportable_if_not_aligned
2404 : : = vect_supportable_dr_alignment
2405 : 217094 : (loop_vinfo, dr_info, vectype, DR_MISALIGNMENT_UNKNOWN);
2406 : 497788 : while (known_le (npeel_tmp, nscalars))
2407 : : {
2408 : 280694 : vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
2409 : : dr_info, npeel_tmp,
2410 : : supportable_if_not_aligned);
2411 : 280694 : npeel_tmp += MAX (1, target_align / dr_size);
2412 : : }
2413 : :
2414 : 217094 : one_misalignment_known = true;
2415 : : }
2416 : : else
2417 : : {
2418 : : /* If we don't know any misalignment values, we prefer
2419 : :	     peeling for the data-ref that has the maximum number of data-refs
2420 : :	     with the same alignment, unless the target prefers to align
2421 : :	     stores over loads.  */
2422 : 166169 : unsigned same_align_drs = n_same_align_refs[i];
2423 : 166169 : if (!dr0_info
2424 : 166169 : || dr0_same_align_drs < same_align_drs)
2425 : : {
2426 : : dr0_same_align_drs = same_align_drs;
2427 : : dr0_info = dr_info;
2428 : : }
2429 : : /* For data-refs with the same number of related
2430 : : accesses prefer the one where the misalign
2431 : : computation will be invariant in the outermost loop. */
2432 : 57368 : else if (dr0_same_align_drs == same_align_drs)
2433 : : {
2434 : 56458 : class loop *ivloop0, *ivloop;
2435 : 56458 : ivloop0 = outermost_invariant_loop_for_expr
2436 : 56458 : (loop, DR_BASE_ADDRESS (dr0_info->dr));
2437 : 56458 : ivloop = outermost_invariant_loop_for_expr
2438 : 56458 : (loop, DR_BASE_ADDRESS (dr));
2439 : 56458 : if ((ivloop && !ivloop0)
2440 : 56458 : || (ivloop && ivloop0
2441 : 56452 : && flow_loop_nested_p (ivloop, ivloop0)))
2442 : : dr0_info = dr_info;
2443 : : }
2444 : :
2445 : 166169 : one_misalignment_unknown = true;
2446 : :
2447 : : /* Check for data refs with unsupportable alignment that
2448 : : can be peeled. */
2449 : 166169 : enum dr_alignment_support supportable_dr_alignment
2450 : 166169 : = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2451 : : DR_MISALIGNMENT_UNKNOWN);
2452 : 166169 : if (supportable_dr_alignment == dr_unaligned_unsupported)
2453 : : {
2454 : 62414 : one_dr_unsupportable = true;
2455 : 62414 : unsupportable_dr_info = dr_info;
2456 : : }
2457 : :
2458 : 166169 : if (!first_store && DR_IS_WRITE (dr))
2459 : : {
2460 : 42244 : first_store = dr_info;
2461 : 42244 : first_store_same_align_drs = same_align_drs;
2462 : : }
2463 : : }
2464 : : }
2465 : : else
2466 : : {
2467 : 43378 : if (!aligned_access_p (dr_info, vectype))
2468 : : {
2469 : 28840 : if (dump_enabled_p ())
2470 : 1670 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2471 : : "vector alignment may not be reachable\n");
2472 : : break;
2473 : : }
2474 : : }
2475 : : }
2476 : :
2477 : : /* Check if we can possibly peel the loop. */
2478 : 264488 : if (!vect_can_advance_ivs_p (loop_vinfo)
2479 : 258249 : || !slpeel_can_duplicate_loop_p (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
2480 : 258249 : loop_preheader_edge (loop))
2481 : 258249 : || loop->inner
2482 : 521136 : || LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
2483 : : do_peeling = false;
2484 : :
2485 : 264488 : struct _vect_peel_extended_info peel_for_known_alignment;
2486 : 264488 : struct _vect_peel_extended_info peel_for_unknown_alignment;
2487 : 264488 : struct _vect_peel_extended_info best_peel;
2488 : :
2489 : 264488 : peel_for_unknown_alignment.inside_cost = INT_MAX;
2490 : 264488 : peel_for_unknown_alignment.outside_cost = INT_MAX;
2491 : 264488 : peel_for_unknown_alignment.peel_info.count = 0;
2492 : :
2493 : 264488 : if (do_peeling
2494 : 264488 : && one_misalignment_unknown)
2495 : : {
2496 : : /* Check if the target requires to prefer stores over loads, i.e., if
2497 : : misaligned stores are more expensive than misaligned loads (taking
2498 : : drs with same alignment into account). */
2499 : 94370 : unsigned int load_inside_cost = 0;
2500 : 94370 : unsigned int load_outside_cost = 0;
2501 : 94370 : unsigned int store_inside_cost = 0;
2502 : 94370 : unsigned int store_outside_cost = 0;
2503 : 94370 : unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
2504 : :
2505 : 94370 : stmt_vector_for_cost dummy;
2506 : 94370 : dummy.create (2);
2507 : 94370 : vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
2508 : : &load_inside_cost,
2509 : : &load_outside_cost,
2510 : : &dummy, &dummy, estimated_npeels);
2511 : 94370 : dummy.release ();
2512 : :
2513 : 94370 : if (first_store)
2514 : : {
2515 : 34101 : dummy.create (2);
2516 : 34101 : vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
2517 : : &store_inside_cost,
2518 : : &store_outside_cost,
2519 : : &dummy, &dummy,
2520 : : estimated_npeels);
2521 : 34101 : dummy.release ();
2522 : : }
2523 : : else
2524 : : {
2525 : 60269 : store_inside_cost = INT_MAX;
2526 : 60269 : store_outside_cost = INT_MAX;
2527 : : }
2528 : :
2529 : 94370 : if (load_inside_cost > store_inside_cost
2530 : 94370 : || (load_inside_cost == store_inside_cost
2531 : 33942 : && load_outside_cost > store_outside_cost))
2532 : : {
2533 : 94370 : dr0_info = first_store;
2534 : 94370 : dr0_same_align_drs = first_store_same_align_drs;
2535 : 94370 : peel_for_unknown_alignment.inside_cost = store_inside_cost;
2536 : 94370 : peel_for_unknown_alignment.outside_cost = store_outside_cost;
2537 : : }
2538 : : else
2539 : : {
2540 : 94370 : peel_for_unknown_alignment.inside_cost = load_inside_cost;
2541 : 94370 : peel_for_unknown_alignment.outside_cost = load_outside_cost;
2542 : : }
2543 : :
2544 : 94370 : stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2545 : 94370 : prologue_cost_vec.create (2);
2546 : 94370 : epilogue_cost_vec.create (2);
2547 : :
2548 : 94370 : int dummy2;
2549 : 188740 : peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
2550 : 94370 : (loop_vinfo, estimated_npeels, &dummy2,
2551 : : &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2552 : : &prologue_cost_vec, &epilogue_cost_vec);
2553 : :
2554 : 94370 : prologue_cost_vec.release ();
2555 : 94370 : epilogue_cost_vec.release ();
2556 : :
2557 : 94370 : peel_for_unknown_alignment.peel_info.count = dr0_same_align_drs + 1;
2558 : : }
2559 : :
2560 : 264488 : peel_for_unknown_alignment.peel_info.npeel = 0;
2561 : 264488 : peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
2562 : :
2563 : 264488 : best_peel = peel_for_unknown_alignment;
2564 : :
2565 : 264488 : peel_for_known_alignment.inside_cost = INT_MAX;
2566 : 264488 : peel_for_known_alignment.outside_cost = INT_MAX;
2567 : 264488 : peel_for_known_alignment.peel_info.count = 0;
2568 : 264488 : peel_for_known_alignment.peel_info.dr_info = NULL;
2569 : :
2570 : 264488 : if (do_peeling && one_misalignment_known)
2571 : : {
2572 : : /* Peeling is possible, but there is no data access that is not supported
2573 : : unless aligned. So we try to choose the best possible peeling from
2574 : : the hash table. */
2575 : 120277 : peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
2576 : 120277 : (&peeling_htab, loop_vinfo);
2577 : : }
2578 : :
2579 : : /* Compare costs of peeling for known and unknown alignment. */
2580 : 264488 : if (peel_for_known_alignment.peel_info.dr_info != NULL
2581 : 120277 : && peel_for_unknown_alignment.inside_cost
2582 : : >= peel_for_known_alignment.inside_cost)
2583 : : {
2584 : 113770 : best_peel = peel_for_known_alignment;
2585 : :
2586 : : /* If the best peeling for known alignment has NPEEL == 0, perform no
2587 : : peeling at all except if there is an unsupportable dr that we can
2588 : : align. */
2589 : 113770 : if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
2590 : : do_peeling = false;
2591 : : }
2592 : :
2593 : : /* If there is an unsupportable data ref, prefer this over all choices so far
2594 : : since we'd have to discard a chosen peeling except when it accidentally
2595 : : aligned the unsupportable data ref. */
2596 : 155715 : if (one_dr_unsupportable)
2597 : : dr0_info = unsupportable_dr_info;
2598 : 216374 : else if (do_peeling)
2599 : : {
2600 : : /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
2601 : : TODO: Use nopeel_outside_cost or get rid of it? */
2602 : 49352 : unsigned nopeel_inside_cost = 0;
2603 : 49352 : unsigned nopeel_outside_cost = 0;
2604 : :
2605 : 49352 : stmt_vector_for_cost dummy;
2606 : 49352 : dummy.create (2);
2607 : 49352 : vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
2608 : : &nopeel_outside_cost, &dummy, &dummy, 0);
2609 : 49352 : dummy.release ();
2610 : :
2611 : : /* Add epilogue costs. As we do not peel for alignment here, no prologue
2612 : : costs will be recorded. */
2613 : 49352 : stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2614 : 49352 : prologue_cost_vec.create (2);
2615 : 49352 : epilogue_cost_vec.create (2);
2616 : :
2617 : 49352 : int dummy2;
2618 : 98704 : nopeel_outside_cost += vect_get_known_peeling_cost
2619 : 49352 : (loop_vinfo, 0, &dummy2,
2620 : : &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2621 : : &prologue_cost_vec, &epilogue_cost_vec);
2622 : :
2623 : 49352 : prologue_cost_vec.release ();
2624 : 49352 : epilogue_cost_vec.release ();
2625 : :
2626 : 49352 : npeel = best_peel.peel_info.npeel;
2627 : 49352 : dr0_info = best_peel.peel_info.dr_info;
2628 : :
2629 : :       /* If not peeling at all is no more expensive than the best peeling
2630 : :	 found so far, don't perform any peeling.  */
2631 : 49352 : if (nopeel_inside_cost <= best_peel.inside_cost)
2632 : 47880 : do_peeling = false;
2633 : : }
2634 : :
2635 : 97466 : if (do_peeling)
2636 : : {
2637 : 44231 : stmt_vec_info stmt_info = dr0_info->stmt;
2638 : 44231 : if (known_alignment_for_access_p (dr0_info,
2639 : : STMT_VINFO_VECTYPE (stmt_info)))
2640 : : {
2641 : 1451 : bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2642 : 1451 : size_zero_node) < 0;
2643 : 1451 : if (!npeel)
2644 : : {
2645 : : /* Since it's known at compile time, compute the number of
2646 : : iterations in the peeled loop (the peeling factor) for use in
2647 : : updating DR_MISALIGNMENT values. The peeling factor is the
2648 : : vectorization factor minus the misalignment as an element
2649 : : count. */
2650 : 0 : tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2651 : 0 : poly_int64 off = 0;
2652 : 0 : if (negative)
2653 : 0 : off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2654 : 0 : * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2655 : 0 : unsigned int mis
2656 : 0 : = dr_misalignment (dr0_info, vectype, off);
2657 : 0 : mis = negative ? mis : -mis;
2658 : : /* If known_alignment_for_access_p then we have set
2659 : :	         DR_MISALIGNMENT which is only done if we know it at compile
2660 : : time, so it is safe to assume target alignment is constant.
2661 : : */
2662 : 0 : unsigned int target_align =
2663 : 0 : DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2664 : 0 : npeel = ((mis & (target_align - 1))
2665 : 0 : / vect_get_scalar_dr_size (dr0_info));
2666 : : }
2667 : :
2668 : : /* For interleaved data access every iteration accesses all the
2669 : : members of the group, therefore we divide the number of iterations
2670 : : by the group size. */
2671 : 1451 : if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2672 : 37 : npeel /= DR_GROUP_SIZE (stmt_info);
2673 : :
2674 : 1451 : if (dump_enabled_p ())
2675 : 254 : dump_printf_loc (MSG_NOTE, vect_location,
2676 : : "Try peeling by %d\n", npeel);
2677 : : }
2678 : :
2679 : : /* Ensure that all datarefs can be vectorized after the peel. */
2680 : 44231 : if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2681 : : do_peeling = false;
2682 : :
2683 : : /* Check if all datarefs are supportable and log. */
2684 : 44231 : if (do_peeling
2685 : 44231 : && npeel == 0
2686 : 44231 : && known_alignment_for_access_p (dr0_info,
2687 : : STMT_VINFO_VECTYPE (stmt_info)))
2688 : 3 : return opt_result::success ();
2689 : :
2690 : : /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2691 : 44228 : if (do_peeling)
2692 : : {
2693 : 27995 : unsigned max_allowed_peel
2694 : 27995 : = param_vect_max_peeling_for_alignment;
2695 : 27995 : if (loop_cost_model (loop) <= VECT_COST_MODEL_CHEAP)
2696 : : max_allowed_peel = 0;
2697 : 6423 : if (max_allowed_peel != (unsigned)-1)
2698 : : {
2699 : 21575 : unsigned max_peel = npeel;
2700 : 21575 : if (max_peel == 0)
2701 : : {
2702 : 20930 : poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2703 : 20930 : unsigned HOST_WIDE_INT target_align_c;
2704 : 20930 : if (target_align.is_constant (&target_align_c))
2705 : 41860 : max_peel =
2706 : 20930 : target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2707 : : else
2708 : : {
2709 : : do_peeling = false;
2710 : : if (dump_enabled_p ())
2711 : : dump_printf_loc (MSG_NOTE, vect_location,
2712 : : "Disable peeling, max peels set and vector"
2713 : : " alignment unknown\n");
2714 : : }
2715 : : }
2716 : 21575 : if (max_peel > max_allowed_peel)
2717 : : {
2718 : 21575 : do_peeling = false;
2719 : 21575 : if (dump_enabled_p ())
2720 : 51 : dump_printf_loc (MSG_NOTE, vect_location,
2721 : : "Disable peeling, max peels reached: %d\n", max_peel);
2722 : : }
2723 : : }
2724 : : }
2725 : :
2726 : : /* Cost model #2 - if peeling may result in a remaining loop not
2727 : : iterating enough to be vectorized then do not peel. Since this
2728 : : is a cost heuristic rather than a correctness decision, use the
2729 : : most likely runtime value for variable vectorization factors. */
2730 : 51 : if (do_peeling
2731 : 6420 : && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2732 : : {
2733 : 2698 : unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2734 : 2698 : unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2735 : 2698 : if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2736 : 2698 : < assumed_vf + max_peel)
2737 : : do_peeling = false;
2738 : : }
2739 : :
2740 : : if (do_peeling)
2741 : : {
2742 : : /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2743 : : If the misalignment of DR_i is identical to that of dr0 then set
2744 : : DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2745 : : dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2746 : : by the peeling factor times the element size of DR_i (MOD the
2747 : : vectorization factor times the size). Otherwise, the
2748 : : misalignment of DR_i must be set to unknown. */
2749 : 13152 : FOR_EACH_VEC_ELT (datarefs, i, dr)
2750 : 7405 : if (dr != dr0_info->dr)
2751 : : {
2752 : 1658 : dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2753 : 1658 : if (!vect_relevant_for_alignment_p (dr_info))
2754 : 52 : continue;
2755 : :
2756 : 1606 : vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2757 : : }
2758 : :
2759 : 5747 : LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2760 : 5747 : if (npeel)
2761 : 479 : LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2762 : : else
2763 : 5268 : LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = -1;
2764 : 5747 : SET_DR_MISALIGNMENT (dr0_info,
2765 : : vect_dr_misalign_for_aligned_access (dr0_info));
2766 : 5747 : if (dump_enabled_p ())
2767 : : {
2768 : 273 : dump_printf_loc (MSG_NOTE, vect_location,
2769 : : "Alignment of access forced using peeling.\n");
2770 : 273 : dump_printf_loc (MSG_NOTE, vect_location,
2771 : : "Peeling for alignment will be applied.\n");
2772 : : }
2773 : :
2774 : : /* The inside-loop cost will be accounted for in vectorizable_load
2775 : : and vectorizable_store correctly with adjusted alignments.
2776 : : Drop the body_cst_vec on the floor here. */
2777 : 5747 : return opt_result::success ();
2778 : : }
2779 : : }
2780 : :
2781 : : /* (2) Versioning to force alignment. */
2782 : :
2783 : : /* Try versioning if:
2784 : : 1) optimize loop for speed and the cost-model is not cheap, and
2785 : : 2) there is at least one unsupported misaligned data ref with an unknown
2786 : : misalignment, and
2787 : : 3) all misaligned data refs with a known misalignment are supported, and
2788 : : 4) the number of runtime alignment checks is within reason. */
2789 : :
2790 : 258738 : do_versioning
2791 : 258738 : = (optimize_loop_nest_for_speed_p (loop)
2792 : 258295 : && !loop->inner /* FORNOW */
2793 : 515432 : && loop_cost_model (loop) > VECT_COST_MODEL_CHEAP);
2794 : :
2795 : : if (do_versioning)
2796 : : {
2797 : 278904 : FOR_EACH_VEC_ELT (datarefs, i, dr)
2798 : : {
2799 : 210820 : dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2800 : 210820 : if (!vect_relevant_for_alignment_p (dr_info))
2801 : 152290 : continue;
2802 : :
2803 : 148845 : stmt_vec_info stmt_info = dr_info->stmt;
2804 : 148845 : if (STMT_VINFO_STRIDED_P (stmt_info))
2805 : : {
2806 : : do_versioning = false;
2807 : 1146 : break;
2808 : : }
2809 : :
2810 : 147947 : tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2811 : 147947 : bool negative = tree_int_cst_compare (DR_STEP (dr),
2812 : 147947 : size_zero_node) < 0;
2813 : 147947 : poly_int64 off = 0;
2814 : 147947 : if (negative)
2815 : 2811 : off = ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
2816 : 2811 : * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
2817 : 147947 : int misalignment;
2818 : 147947 : if ((misalignment = dr_misalignment (dr_info, vectype, off)) == 0)
2819 : 90315 : continue;
2820 : :
2821 : 57632 : enum dr_alignment_support supportable_dr_alignment
2822 : 57632 : = vect_supportable_dr_alignment (loop_vinfo, dr_info, vectype,
2823 : : misalignment);
2824 : 57632 : if (supportable_dr_alignment == dr_unaligned_unsupported)
2825 : : {
2826 : 8414 : if (misalignment != DR_MISALIGNMENT_UNKNOWN
2827 : 8414 : || (LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2828 : 8278 : >= (unsigned) param_vect_max_version_for_alignment_checks))
2829 : : {
2830 : : do_versioning = false;
2831 : 1146 : break;
2832 : : }
2833 : :
2834 : : /* At present we don't support versioning for alignment
2835 : : with variable VF, since there's no guarantee that the
2836 : : VF is a power of two. We could relax this if we added
2837 : : a way of enforcing a power-of-two size. */
2838 : 8166 : unsigned HOST_WIDE_INT size;
2839 : 16332 : if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2840 : : {
2841 : : do_versioning = false;
2842 : : break;
2843 : : }
2844 : :
2845 : : /* Forcing alignment in the first iteration is no good if
2846 : : we don't keep it across iterations. For now, just disable
2847 : : versioning in this case.
2848 : : ?? We could actually unroll the loop to achieve the required
2849 : : overall step alignment, and forcing the alignment could be
2850 : : done by doing some iterations of the non-vectorized loop. */
2851 : 8166 : if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2852 : 8166 : * DR_STEP_ALIGNMENT (dr),
2853 : 8166 : DR_TARGET_ALIGNMENT (dr_info)))
2854 : : {
2855 : : do_versioning = false;
2856 : : break;
2857 : : }
2858 : :
2859 : : /* The rightmost bits of an aligned address must be zeros.
2860 : : Construct the mask needed for this test. For example,
2861 : : GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2862 : : mask must be 15 = 0xf. */
2863 : 8166 : int mask = size - 1;
2864 : :
2865 : : /* FORNOW: use the same mask to test all potentially unaligned
2866 : : references in the loop. */
2867 : 8166 : if (LOOP_VINFO_PTR_MASK (loop_vinfo)
2868 : 5910 : && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
2869 : : {
2870 : : do_versioning = false;
2871 : : break;
2872 : : }
2873 : :
2874 : 8166 : LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2875 : 8166 : LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2876 : : }
2877 : : }
2878 : :
2879 : : /* Versioning requires at least one misaligned data reference. */
2880 : 69230 : if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2881 : : do_versioning = false;
2882 : 4452 : else if (!do_versioning)
2883 : 112 : LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2884 : : }
2885 : :
2886 : 112 : if (do_versioning)
2887 : : {
2888 : : const vec<stmt_vec_info> &may_misalign_stmts
2889 : : = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2890 : : stmt_vec_info stmt_info;
2891 : :
2892 : : /* It can now be assumed that the data references in the statements
2893 : : in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2894 : : of the loop being vectorized. */
2895 : 11834 : FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2896 : : {
2897 : 7494 : dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2898 : 7494 : SET_DR_MISALIGNMENT (dr_info,
2899 : : vect_dr_misalign_for_aligned_access (dr_info));
2900 : 7494 : if (dump_enabled_p ())
2901 : 124 : dump_printf_loc (MSG_NOTE, vect_location,
2902 : : "Alignment of access forced using versioning.\n");
2903 : : }
2904 : :
2905 : 4340 : if (dump_enabled_p ())
2906 : 77 : dump_printf_loc (MSG_NOTE, vect_location,
2907 : : "Versioning for alignment will be applied.\n");
2908 : :
2909 : : /* Peeling and versioning can't be done together at this time. */
2910 : 4340 : gcc_assert (! (do_peeling && do_versioning));
2911 : :
2912 : 4340 : return opt_result::success ();
2913 : : }
2914 : :
2915 : : /* This point is reached if neither peeling nor versioning is being done. */
2916 : 254398 : gcc_assert (! (do_peeling || do_versioning));
2917 : :
2918 : 254398 : return opt_result::success ();
2919 : 541626 : }
2920 : :
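[Editor's note] The versioning path above records every potentially misaligned pointer in LOOP_VINFO_MAY_MISALIGN_STMTS and guards the aligned loop version by testing the pointer's low address bits against LOOP_VINFO_PTR_MASK (the vector size minus one, e.g. 15 for V4SI). The standalone helper below is only a hedged sketch of that property, with a made-up name; GCC itself emits the equivalent test as gimple in the loop-versioning condition.

#include <stdint.h>

/* Hypothetical helper: return 1 iff every address in ADDRS[0..N-1] has its
   low bits (selected by MASK) clear, i.e. is aligned to MASK + 1 bytes.
   This is the property the aligned loop version is guarded by.  */
static int
all_addresses_aligned_p (const void *const *addrs, unsigned n, uintptr_t mask)
{
  for (unsigned i = 0; i < n; ++i)
    if (((uintptr_t) addrs[i] & mask) != 0)
      return 0;       /* this address would be misaligned */
  return 1;           /* all checks passed: run the aligned version */
}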
2921 : :
2922 : : /* Function vect_analyze_data_refs_alignment
2923 : :
2924 : : Analyze the alignment of the data-references in the loop.
2925 : : Return FALSE if a data reference is found that cannot be vectorized. */
2926 : :
2927 : : opt_result
2928 : 321862 : vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo)
2929 : : {
2930 : 321862 : DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2931 : :
2932 : 321862 : vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2933 : 321862 : struct data_reference *dr;
2934 : 321862 : unsigned int i;
2935 : :
2936 : 321862 : vect_record_base_alignments (loop_vinfo);
2937 : 1025515 : FOR_EACH_VEC_ELT (datarefs, i, dr)
2938 : : {
2939 : 710906 : dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2940 : 710906 : if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2941 : : {
2942 : 710906 : if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt)
2943 : 912937 : && DR_GROUP_FIRST_ELEMENT (dr_info->stmt) != dr_info->stmt)
2944 : 106115 : continue;
2945 : 604791 : opt_result res = opt_result::success ();
2946 : 604791 : vect_compute_data_ref_alignment (loop_vinfo, dr_info,
2947 : : STMT_VINFO_VECTYPE (dr_info->stmt),
2948 : : &res);
2949 : 604791 : if (!res)
2950 : 7253 : return res;
2951 : : }
2952 : : }
2953 : :
2954 : 314609 : return opt_result::success ();
2955 : : }
2956 : :
2957 : :
2958 : : /* Analyze alignment of DRs of stmts in NODE. */
2959 : :
2960 : : static bool
2961 : 818107 : vect_slp_analyze_node_alignment (vec_info *vinfo, slp_tree node)
2962 : : {
2963 : : /* Alignment is maintained in the first element of the group. */
2964 : 818107 : stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2965 : 818107 : first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2966 : 818107 : dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2967 : 818107 : tree vectype = SLP_TREE_VECTYPE (node);
2968 : 818107 : poly_uint64 vector_alignment
2969 : 818107 : = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
2970 : : BITS_PER_UNIT);
2971 : 818107 : if (dr_info->misalignment == DR_MISALIGNMENT_UNINITIALIZED)
2972 : 791141 : vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2973 : : /* Re-analyze alignment when we're facing a vectorization with a bigger
2974 : : alignment requirement. */
2975 : 26966 : else if (known_lt (dr_info->target_alignment, vector_alignment))
2976 : : {
2977 : 78 : poly_uint64 old_target_alignment = dr_info->target_alignment;
2978 : 78 : int old_misalignment = dr_info->misalignment;
2979 : 78 : vect_compute_data_ref_alignment (vinfo, dr_info, SLP_TREE_VECTYPE (node));
2980 : : /* But keep knowledge about a smaller alignment. */
2981 : 78 : if (old_misalignment != DR_MISALIGNMENT_UNKNOWN
2982 : 47 : && dr_info->misalignment == DR_MISALIGNMENT_UNKNOWN)
2983 : : {
2984 : 1 : dr_info->target_alignment = old_target_alignment;
2985 : 1 : dr_info->misalignment = old_misalignment;
2986 : : }
2987 : : }
2988 : : /* If we ever face unordered target alignments the first one analyzed wins
2989 : : and the other will become unknown in dr_misalignment. */
2990 : 818107 : return true;
2991 : : }
2992 : :
2993 : : /* Function vect_slp_analyze_instance_alignment
2994 : :
2995 : : Analyze the alignment of the data-references in the SLP instance.
2996 : : Return FALSE if a data reference is found that cannot be vectorized. */
2997 : :
2998 : : bool
2999 : 763759 : vect_slp_analyze_instance_alignment (vec_info *vinfo,
3000 : : slp_instance instance)
3001 : : {
3002 : 763759 : DUMP_VECT_SCOPE ("vect_slp_analyze_instance_alignment");
3003 : :
3004 : 763759 : slp_tree node;
3005 : 763759 : unsigned i;
3006 : 929178 : FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
3007 : 165419 : if (! vect_slp_analyze_node_alignment (vinfo, node))
3008 : : return false;
3009 : :
3010 : 763759 : if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
3011 : 763759 : && ! vect_slp_analyze_node_alignment
3012 : 652688 : (vinfo, SLP_INSTANCE_TREE (instance)))
3013 : : return false;
3014 : :
3015 : : return true;
3016 : : }
3017 : :
3018 : :
3019 : : /* Analyze groups of accesses: check that DR_INFO belongs to a group of
3020 : : accesses of legal size, step, etc. Detect gaps, single element
3021 : : interleaving, and other special cases. Set grouped access info.
3022 : : Collect groups of strided stores for further use in SLP analysis.
3023 : : Worker for vect_analyze_group_access. */
3024 : :
3025 : : static bool
3026 : 11752751 : vect_analyze_group_access_1 (vec_info *vinfo, dr_vec_info *dr_info)
3027 : : {
3028 : 11752751 : data_reference *dr = dr_info->dr;
3029 : 11752751 : tree step = DR_STEP (dr);
3030 : 11752751 : tree scalar_type = TREE_TYPE (DR_REF (dr));
3031 : 11752751 : HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
3032 : 11752751 : stmt_vec_info stmt_info = dr_info->stmt;
3033 : 11752751 : loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
3034 : 11752751 : bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
3035 : 11752751 : HOST_WIDE_INT dr_step = -1;
3036 : 11752751 : HOST_WIDE_INT groupsize, last_accessed_element = 1;
3037 : 11752751 : bool slp_impossible = false;
3038 : :
3039 : : /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
3040 : : size of the interleaving group (including gaps). */
3041 : 11752751 : if (tree_fits_shwi_p (step))
3042 : : {
3043 : 11744318 : dr_step = tree_to_shwi (step);
3044 : : /* Check that STEP is a multiple of type size. Otherwise there is
3045 : : a non-element-sized gap at the end of the group which we
3046 : : cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
3047 : : ??? As we can handle non-constant step fine here we should
3048 : : simply remove uses of DR_GROUP_GAP between the last and first
3049 : : element and instead rely on DR_STEP. DR_GROUP_SIZE then would
3050 : : simply not include that gap. */
3051 : 11744318 : if ((dr_step % type_size) != 0)
3052 : : {
3053 : 478 : if (dump_enabled_p ())
3054 : 27 : dump_printf_loc (MSG_NOTE, vect_location,
3055 : : "Step %T is not a multiple of the element size"
3056 : : " for %T\n",
3057 : : step, DR_REF (dr));
3058 : 478 : return false;
3059 : : }
3060 : 11743840 : groupsize = absu_hwi (dr_step) / type_size;
3061 : : }
3062 : : else
3063 : : groupsize = 0;
3064 : :
3065 : : /* A non-consecutive access is possible only if it is part of interleaving. */
3066 : 11752273 : if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
3067 : : {
3068 : : /* Check if this DR is part of an interleaving group, and is a single
3069 : : element of the group that is accessed in the loop. */
3070 : :
3071 : : /* Gaps are supported only for loads. STEP must be a multiple of the type
3072 : : size. */
3073 : 7904130 : if (DR_IS_READ (dr)
3074 : 4754807 : && (dr_step % type_size) == 0
3075 : : && groupsize > 0
3076 : : /* This could be UINT_MAX but as we are generating code in a very
3077 : : inefficient way we have to cap earlier.
3078 : : See PR91403 for example. */
3079 : 4754807 : && groupsize <= 4096)
3080 : : {
3081 : 56052 : DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
3082 : 56052 : DR_GROUP_SIZE (stmt_info) = groupsize;
3083 : 56052 : DR_GROUP_GAP (stmt_info) = groupsize - 1;
3084 : 56052 : if (dump_enabled_p ())
3085 : 942 : dump_printf_loc (MSG_NOTE, vect_location,
3086 : : "Detected single element interleaving %T"
3087 : : " step %T\n",
3088 : : DR_REF (dr), step);
3089 : :
3090 : 56052 : return true;
3091 : : }
3092 : :
3093 : 7848078 : if (dump_enabled_p ())
3094 : 3122 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3095 : : "not consecutive access %G", stmt_info->stmt);
3096 : :
3097 : 7848078 : if (bb_vinfo)
3098 : : {
3099 : : /* Mark the statement as unvectorizable. */
3100 : 7833936 : STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3101 : 7833936 : return true;
3102 : : }
3103 : :
3104 : 14142 : if (dump_enabled_p ())
3105 : 300 : dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
3106 : 14142 : STMT_VINFO_STRIDED_P (stmt_info) = true;
3107 : 14142 : return true;
3108 : : }
3109 : :
3110 : 3848143 : if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
3111 : : {
3112 : : /* First stmt in the interleaving chain. Check the chain. */
3113 : 1417567 : stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
3114 : 1417567 : struct data_reference *data_ref = dr;
3115 : 1417567 : unsigned int count = 1;
3116 : 1417567 : tree prev_init = DR_INIT (data_ref);
3117 : 1417567 : HOST_WIDE_INT diff, gaps = 0;
3118 : :
3119 : : /* By construction, all group members have INTEGER_CST DR_INITs. */
3120 : 3850259 : while (next)
3121 : : {
3122 : : /* We never have the same DR multiple times. */
3123 : 2432764 : gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
3124 : : DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
3125 : :
3126 : 2432764 : data_ref = STMT_VINFO_DATA_REF (next);
3127 : :
3128 : : /* All group members have the same STEP by construction. */
3129 : 2432764 : gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
3130 : :
3131 : : /* Check that the distance between two accesses is equal to the type
3132 : : size. Otherwise, we have gaps. */
3133 : 2432764 : diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
3134 : 2432764 : - TREE_INT_CST_LOW (prev_init)) / type_size;
3135 : 2432764 : if (diff < 1 || diff > UINT_MAX)
3136 : : {
3137 : : /* For artificial testcases with array accesses with large
3138 : : constant indices we can run into overflow issues which
3139 : : can end up fooling the groupsize constraint below so
3140 : : check the individual gaps (which are represented as
3141 : : unsigned int) as well. */
3142 : 18 : if (dump_enabled_p ())
3143 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3144 : : "interleaved access with gap larger "
3145 : : "than representable\n");
3146 : 18 : return false;
3147 : : }
3148 : 2432746 : if (diff != 1)
3149 : : {
3150 : : /* FORNOW: SLP of accesses with gaps is not supported. */
3151 : 81546 : slp_impossible = true;
3152 : 81546 : if (DR_IS_WRITE (data_ref))
3153 : : {
3154 : 54 : if (dump_enabled_p ())
3155 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3156 : : "interleaved store with gaps\n");
3157 : 54 : return false;
3158 : : }
3159 : :
3160 : 81492 : gaps += diff - 1;
3161 : : }
3162 : :
3163 : 2432692 : last_accessed_element += diff;
3164 : :
3165 : : /* Store the gap from the previous member of the group. If there is no
3166 : : gap in the access, DR_GROUP_GAP is always 1. */
3167 : 2432692 : DR_GROUP_GAP (next) = diff;
3168 : :
3169 : 2432692 : prev_init = DR_INIT (data_ref);
3170 : 2432692 : next = DR_GROUP_NEXT_ELEMENT (next);
3171 : : /* Count the number of data-refs in the chain. */
3172 : 2432692 : count++;
3173 : : }
3174 : :
3175 : 1417495 : if (groupsize == 0)
3176 : 1360061 : groupsize = count + gaps;
3177 : :
3178 : : /* This could be UINT_MAX but as we are generating code in a very
3179 : : inefficient way we have to cap earlier. See PR78699 for example. */
3180 : 1417495 : if (groupsize > 4096)
3181 : : {
3182 : 295 : if (dump_enabled_p ())
3183 : 1 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3184 : : "group is too large\n");
3185 : 295 : return false;
3186 : : }
3187 : :
3188 : : /* Check that the size of the interleaving is equal to count for stores,
3189 : : i.e., that there are no gaps. */
3190 : 1417200 : if (groupsize != count
3191 : 84553 : && !DR_IS_READ (dr))
3192 : : {
3193 : 4510 : groupsize = count;
3194 : 4510 : STMT_VINFO_STRIDED_P (stmt_info) = true;
3195 : : }
3196 : :
3197 : : /* If there is a gap after the last load in the group it is the
3198 : : difference between the groupsize and the last accessed
3199 : : element.
3200 : : When there is no gap, this difference should be 0. */
3201 : 1417200 : DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
3202 : :
3203 : 1417200 : DR_GROUP_SIZE (stmt_info) = groupsize;
3204 : 1417200 : if (dump_enabled_p ())
3205 : : {
3206 : 7878 : dump_printf_loc (MSG_NOTE, vect_location,
3207 : : "Detected interleaving ");
3208 : 7878 : if (DR_IS_READ (dr))
3209 : 4212 : dump_printf (MSG_NOTE, "load ");
3210 : 3666 : else if (STMT_VINFO_STRIDED_P (stmt_info))
3211 : 571 : dump_printf (MSG_NOTE, "strided store ");
3212 : : else
3213 : 3095 : dump_printf (MSG_NOTE, "store ");
3214 : 7878 : dump_printf (MSG_NOTE, "of size %u\n",
3215 : : (unsigned)groupsize);
3216 : 7878 : dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
3217 : 7878 : next = DR_GROUP_NEXT_ELEMENT (stmt_info);
3218 : 38095 : while (next)
3219 : : {
3220 : 30217 : if (DR_GROUP_GAP (next) != 1)
3221 : 309 : dump_printf_loc (MSG_NOTE, vect_location,
3222 : : "\t<gap of %d elements>\n",
3223 : 309 : DR_GROUP_GAP (next) - 1);
3224 : 30217 : dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
3225 : 30217 : next = DR_GROUP_NEXT_ELEMENT (next);
3226 : : }
3227 : 7878 : if (DR_GROUP_GAP (stmt_info) != 0)
3228 : 348 : dump_printf_loc (MSG_NOTE, vect_location,
3229 : : "\t<gap of %d elements>\n",
3230 : 348 : DR_GROUP_GAP (stmt_info));
3231 : : }
3232 : :
3233 : : /* SLP: create an SLP data structure for every interleaving group of
3234 : : stores for further analysis in vect_analyze_slp. */
3235 : 1417200 : if (DR_IS_WRITE (dr) && !slp_impossible)
3236 : : {
3237 : 849822 : if (loop_vinfo)
3238 : 18292 : LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
3239 : 849822 : if (bb_vinfo)
3240 : 831530 : BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
3241 : : }
3242 : : }
3243 : :
3244 : : return true;
3245 : : }
3246 : :
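[Editor's note] A small self-contained illustration (not GCC code) of the bookkeeping done by vect_analyze_group_access_1 above, assuming a hypothetical group of two int loads a[4*i] and a[4*i + 2]: DR_STEP is 16 bytes and the element size is 4, so the group size is 16 / 4 = 4, the gap recorded on the second member is 2, and the trailing gap is the group size minus the last accessed element.

#include <stdio.h>

int
main (void)
{
  /* Hypothetical group: loads a[4*i] and a[4*i + 2] of a 4-byte type.  */
  long step = 16, type_size = 4;
  long init[] = { 0, 8 };               /* DR_INIT byte offsets of members */
  long groupsize = step / type_size;    /* 4 elements per scalar iteration */
  long last_accessed_element = 1;
  for (int i = 1; i < 2; ++i)
    last_accessed_element += (init[i] - init[i - 1]) / type_size;
  /* Prints "groupsize 4, member gap 2, trailing gap 1".  */
  printf ("groupsize %ld, member gap %ld, trailing gap %ld\n",
          groupsize, (init[1] - init[0]) / type_size,
          groupsize - last_accessed_element);
  return 0;
}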
3247 : : /* Analyze groups of accesses: check that DR_INFO belongs to a group of
3248 : : accesses of legal size, step, etc. Detect gaps, single element
3249 : : interleaving, and other special cases. Set grouped access info.
3250 : : Collect groups of strided stores for further use in SLP analysis. */
3251 : :
3252 : : static bool
3253 : 11752751 : vect_analyze_group_access (vec_info *vinfo, dr_vec_info *dr_info)
3254 : : {
3255 : 11752751 : if (!vect_analyze_group_access_1 (vinfo, dr_info))
3256 : : {
3257 : : /* Dissolve the group if present. */
3258 : 845 : stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
3259 : 4457 : while (stmt_info)
3260 : : {
3261 : 3612 : stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
3262 : 3612 : DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3263 : 3612 : DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
3264 : 3612 : stmt_info = next;
3265 : : }
3266 : : return false;
3267 : : }
3268 : : return true;
3269 : : }
3270 : :
3271 : : /* Analyze the access pattern of the data-reference DR_INFO.
3272 : : In case of non-consecutive accesses call vect_analyze_group_access() to
3273 : : analyze groups of accesses. */
3274 : :
3275 : : static bool
3276 : 12243182 : vect_analyze_data_ref_access (vec_info *vinfo, dr_vec_info *dr_info)
3277 : : {
3278 : 12243182 : data_reference *dr = dr_info->dr;
3279 : 12243182 : tree step = DR_STEP (dr);
3280 : 12243182 : tree scalar_type = TREE_TYPE (DR_REF (dr));
3281 : 12243182 : stmt_vec_info stmt_info = dr_info->stmt;
3282 : 12243182 : loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
3283 : 12243182 : class loop *loop = NULL;
3284 : :
3285 : 12243182 : if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
3286 : : return true;
3287 : :
3288 : 12211166 : if (loop_vinfo)
3289 : 711122 : loop = LOOP_VINFO_LOOP (loop_vinfo);
3290 : :
3291 : 12211166 : if (loop_vinfo && !step)
3292 : : {
3293 : 0 : if (dump_enabled_p ())
3294 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3295 : : "bad data-ref access in loop\n");
3296 : 0 : return false;
3297 : : }
3298 : :
3299 : : /* Allow loads with zero step in inner-loop vectorization. */
3300 : 12211166 : if (loop_vinfo && integer_zerop (step))
3301 : : {
3302 : 9657 : DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3303 : 9657 : DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
3304 : 9657 : if (!nested_in_vect_loop_p (loop, stmt_info))
3305 : 9393 : return DR_IS_READ (dr);
3306 : : /* Allow references with zero step for outer loops marked
3307 : : with pragma omp simd only - it guarantees absence of
3308 : : loop-carried dependencies between inner loop iterations. */
3309 : 264 : if (loop->safelen < 2)
3310 : : {
3311 : 228 : if (dump_enabled_p ())
3312 : 5 : dump_printf_loc (MSG_NOTE, vect_location,
3313 : : "zero step in inner loop of nest\n");
3314 : 228 : return false;
3315 : : }
3316 : : }
3317 : :
3318 : 12201509 : if (loop && nested_in_vect_loop_p (loop, stmt_info))
3319 : : {
3320 : : /* Interleaved accesses are not yet supported within outer-loop
3321 : : vectorization for references in the inner-loop. */
3322 : 5136 : DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3323 : 5136 : DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
3324 : :
3325 : : /* For the rest of the analysis we use the outer-loop step. */
3326 : 5136 : step = STMT_VINFO_DR_STEP (stmt_info);
3327 : 5136 : if (integer_zerop (step))
3328 : : {
3329 : 1184 : if (dump_enabled_p ())
3330 : 226 : dump_printf_loc (MSG_NOTE, vect_location,
3331 : : "zero step in outer loop.\n");
3332 : 1184 : return DR_IS_READ (dr);
3333 : : }
3334 : : }
3335 : :
3336 : : /* Consecutive? */
3337 : 12200361 : if (TREE_CODE (step) == INTEGER_CST)
3338 : : {
3339 : 12164285 : HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
3340 : 12164285 : if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
3341 : 12164285 : || (dr_step < 0
3342 : 19279 : && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
3343 : : {
3344 : : /* Mark that it is not interleaving. */
3345 : 417521 : DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
3346 : 417521 : DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
3347 : 417521 : return true;
3348 : : }
3349 : : }
3350 : :
3351 : 11782840 : if (loop && nested_in_vect_loop_p (loop, stmt_info))
3352 : : {
3353 : 2850 : if (dump_enabled_p ())
3354 : 141 : dump_printf_loc (MSG_NOTE, vect_location,
3355 : : "grouped access in outer loop.\n");
3356 : 2850 : return false;
3357 : : }
3358 : :
3359 : :
3360 : : /* Assume this is a DR handled by the non-constant strided load case. */
3361 : 11779990 : if (TREE_CODE (step) != INTEGER_CST)
3362 : 35672 : return (STMT_VINFO_STRIDED_P (stmt_info)
3363 : 35672 : && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
3364 : 8433 : || vect_analyze_group_access (vinfo, dr_info)));
3365 : :
3366 : : /* Not a consecutive access - check if it is part of an interleaving group. */
3367 : 11744318 : return vect_analyze_group_access (vinfo, dr_info);
3368 : : }
3369 : :
3370 : : /* Compare two data-references DRA and DRB to order them into chunks
3371 : : suitable for grouping. */
3372 : :
3373 : : static int
3374 : 317714660 : dr_group_sort_cmp (const void *dra_, const void *drb_)
3375 : : {
3376 : 317714660 : dr_vec_info *dra_info = *(dr_vec_info **)const_cast<void *>(dra_);
3377 : 317714660 : dr_vec_info *drb_info = *(dr_vec_info **)const_cast<void *>(drb_);
3378 : 317714660 : data_reference_p dra = dra_info->dr;
3379 : 317714660 : data_reference_p drb = drb_info->dr;
3380 : 317714660 : int cmp;
3381 : :
3382 : : /* Stabilize sort. */
3383 : 317714660 : if (dra == drb)
3384 : : return 0;
3385 : :
3386 : : /* DRs with different group IDs never belong to the same group. */
3387 : 317714660 : if (dra_info->group != drb_info->group)
3388 : 346186787 : return dra_info->group < drb_info->group ? -1 : 1;
3389 : :
3390 : : /* Ordering of DRs according to base. */
3391 : 90014339 : cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3392 : : DR_BASE_ADDRESS (drb));
3393 : 90014339 : if (cmp != 0)
3394 : : return cmp;
3395 : :
3396 : : /* And according to DR_OFFSET. */
3397 : 47464574 : cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
3398 : 47464574 : if (cmp != 0)
3399 : : return cmp;
3400 : :
3401 : : /* Put reads before writes. */
3402 : 47122116 : if (DR_IS_READ (dra) != DR_IS_READ (drb))
3403 : 3899653 : return DR_IS_READ (dra) ? -1 : 1;
3404 : :
3405 : : /* Then sort after access size. */
3406 : 44448706 : cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
3407 : 44448706 : TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
3408 : 44448706 : if (cmp != 0)
3409 : : return cmp;
3410 : :
3411 : : /* And after step. */
3412 : 38854200 : cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
3413 : 38854200 : if (cmp != 0)
3414 : : return cmp;
3415 : :
3416 : : /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
3417 : 38847866 : cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
3418 : 38847866 : if (cmp == 0)
3419 : 305107 : return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
3420 : : return cmp;
3421 : : }
3422 : :
3423 : : /* If OP is the result of a conversion, return the unconverted value,
3424 : : otherwise return null. */
3425 : :
3426 : : static tree
3427 : 410 : strip_conversion (tree op)
3428 : : {
3429 : 410 : if (TREE_CODE (op) != SSA_NAME)
3430 : : return NULL_TREE;
3431 : 410 : gimple *stmt = SSA_NAME_DEF_STMT (op);
3432 : 410 : if (!is_gimple_assign (stmt)
3433 : 410 : || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
3434 : : return NULL_TREE;
3435 : 300 : return gimple_assign_rhs1 (stmt);
3436 : : }
3437 : :
3438 : : /* Return true if vectorizable_* routines can handle statements STMT1_INFO
3439 : : and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can
3440 : : be grouped in SLP mode. */
3441 : :
3442 : : static bool
3443 : 6480000 : can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
3444 : : bool allow_slp_p)
3445 : : {
3446 : 6480000 : if (gimple_assign_single_p (stmt1_info->stmt))
3447 : 6479304 : return gimple_assign_single_p (stmt2_info->stmt);
3448 : :
3449 : 696 : gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
3450 : 696 : if (call1 && gimple_call_internal_p (call1))
3451 : : {
3452 : : /* Check for two masked loads or two masked stores. */
3453 : 952 : gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
3454 : 685 : if (!call2 || !gimple_call_internal_p (call2))
3455 : : return false;
3456 : 685 : internal_fn ifn = gimple_call_internal_fn (call1);
3457 : 685 : if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
3458 : : return false;
3459 : 685 : if (ifn != gimple_call_internal_fn (call2))
3460 : : return false;
3461 : :
3462 : : /* Check that the masks are the same. Cope with casts of masks,
3463 : : like those created by build_mask_conversion. */
3464 : 685 : tree mask1 = gimple_call_arg (call1, 2);
3465 : 685 : tree mask2 = gimple_call_arg (call2, 2);
3466 : 685 : if (!operand_equal_p (mask1, mask2, 0) && !allow_slp_p)
3467 : : {
3468 : 260 : mask1 = strip_conversion (mask1);
3469 : 260 : if (!mask1)
3470 : : return false;
3471 : 150 : mask2 = strip_conversion (mask2);
3472 : 150 : if (!mask2)
3473 : : return false;
3474 : 150 : if (!operand_equal_p (mask1, mask2, 0))
3475 : : return false;
3476 : : }
3477 : 429 : return true;
3478 : : }
3479 : :
3480 : : return false;
3481 : : }
3482 : :
3483 : : /* Function vect_analyze_data_ref_accesses.
3484 : :
3485 : : Analyze the access pattern of all the data references in the loop.
3486 : :
3487 : : FORNOW: the only access pattern that is considered vectorizable is a
3488 : : simple step 1 (consecutive) access.
3489 : :
3490 : : FORNOW: handle only arrays and pointer accesses. */
3491 : :
3492 : : opt_result
3493 : 2609068 : vect_analyze_data_ref_accesses (vec_info *vinfo,
3494 : : vec<int> *dataref_groups)
3495 : : {
3496 : 2609068 : unsigned int i;
3497 : 2609068 : vec<data_reference_p> datarefs = vinfo->shared->datarefs;
3498 : :
3499 : 2609068 : DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
3500 : :
3501 : 2609068 : if (datarefs.is_empty ())
3502 : 1176535 : return opt_result::success ();
3503 : :
3504 : : /* Sort the array of datarefs to make building the interleaving chains
3505 : : linear. Don't modify the original vector's order, it is needed for
3506 : : determining what dependencies are reversed. */
3507 : 1432533 : vec<dr_vec_info *> datarefs_copy;
3508 : 1432533 : datarefs_copy.create (datarefs.length ());
3509 : 15259446 : for (unsigned i = 0; i < datarefs.length (); i++)
3510 : : {
3511 : 13826913 : dr_vec_info *dr_info = vinfo->lookup_dr (datarefs[i]);
3512 : : /* If the caller computed DR grouping use that, otherwise group by
3513 : : basic blocks. */
3514 : 13826913 : if (dataref_groups)
3515 : 13073561 : dr_info->group = (*dataref_groups)[i];
3516 : : else
3517 : 753352 : dr_info->group = gimple_bb (DR_STMT (datarefs[i]))->index;
3518 : 13826913 : datarefs_copy.quick_push (dr_info);
3519 : : }
3520 : 1432533 : datarefs_copy.qsort (dr_group_sort_cmp);
3521 : 1432533 : hash_set<stmt_vec_info> to_fixup;
3522 : :
3523 : : /* Build the interleaving chains. */
3524 : 13054201 : for (i = 0; i < datarefs_copy.length () - 1;)
3525 : : {
3526 : 10189135 : dr_vec_info *dr_info_a = datarefs_copy[i];
3527 : 10189135 : data_reference_p dra = dr_info_a->dr;
3528 : 10189135 : int dra_group_id = dr_info_a->group;
3529 : 10189135 : stmt_vec_info stmtinfo_a = dr_info_a->stmt;
3530 : 10189135 : stmt_vec_info lastinfo = NULL;
3531 : 10189135 : if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
3532 : 8738038 : || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
3533 : : {
3534 : 1476732 : ++i;
3535 : 1476732 : continue;
3536 : : }
3537 : 22795812 : for (i = i + 1; i < datarefs_copy.length (); ++i)
3538 : : {
3539 : 10917648 : dr_vec_info *dr_info_b = datarefs_copy[i];
3540 : 10917648 : data_reference_p drb = dr_info_b->dr;
3541 : 10917648 : int drb_group_id = dr_info_b->group;
3542 : 10917648 : stmt_vec_info stmtinfo_b = dr_info_b->stmt;
3543 : 10917648 : if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
3544 : 10636543 : || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
3545 : : break;
3546 : :
3547 : : /* ??? Imperfect sorting (non-compatible types, non-modulo
3548 : : accesses, same accesses) can lead to a group being artificially
3549 : : split here as we don't just skip over those. If it really
3550 : : matters we can push those to a worklist and re-iterate
3551 : : over them. Then we can just skip ahead to the next DR here. */
3552 : :
3553 : : /* DRs in a different DR group should not be put into the same
3554 : : interleaving group. */
3555 : 10633647 : if (dra_group_id != drb_group_id)
3556 : : break;
3557 : :
3558 : : /* Check that the data-refs have the same first location (except init)
3559 : : and that they are both either stores or loads (not load and store,
3560 : : not masked loads or stores). */
3561 : 6799450 : if (DR_IS_READ (dra) != DR_IS_READ (drb)
3562 : 5558810 : || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
3563 : : DR_BASE_ADDRESS (drb)) != 0
3564 : 4037575 : || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
3565 : 10817739 : || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
3566 : : break;
3567 : :
3568 : : /* Check that the data-refs have the same constant size. */
3569 : 4018272 : tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
3570 : 4018272 : tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
3571 : 4018272 : if (!tree_fits_uhwi_p (sza)
3572 : 4018272 : || !tree_fits_uhwi_p (szb)
3573 : 8036544 : || !tree_int_cst_equal (sza, szb))
3574 : : break;
3575 : :
3576 : : /* Check that the data-refs have the same step. */
3577 : 3712027 : if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
3578 : : break;
3579 : :
3580 : : /* Check the types are compatible.
3581 : : ??? We don't distinguish this during sorting. */
3582 : 3711407 : if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3583 : 3711407 : TREE_TYPE (DR_REF (drb))))
3584 : : break;
3585 : :
3586 : : /* Check that the DR_INITs are compile-time constants. */
3587 : 2631721 : if (!tree_fits_shwi_p (DR_INIT (dra))
3588 : 2631721 : || !tree_fits_shwi_p (DR_INIT (drb)))
3589 : : break;
3590 : :
3591 : : /* Different .GOMP_SIMD_LANE calls still give the same lane,
3592 : : just hold extra information. */
3593 : 2631721 : if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
3594 : 1240 : && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
3595 : 2632961 : && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
3596 : : break;
3597 : :
3598 : : /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3599 : 2630481 : HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3600 : 2630481 : HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3601 : 2630481 : HOST_WIDE_INT init_prev
3602 : 2630481 : = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]->dr));
3603 : 2630481 : gcc_assert (init_a <= init_b
3604 : : && init_a <= init_prev
3605 : : && init_prev <= init_b);
3606 : :
3607 : : /* Do not place the same access in the interleaving chain twice. */
3608 : 2630481 : if (init_b == init_prev)
3609 : : {
3610 : 25531 : gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]->dr))
3611 : : < gimple_uid (DR_STMT (drb)));
3612 : : /* Simply link in duplicates and fix up the chain below. */
3613 : : }
3614 : : else
3615 : : {
3616 : : /* If init_b == init_a + the size of the type * k, we have an
3617 : : interleaving, and DRA is accessed before DRB. */
3618 : 2604950 : unsigned HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3619 : 2604950 : if (type_size_a == 0
3620 : 2604950 : || (((unsigned HOST_WIDE_INT)init_b - init_a)
3621 : 2604950 : % type_size_a != 0))
3622 : : break;
3623 : :
3624 : : /* If we have a store, the accesses are adjacent. This splits
3625 : : groups into chunks we support (we don't support vectorization
3626 : : of stores with gaps). */
3627 : 2603897 : if (!DR_IS_READ (dra)
3628 : 1685356 : && (((unsigned HOST_WIDE_INT)init_b - init_prev)
3629 : : != type_size_a))
3630 : : break;
3631 : :
3632 : : /* If the step (if not zero or non-constant) is smaller than the
3633 : : difference between data-refs' inits this splits groups into
3634 : : suitable sizes. */
3635 : 2451649 : if (tree_fits_shwi_p (DR_STEP (dra)))
3636 : : {
3637 : 2446337 : unsigned HOST_WIDE_INT step
3638 : 2446337 : = absu_hwi (tree_to_shwi (DR_STEP (dra)));
3639 : 2446337 : if (step != 0
3640 : 134220 : && step <= ((unsigned HOST_WIDE_INT)init_b - init_a))
3641 : : break;
3642 : : }
3643 : : }
3644 : :
3645 : 2461741 : if (dump_enabled_p ())
3646 : 31005 : dump_printf_loc (MSG_NOTE, vect_location,
3647 : 31005 : DR_IS_READ (dra)
3648 : : ? "Detected interleaving load %T and %T\n"
3649 : : : "Detected interleaving store %T and %T\n",
3650 : : DR_REF (dra), DR_REF (drb));
3651 : :
3652 : : /* Link the found element into the group list. */
3653 : 2461741 : if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3654 : : {
3655 : 1399608 : DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3656 : 1399608 : lastinfo = stmtinfo_a;
3657 : : }
3658 : 2461741 : DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3659 : 2461741 : DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3660 : 2461741 : lastinfo = stmtinfo_b;
3661 : :
3662 : 2461741 : if (! STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3663 : : {
3664 : 2461711 : STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3665 : 2461711 : = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3666 : :
3667 : 2461711 : if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3668 : 73 : dump_printf_loc (MSG_NOTE, vect_location,
3669 : : "Load suitable for SLP vectorization only.\n");
3670 : : }
3671 : :
3672 : 2461741 : if (init_b == init_prev
3673 : 25531 : && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3674 : 2477040 : && dump_enabled_p ())
3675 : 254 : dump_printf_loc (MSG_NOTE, vect_location,
3676 : : "Queuing group with duplicate access for fixup\n");
3677 : : }
3678 : : }
3679 : :
3680 : : /* Fixup groups with duplicate entries by splitting them. */
3681 : 1470420 : while (1)
3682 : : {
3683 : 1470420 : hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3684 : 1470420 : if (!(it != to_fixup.end ()))
3685 : : break;
3686 : 37887 : stmt_vec_info grp = *it;
3687 : 37887 : to_fixup.remove (grp);
3688 : :
3689 : : /* Find the earliest duplicate group member. */
3690 : 37887 : unsigned first_duplicate = -1u;
3691 : 37887 : stmt_vec_info next, g = grp;
3692 : 192824 : while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3693 : : {
3694 : 117050 : if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3695 : 117050 : DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3696 : 117050 : && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3697 : : first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3698 : : g = next;
3699 : : }
3700 : 37887 : if (first_duplicate == -1U)
3701 : 15299 : continue;
3702 : :
3703 : : /* Then move all stmts after the first duplicate to a new group.
3704 : : Note this is a heuristic but one with the property that *it
3705 : : is fixed up completely. */
3706 : 22588 : g = grp;
3707 : 22588 : stmt_vec_info newgroup = NULL, ng = grp;
3708 : 160334 : while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3709 : : {
3710 : 115158 : if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3711 : : {
3712 : 109899 : DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3713 : 109899 : if (!newgroup)
3714 : : {
3715 : 22588 : newgroup = next;
3716 : 22588 : STMT_VINFO_SLP_VECT_ONLY (newgroup)
3717 : 22588 : = STMT_VINFO_SLP_VECT_ONLY (grp);
3718 : : }
3719 : : else
3720 : 87311 : DR_GROUP_NEXT_ELEMENT (ng) = next;
3721 : 109899 : ng = next;
3722 : 109899 : DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3723 : : }
3724 : : else
3725 : : g = DR_GROUP_NEXT_ELEMENT (g);
3726 : : }
3727 : 22588 : DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3728 : :
3729 : : /* Fixup the new group which still may contain duplicates. */
3730 : 22588 : to_fixup.add (newgroup);
3731 : : }
3732 : :
3733 : 1432533 : dr_vec_info *dr_info;
3734 : 15243308 : FOR_EACH_VEC_ELT (datarefs_copy, i, dr_info)
3735 : : {
3736 : 13816699 : if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3737 : 13816699 : && !vect_analyze_data_ref_access (vinfo, dr_info))
3738 : : {
3739 : 6285 : if (dump_enabled_p ())
3740 : 255 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3741 : : "not vectorized: complicated access pattern.\n");
3742 : :
3743 : 6285 : if (is_a <bb_vec_info> (vinfo))
3744 : : {
3745 : : /* Mark the statement as not vectorizable. */
3746 : 361 : STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3747 : 361 : continue;
3748 : : }
3749 : : else
3750 : : {
3751 : 5924 : datarefs_copy.release ();
3752 : 5924 : return opt_result::failure_at (dr_info->stmt->stmt,
3753 : : "not vectorized:"
3754 : : " complicated access pattern.\n");
3755 : : }
3756 : : }
3757 : : }
3758 : :
3759 : 1426609 : datarefs_copy.release ();
3760 : 1426609 : return opt_result::success ();
3761 : 1432533 : }
3762 : :
3763 : : /* Function vect_vfa_segment_size.
3764 : :
3765 : : Input:
3766 : : DR_INFO: The data reference.
3767 : : LENGTH_FACTOR: segment length to consider.
3768 : :
3769 : : Return a value suitable for the dr_with_seg_len::seg_len field.
3770 : : This is the "distance travelled" by the pointer from the first
3771 : : iteration in the segment to the last. Note that it does not include
3772 : : the size of the access; in effect it only describes the first byte. */
3773 : :
3774 : : static tree
3775 : 107308 : vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3776 : : {
3777 : 107308 : length_factor = size_binop (MINUS_EXPR,
3778 : : fold_convert (sizetype, length_factor),
3779 : : size_one_node);
3780 : 107308 : return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3781 : : length_factor);
3782 : : }
3783 : :
3784 : : /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3785 : : gives the worst-case number of bytes covered by the segment. */
3786 : :
3787 : : static unsigned HOST_WIDE_INT
3788 : 107790 : vect_vfa_access_size (vec_info *vinfo, dr_vec_info *dr_info)
3789 : : {
3790 : 107790 : stmt_vec_info stmt_vinfo = dr_info->stmt;
3791 : 107790 : tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3792 : 107790 : unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3793 : 107790 : unsigned HOST_WIDE_INT access_size = ref_size;
3794 : 107790 : if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3795 : : {
3796 : 35554 : gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3797 : 35554 : access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3798 : : }
3799 : 107790 : tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3800 : 107790 : int misalignment;
3801 : 107790 : if (STMT_VINFO_VEC_STMTS (stmt_vinfo).exists ()
3802 : 107790 : && ((misalignment = dr_misalignment (dr_info, vectype)), true)
3803 : 107790 : && (vect_supportable_dr_alignment (vinfo, dr_info, vectype, misalignment)
3804 : : == dr_explicit_realign_optimized))
3805 : : {
3806 : : /* We might access a full vector's worth. */
3807 : 0 : access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3808 : : }
3809 : 107790 : return access_size;
3810 : : }
3811 : :
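[Editor's note] A worked example (made-up numbers, not GCC code) of how the two quantities above combine for the runtime alias checks: with a 4-byte step, a length factor (here the vectorization factor) of 8, and a single 4-byte ungrouped access, the segment length is 4 * (8 - 1) = 28 bytes, the access size is 4, and the segment conservatively spans 32 bytes.

#include <stdio.h>

int
main (void)
{
  /* Hypothetical data reference: 4-byte elements, step 4, VF 8.  */
  long step = 4, length_factor = 8, ref_size = 4;
  long seg_len = step * (length_factor - 1);  /* pointer travel, first->last */
  long access_size = ref_size;                /* bytes touched by one access */
  /* Prints "seg_len 28, access_size 4, bytes covered 32".  */
  printf ("seg_len %ld, access_size %ld, bytes covered %ld\n",
          seg_len, access_size, seg_len + access_size);
  return 0;
}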
3812 : : /* Get the minimum alignment for all the scalar accesses that DR_INFO
3813 : : describes. */
3814 : :
3815 : : static unsigned int
3816 : 107790 : vect_vfa_align (dr_vec_info *dr_info)
3817 : : {
3818 : 0 : return dr_alignment (dr_info->dr);
3819 : : }
3820 : :
3821 : : /* Function vect_no_alias_p.
3822 : :
3823 : : Given data references A and B with equal base and offset, see whether
3824 : : the alias relation can be decided at compilation time. Return 1 if
3825 : : it can and the references alias, 0 if it can and the references do
3826 : : not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3827 : : SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3828 : : of dr_with_seg_len::{seg_len,access_size} for A and B. */
3829 : :
3830 : : static int
3831 : 1520 : vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3832 : : tree segment_length_a, tree segment_length_b,
3833 : : unsigned HOST_WIDE_INT access_size_a,
3834 : : unsigned HOST_WIDE_INT access_size_b)
3835 : : {
3836 : 1520 : poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3837 : 1520 : poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3838 : 1520 : poly_uint64 const_length_a;
3839 : 1520 : poly_uint64 const_length_b;
3840 : :
3841 : : /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3842 : : bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3843 : : [a, a+12) */
3844 : 1520 : if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3845 : : {
3846 : 136 : const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3847 : 136 : offset_a -= const_length_a;
3848 : : }
3849 : : else
3850 : 1384 : const_length_a = tree_to_poly_uint64 (segment_length_a);
3851 : 1520 : if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3852 : : {
3853 : 290 : const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3854 : 290 : offset_b -= const_length_b;
3855 : : }
3856 : : else
3857 : 1230 : const_length_b = tree_to_poly_uint64 (segment_length_b);
3858 : :
3859 : 1520 : const_length_a += access_size_a;
3860 : 1520 : const_length_b += access_size_b;
3861 : :
3862 : 1520 : if (ranges_known_overlap_p (offset_a, const_length_a,
3863 : : offset_b, const_length_b))
3864 : : return 1;
3865 : :
3866 : 458 : if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3867 : : offset_b, const_length_b))
3868 : 458 : return 0;
3869 : :
3870 : : return -1;
3871 : : }
3872 : :
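[Editor's note] A toy version (hypothetical values, not GCC code) of the interval test the function above performs once both segments have been extended by their access sizes: two same-base references with DR_INITs 0 and 64, forward steps, segment lengths of 28 and access sizes of 4 cover the byte ranges [0, 32) and [64, 96), which cannot overlap, so the alias is resolved to "no" at compile time and no runtime check is needed for the pair.

#include <stdio.h>

/* Return 1 if the half-open byte ranges [start_a, start_a + len_a) and
   [start_b, start_b + len_b) overlap.  */
static int
ranges_overlap_p (long start_a, long len_a, long start_b, long len_b)
{
  return start_a < start_b + len_b && start_b < start_a + len_a;
}

int
main (void)
{
  long off_a = 0, off_b = 64;           /* DR_INIT offsets */
  long len_a = 28 + 4, len_b = 28 + 4;  /* segment length + access size */
  /* Prints "overlap: 0" - the references provably do not alias.  */
  printf ("overlap: %d\n", ranges_overlap_p (off_a, len_a, off_b, len_b));
  return 0;
}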
3873 : : /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3874 : : in DDR is >= VF. */
3875 : :
3876 : : static bool
3877 : 63476 : dependence_distance_ge_vf (data_dependence_relation *ddr,
3878 : : unsigned int loop_depth, poly_uint64 vf)
3879 : : {
3880 : 63476 : if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3881 : 67555 : || DDR_NUM_DIST_VECTS (ddr) == 0)
3882 : : return false;
3883 : :
3884 : : /* If the dependence is exact, we should have limited the VF instead. */
3885 : 4134 : gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3886 : :
3887 : : unsigned int i;
3888 : : lambda_vector dist_v;
3889 : 8319 : FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3890 : : {
3891 : 8264 : HOST_WIDE_INT dist = dist_v[loop_depth];
3892 : 8264 : if (dist != 0
3893 : 4134 : && !(dist > 0 && DDR_REVERSED_P (ddr))
3894 : 12398 : && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3895 : : return false;
3896 : : }
3897 : :
3898 : 55 : if (dump_enabled_p ())
3899 : 14 : dump_printf_loc (MSG_NOTE, vect_location,
3900 : : "dependence distance between %T and %T is >= VF\n",
3901 : 14 : DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3902 : :
3903 : : return true;
3904 : : }
3905 : :
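[Editor's note] The intuition behind the check above: when every dependence distance that still matters is at least VF scalar iterations, one vector iteration of VF lanes can never reach the conflicting element, so the pair needs no runtime alias test. A minimal sketch of that comparison with invented numbers:

#include <stdio.h>
#include <stdlib.h>

int
main (void)
{
  long vf = 4;                  /* assumed vectorization factor */
  long dist[] = { 8, 3 };       /* hypothetical dependence distances */
  for (unsigned i = 0; i < 2; ++i)
    /* Prints "distance 8: >= VF" then "distance 3: needs runtime check".  */
    printf ("distance %ld: %s\n", dist[i],
            labs (dist[i]) >= vf ? ">= VF" : "needs runtime check");
  return 0;
}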
3906 : : /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3907 : :
3908 : : static void
3909 : 436 : dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3910 : : {
3911 : 436 : dump_printf (dump_kind, "%s (%T) >= ",
3912 : 436 : lower_bound.unsigned_p ? "unsigned" : "abs",
3913 : 436 : lower_bound.expr);
3914 : 436 : dump_dec (dump_kind, lower_bound.min_value);
3915 : 436 : }
3916 : :
3917 : : /* Record that the vectorized loop requires the vec_lower_bound described
3918 : : by EXPR, UNSIGNED_P and MIN_VALUE. */
3919 : :
3920 : : static void
3921 : 5131 : vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3922 : : poly_uint64 min_value)
3923 : : {
3924 : 5131 : vec<vec_lower_bound> &lower_bounds
3925 : : = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3926 : 6103 : for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3927 : 4689 : if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3928 : : {
3929 : 3717 : unsigned_p &= lower_bounds[i].unsigned_p;
3930 : 3717 : min_value = upper_bound (lower_bounds[i].min_value, min_value);
3931 : 3717 : if (lower_bounds[i].unsigned_p != unsigned_p
3932 : 3717 : || maybe_lt (lower_bounds[i].min_value, min_value))
3933 : : {
3934 : 600 : lower_bounds[i].unsigned_p = unsigned_p;
3935 : 600 : lower_bounds[i].min_value = min_value;
3936 : 600 : if (dump_enabled_p ())
3937 : : {
3938 : 246 : dump_printf_loc (MSG_NOTE, vect_location,
3939 : : "updating run-time check to ");
3940 : 246 : dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3941 : 246 : dump_printf (MSG_NOTE, "\n");
3942 : : }
3943 : : }
3944 : 3717 : return;
3945 : : }
3946 : :
3947 : 1414 : vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3948 : 1414 : if (dump_enabled_p ())
3949 : : {
3950 : 190 : dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3951 : 190 : dump_lower_bound (MSG_NOTE, lower_bound);
3952 : 190 : dump_printf (MSG_NOTE, "\n");
3953 : : }
3954 : 1414 : LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3955 : : }
3956 : :
3957 : : /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3958 : : will span fewer than GAP bytes. */
3959 : :
3960 : : static bool
3961 : 3666 : vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3962 : : poly_int64 gap)
3963 : : {
3964 : 3666 : stmt_vec_info stmt_info = dr_info->stmt;
3965 : 3666 : HOST_WIDE_INT count
3966 : 3666 : = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3967 : 3666 : if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3968 : 3618 : count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3969 : 3666 : return (estimated_poly_value (gap)
3970 : 3666 : <= count * vect_get_scalar_dr_size (dr_info));
3971 : : }
3972 : :
3973 : : /* Return true if we know that there is no alias between DR_INFO_A and
3974 : : DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3975 : : When returning true, set *LOWER_BOUND_OUT to this N. */
3976 : :
3977 : : static bool
3978 : 16960 : vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3979 : : poly_uint64 *lower_bound_out)
3980 : : {
3981 : : /* Check that there is a constant gap of known sign between DR_A
3982 : : and DR_B. */
3983 : 16960 : data_reference *dr_a = dr_info_a->dr;
3984 : 16960 : data_reference *dr_b = dr_info_b->dr;
3985 : 16960 : poly_int64 init_a, init_b;
3986 : 16960 : if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3987 : 6664 : || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3988 : 6006 : || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3989 : 6004 : || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3990 : 6004 : || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3991 : 16960 : || !ordered_p (init_a, init_b))
3992 : 10956 : return false;
3993 : :
3994 : : /* Sort DR_A and DR_B by the address they access. */
3995 : 6004 : if (maybe_lt (init_b, init_a))
3996 : : {
3997 : 128 : std::swap (init_a, init_b);
3998 : 128 : std::swap (dr_info_a, dr_info_b);
3999 : 128 : std::swap (dr_a, dr_b);
4000 : : }
4001 : :
4002 : : /* If the two accesses could be dependent within a scalar iteration,
4003 : : make sure that we'd retain their order. */
4004 : 6004 : if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
4005 : 6004 : && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
4006 : : return false;
4007 : :
4008 : : /* There is no alias if abs (DR_STEP) is greater than or equal to
4009 : : the bytes spanned by the combination of the two accesses. */
4010 : 6004 : *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
4011 : 6004 : return true;
4012 : : }
4013 : :
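[Editor's note] A worked example of the bound computed above, using hypothetical values rather than GCC data structures. For two same-base references with DR_INITs 0 and 4 and a 4-byte access for the later one, lower_bound = 4 + 4 - 0 = 8, so the pair cannot alias across iterations whenever abs (DR_STEP) >= 8; vect_prune_runtime_alias_test_list below can then record a cheap runtime check on the step instead of a full segment-overlap test.

#include <stdio.h>
#include <stdlib.h>

int
main (void)
{
  long init_a = 0, init_b = 4;  /* byte offsets, sorted so init_a <= init_b */
  long size_b = 4;              /* scalar access size of the later reference */
  long lower_bound = init_b + size_b - init_a;   /* 8 */
  long step = 16;               /* hypothetical runtime value of DR_STEP */
  /* Prints "lower bound 8, step 16: no alias".  */
  printf ("lower bound %ld, step %ld: %s\n", lower_bound, step,
          labs (step) >= lower_bound ? "no alias" : "may alias");
  return 0;
}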
4014 : : /* Function vect_prune_runtime_alias_test_list.
4015 : :
4016 : : Prune a list of ddrs to be tested at run-time by versioning for alias.
4017 : : Merge several alias checks into one if possible.
4018 : : Return FALSE if the resulting list of ddrs is longer than allowed by
4019 : : PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
4020 : :
4021 : : opt_result
4022 : 314609 : vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
4023 : : {
4024 : 314609 : typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
4025 : 314609 : hash_set <tree_pair_hash> compared_objects;
4026 : :
4027 : 314609 : const vec<ddr_p> &may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
4028 : 314609 : vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
4029 : : = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
4030 : 314609 : const vec<vec_object_pair> &check_unequal_addrs
4031 : : = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
4032 : 314609 : poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4033 : 314609 : tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
4034 : :
4035 : 314609 : ddr_p ddr;
4036 : 314609 : unsigned int i;
4037 : 314609 : tree length_factor;
4038 : :
4039 : 314609 : DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
4040 : :
4041 : : /* Step values are irrelevant for aliasing if the number of vector
4042 : : iterations is equal to the number of scalar iterations (which can
4043 : : happen for fully-SLP loops). */
4044 : 314609 : bool vf_one_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
4045 : :
4046 : 314609 : if (!vf_one_p)
4047 : : {
4048 : : /* Convert the checks for nonzero steps into bound tests. */
4049 : : tree value;
4050 : 312607 : FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
4051 : 1513 : vect_check_lower_bound (loop_vinfo, value, true, 1);
4052 : : }
4053 : :
4054 : 314609 : if (may_alias_ddrs.is_empty ())
4055 : 297736 : return opt_result::success ();
4056 : :
4057 : 16873 : comp_alias_ddrs.create (may_alias_ddrs.length ());
4058 : :
4059 : 16873 : unsigned int loop_depth
4060 : 16873 : = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
4061 : 16873 : LOOP_VINFO_LOOP_NEST (loop_vinfo));
4062 : :
4063 : : /* First, we collect all data ref pairs for aliasing checks. */
4064 : 79287 : FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
4065 : : {
4066 : 63476 : poly_uint64 lower_bound;
4067 : 63476 : tree segment_length_a, segment_length_b;
4068 : 63476 : unsigned HOST_WIDE_INT access_size_a, access_size_b;
4069 : 63476 : unsigned int align_a, align_b;
4070 : :
4071 : : /* Ignore the alias if the VF we chose ended up being no greater
4072 : : than the dependence distance. */
4073 : 63476 : if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
4074 : 10039 : continue;
4075 : :
4076 : 63421 : if (DDR_OBJECT_A (ddr))
4077 : : {
4078 : 46 : vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
4079 : 46 : if (!compared_objects.add (new_pair))
4080 : : {
4081 : 14 : if (dump_enabled_p ())
4082 : 8 : dump_printf_loc (MSG_NOTE, vect_location,
4083 : : "checking that %T and %T"
4084 : : " have different addresses\n",
4085 : : new_pair.first, new_pair.second);
4086 : 14 : LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
4087 : : }
4088 : 46 : continue;
4089 : 46 : }
4090 : :
4091 : 63375 : dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
4092 : 63375 : stmt_vec_info stmt_info_a = dr_info_a->stmt;
4093 : :
4094 : 63375 : dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
4095 : 63375 : stmt_vec_info stmt_info_b = dr_info_b->stmt;
4096 : :
4097 : 63375 : bool preserves_scalar_order_p
4098 : 63375 : = vect_preserves_scalar_order_p (dr_info_a, dr_info_b);
4099 : 63375 : bool ignore_step_p
4100 : : = (vf_one_p
4101 : 63375 : && (preserves_scalar_order_p
4102 : 2889 : || operand_equal_p (DR_STEP (dr_info_a->dr),
4103 : 2889 : DR_STEP (dr_info_b->dr))));
4104 : :
4105 : : /* Skip the pair if inter-iteration dependencies are irrelevant
4106 : : and intra-iteration dependencies are guaranteed to be honored. */
4107 : 11965 : if (ignore_step_p
4108 : 6103 : && (preserves_scalar_order_p
4109 : 2603 : || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
4110 : : &lower_bound)))
4111 : : {
4112 : 5862 : if (dump_enabled_p ())
4113 : 2352 : dump_printf_loc (MSG_NOTE, vect_location,
4114 : : "no need for alias check between "
4115 : : "%T and %T when VF is 1\n",
4116 : 2352 : DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
4117 : 5862 : continue;
4118 : : }
4119 : :
4120 : : /* See whether we can handle the alias using a bounds check on
4121 : : the step, and whether that's likely to be the best approach.
4122 : : (It might not be, for example, if the minimum step is much larger
4123 : : than the number of bytes handled by one vector iteration.) */
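 : : /* A hypothetical illustration (not taken from this file): for a loop
 : : such as
 : :
 : : void f (double *x, int step, int n)
 : : {
 : : for (int i = 0; i < n; i++)
 : : x[i * step] = x[i] + 1;
 : : }
 : :
 : : a runtime check that STEP lies outside a small range around zero can
 : : stand in for a full segment-overlap test, and is usually the cheaper
 : : option when the required lower bound is small compared with the bytes
 : : covered by one vector iteration. */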
4124 : 57513 : if (!ignore_step_p
4125 : 57272 : && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
4126 : 14357 : && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
4127 : : &lower_bound)
4128 : 61155 : && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
4129 : 24 : || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
4130 : : {
4131 : 3618 : bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
4132 : 3618 : if (dump_enabled_p ())
4133 : : {
4134 : 3312 : dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
4135 : : "%T and %T when the step %T is outside ",
4136 : : DR_REF (dr_info_a->dr),
4137 : 1656 : DR_REF (dr_info_b->dr),
4138 : 1656 : DR_STEP (dr_info_a->dr));
4139 : 1656 : if (unsigned_p)
4140 : 504 : dump_printf (MSG_NOTE, "[0");
4141 : : else
4142 : : {
4143 : 1152 : dump_printf (MSG_NOTE, "(");
4144 : 1152 : dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
4145 : : }
4146 : 1656 : dump_printf (MSG_NOTE, ", ");
4147 : 1656 : dump_dec (MSG_NOTE, lower_bound);
4148 : 1656 : dump_printf (MSG_NOTE, ")\n");
4149 : : }
4150 : 3618 : vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
4151 : : unsigned_p, lower_bound);
4152 : 3618 : continue;
4153 : 3618 : }
4154 : :
4155 : 53895 : stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
4156 : 53895 : if (dr_group_first_a)
4157 : : {
4158 : 17839 : stmt_info_a = dr_group_first_a;
4159 : 17839 : dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
4160 : : }
4161 : :
4162 : 53895 : stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
4163 : 53895 : if (dr_group_first_b)
4164 : : {
4165 : 17715 : stmt_info_b = dr_group_first_b;
4166 : 17715 : dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
4167 : : }
4168 : :
4169 : 53895 : if (ignore_step_p)
4170 : : {
4171 : 241 : segment_length_a = size_zero_node;
4172 : 241 : segment_length_b = size_zero_node;
4173 : : }
4174 : : else
4175 : : {
4176 : 53654 : if (!operand_equal_p (DR_STEP (dr_info_a->dr),
4177 : 53654 : DR_STEP (dr_info_b->dr), 0))
4178 : : length_factor = scalar_loop_iters;
4179 : : else
4180 : 41707 : length_factor = size_int (vect_factor);
4181 : 53654 : segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
4182 : 53654 : segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
4183 : : }
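 : : /* Illustrative numbers only (an assumed example, not from a dump):
 : : for two unit-stride double accesses (DR_STEP of 8 bytes) and a vector
 : : factor of 4, the steps compare equal, so LENGTH_FACTOR is 4 and each
 : : segment covers on the order of 8 * 4 = 32 bytes per vector iteration;
 : : if the steps differed, the scalar iteration count would have to be
 : : used instead, giving a much more conservative segment. */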
4184 : 53895 : access_size_a = vect_vfa_access_size (loop_vinfo, dr_info_a);
4185 : 53895 : access_size_b = vect_vfa_access_size (loop_vinfo, dr_info_b);
4186 : 53895 : align_a = vect_vfa_align (dr_info_a);
4187 : 53895 : align_b = vect_vfa_align (dr_info_b);
4188 : :
4189 : : /* See whether the alias is known at compilation time. */
4190 : 53895 : if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr),
4191 : 53895 : DR_BASE_ADDRESS (dr_info_b->dr), 0)
4192 : 3065 : && operand_equal_p (DR_OFFSET (dr_info_a->dr),
4193 : 3065 : DR_OFFSET (dr_info_b->dr), 0)
4194 : 1594 : && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
4195 : 1568 : && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
4196 : 1568 : && poly_int_tree_p (segment_length_a)
4197 : 55426 : && poly_int_tree_p (segment_length_b))
4198 : : {
4199 : 1520 : int res = vect_compile_time_alias (dr_info_a, dr_info_b,
4200 : : segment_length_a,
4201 : : segment_length_b,
4202 : : access_size_a,
4203 : : access_size_b);
4204 : 1520 : if (res >= 0 && dump_enabled_p ())
4205 : : {
4206 : 232 : dump_printf_loc (MSG_NOTE, vect_location,
4207 : : "can tell at compile time that %T and %T",
4208 : 116 : DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
4209 : 116 : if (res == 0)
4210 : 57 : dump_printf (MSG_NOTE, " do not alias\n");
4211 : : else
4212 : 59 : dump_printf (MSG_NOTE, " alias\n");
4213 : : }
4214 : :
4215 : 1520 : if (res == 0)
4216 : 458 : continue;
4217 : :
4218 : 1062 : if (res == 1)
4219 : 1062 : return opt_result::failure_at (stmt_info_b->stmt,
4220 : : "not vectorized:"
4221 : : " compilation time alias: %G%G",
4222 : : stmt_info_a->stmt,
4223 : : stmt_info_b->stmt);
4224 : : }
4225 : :
4226 : 52375 : dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
4227 : 52375 : access_size_a, align_a);
4228 : 52375 : dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
4229 : 52375 : access_size_b, align_b);
4230 : : /* Canonicalize the order to be the one that's needed for accurate
4231 : : RAW, WAR and WAW flags, in cases where the data references are
4232 : : well-ordered. The order doesn't really matter otherwise,
4233 : : but we might as well be consistent. */
4234 : 52375 : if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a)
4235 : 4415 : std::swap (dr_a, dr_b);
4236 : :
4237 : 52375 : dr_with_seg_len_pair_t dr_with_seg_len_pair
4238 : : (dr_a, dr_b, (preserves_scalar_order_p
4239 : : ? dr_with_seg_len_pair_t::WELL_ORDERED
4240 : 58136 : : dr_with_seg_len_pair_t::REORDERED));
4241 : :
4242 : 52375 : comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
4243 : : }
4244 : :
4245 : 15811 : prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
4246 : :
4247 : 31622 : unsigned int count = (comp_alias_ddrs.length ()
4248 : 15811 : + check_unequal_addrs.length ());
4249 : :
4250 : 15811 : if (count
4251 : 15811 : && (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo))
4252 : : == VECT_COST_MODEL_VERY_CHEAP))
4253 : 10206 : return opt_result::failure_at
4254 : 10206 : (vect_location, "would need a runtime alias check\n");
4255 : :
4256 : 5605 : if (dump_enabled_p ())
4257 : 1854 : dump_printf_loc (MSG_NOTE, vect_location,
4258 : : "improved number of alias checks from %d to %d\n",
4259 : : may_alias_ddrs.length (), count);
4260 : 5605 : unsigned limit = param_vect_max_version_for_alias_checks;
4261 : 5605 : if (loop_cost_model (LOOP_VINFO_LOOP (loop_vinfo)) == VECT_COST_MODEL_CHEAP)
4262 : 741 : limit = param_vect_max_version_for_alias_checks * 6 / 10;
4263 : 5605 : if (count > limit)
4264 : 150 : return opt_result::failure_at
4265 : 150 : (vect_location,
4266 : : "number of versioning for alias run-time tests exceeds %d "
4267 : : "(--param vect-max-version-for-alias-checks)\n", limit);
4268 : :
4269 : 5455 : return opt_result::success ();
4270 : 314609 : }
4271 : :
4272 : : /* Check whether we can use an internal function for a gather load
4273 : : or scatter store. READ_P is true for loads and false for stores.
4274 : : MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
4275 : : the type of the memory elements being loaded or stored. OFFSET_TYPE
4276 : : is the type of the offset that is being applied to the invariant
4277 : : base address. SCALE is the amount by which the offset should
4278 : : be multiplied *after* it has been converted to address width.
4279 : :
4280 : : Return true if the function is supported, storing the function id in
4281 : : *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT.
4282 : :
4283 : : If the function is supported and ELSVALS is nonzero, store the possible else values in the vector it points to. */
4284 : :
4285 : : bool
4286 : 42146 : vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
4287 : : tree vectype, tree memory_type, tree offset_type,
4288 : : int scale, internal_fn *ifn_out,
4289 : : tree *offset_vectype_out, vec<int> *elsvals)
4290 : : {
4291 : 42146 : unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
4292 : 42146 : unsigned int element_bits = vector_element_bits (vectype);
4293 : 42146 : if (element_bits != memory_bits)
4294 : : /* For now the vector elements must be the same width as the
4295 : : memory elements. */
4296 : : return false;
4297 : :
4298 : : /* Work out which function we need. */
4299 : 42146 : internal_fn ifn, alt_ifn, alt_ifn2;
4300 : 42146 : if (read_p)
4301 : : {
4302 : 15170 : ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
4303 : : alt_ifn = IFN_MASK_GATHER_LOAD;
4304 : : /* When target supports MASK_LEN_GATHER_LOAD, we always
4305 : : use MASK_LEN_GATHER_LOAD regardless of whether len and
4306 : : mask are valid or not. */
4307 : : alt_ifn2 = IFN_MASK_LEN_GATHER_LOAD;
4308 : : }
4309 : : else
4310 : : {
4311 : 26976 : ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
4312 : 42146 : alt_ifn = IFN_MASK_SCATTER_STORE;
4313 : : /* When target supports MASK_LEN_SCATTER_STORE, we always
4314 : : use MASK_LEN_SCATTER_STORE regardless whether len and
4315 : : mask are valid or not. */
4316 : 42146 : alt_ifn2 = IFN_MASK_LEN_SCATTER_STORE;
4317 : : }
4318 : :
4319 : 302132 : for (;;)
4320 : : {
4321 : 172139 : tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
4322 : 172139 : if (!offset_vectype)
4323 : : return false;
4324 : :
4325 : : /* Test whether the target supports this combination. */
4326 : 168645 : if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
4327 : : offset_vectype, scale,
4328 : : elsvals))
4329 : : {
4330 : 0 : *ifn_out = ifn;
4331 : 0 : *offset_vectype_out = offset_vectype;
4332 : 0 : return true;
4333 : : }
4334 : 168645 : else if (!masked_p
4335 : 168645 : && internal_gather_scatter_fn_supported_p (alt_ifn, vectype,
4336 : : memory_type,
4337 : : offset_vectype,
4338 : : scale, elsvals))
4339 : : {
4340 : 0 : *ifn_out = alt_ifn;
4341 : 0 : *offset_vectype_out = offset_vectype;
4342 : 0 : return true;
4343 : : }
4344 : 168645 : else if (internal_gather_scatter_fn_supported_p (alt_ifn2, vectype,
4345 : : memory_type,
4346 : : offset_vectype, scale,
4347 : : elsvals))
4348 : : {
4349 : 0 : *ifn_out = alt_ifn2;
4350 : 0 : *offset_vectype_out = offset_vectype;
4351 : 0 : return true;
4352 : : }
4353 : :
4354 : 168645 : if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
4355 : 168645 : && TYPE_PRECISION (offset_type) >= element_bits)
4356 : : return false;
4357 : :
4358 : 129993 : offset_type = build_nonstandard_integer_type
4359 : 129993 : (TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type));
4360 : 129993 : }
4361 : : }
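 : : /* As a hypothetical example of the retry loop above: a gather like
 : :
 : : double f (double *a, unsigned char *idx, int n)
 : : {
 : : double s = 0;
 : : for (int i = 0; i < n; i++)
 : : s += a[idx[i]];
 : : return s;
 : : }
 : :
 : : starts with an 8-bit offset type; on a 64-bit target the loop would
 : : retry with 16-, 32- and 64-bit unsigned offsets until the target
 : : supports the combination or the precision reaches both POINTER_SIZE
 : : and the element width. */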
4362 : :
4363 : : /* STMT_INFO is a call to an internal gather load or scatter store function.
4364 : : Describe the operation in INFO. */
4365 : :
4366 : : static void
4367 : 0 : vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
4368 : : gather_scatter_info *info)
4369 : : {
4370 : 0 : gcall *call = as_a <gcall *> (stmt_info->stmt);
4371 : 0 : tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4372 : 0 : data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4373 : :
4374 : 0 : info->ifn = gimple_call_internal_fn (call);
4375 : 0 : info->decl = NULL_TREE;
4376 : 0 : info->base = gimple_call_arg (call, 0);
4377 : 0 : info->offset = gimple_call_arg (call, 1);
4378 : 0 : info->offset_dt = vect_unknown_def_type;
4379 : 0 : info->offset_vectype = NULL_TREE;
4380 : 0 : info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
4381 : 0 : info->element_type = TREE_TYPE (vectype);
4382 : 0 : info->memory_type = TREE_TYPE (DR_REF (dr));
4383 : 0 : }
4384 : :
4385 : : /* Return true if a non-affine read or write in STMT_INFO is suitable for a
4386 : : gather load or scatter store. Describe the operation in *INFO if so.
4387 : : If it is suitable and ELSVALS is nonzero store the supported else values
4388 : : in the vector it points to. */
4389 : :
4390 : : bool
4391 : 164171 : vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
4392 : : gather_scatter_info *info, vec<int> *elsvals)
4393 : : {
4394 : 164171 : HOST_WIDE_INT scale = 1;
4395 : 164171 : poly_int64 pbitpos, pbitsize;
4396 : 164171 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4397 : 164171 : struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4398 : 164171 : tree offtype = NULL_TREE;
4399 : 164171 : tree decl = NULL_TREE, base, off;
4400 : 164171 : tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4401 : 164171 : tree memory_type = TREE_TYPE (DR_REF (dr));
4402 : 164171 : machine_mode pmode;
4403 : 164171 : int punsignedp, reversep, pvolatilep = 0;
4404 : 164171 : internal_fn ifn;
4405 : 164171 : tree offset_vectype;
4406 : 164171 : bool masked_p = false;
4407 : :
4408 : : /* See whether this is already a call to a gather/scatter internal function.
4409 : : If not, see whether it's a masked load or store. */
4410 : 164171 : gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
4411 : 7348 : if (call && gimple_call_internal_p (call))
4412 : : {
4413 : 7348 : ifn = gimple_call_internal_fn (call);
4414 : 7348 : if (internal_gather_scatter_fn_p (ifn))
4415 : : {
4416 : 0 : vect_describe_gather_scatter_call (stmt_info, info);
4417 : :
4418 : : /* In pattern recog we simply used a ZERO else value that
4419 : : we need to correct here. To that end just re-use the
4420 : : (already successful) check if we support a gather IFN
4421 : : and have it populate the else values. */
4422 : 0 : if (DR_IS_READ (dr) && internal_fn_mask_index (ifn) >= 0 && elsvals)
4423 : 0 : supports_vec_gather_load_p (TYPE_MODE (vectype), elsvals);
4424 : 0 : return true;
4425 : : }
4426 : 7348 : masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
4427 : : }
4428 : :
4429 : : /* ??? For epilogues we adjust DR_REF to make the following stmt-based
4430 : : analysis work, but this adjustment doesn't work for epilogues of
4431 : : epilogues during transform, so disable gather/scatter in that case. */
4432 : 164171 : if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4433 : 9304 : && LOOP_VINFO_EPILOGUE_P (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)))
4434 : : return false;
4435 : :
4436 : : /* True if we should aim to use internal functions rather than
4437 : : built-in functions. */
4438 : 164147 : bool use_ifn_p = (DR_IS_READ (dr)
4439 : 164147 : ? supports_vec_gather_load_p (TYPE_MODE (vectype),
4440 : : elsvals)
4441 : 69692 : : supports_vec_scatter_store_p (TYPE_MODE (vectype)));
4442 : :
4443 : 164147 : base = DR_REF (dr);
4444 : : /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
4445 : : see if we can use the def stmt of the address. */
4446 : 164147 : if (masked_p
4447 : 7348 : && TREE_CODE (base) == MEM_REF
4448 : 7348 : && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
4449 : 7348 : && integer_zerop (TREE_OPERAND (base, 1))
4450 : 171495 : && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
4451 : : {
4452 : 7348 : gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
4453 : 7348 : if (is_gimple_assign (def_stmt)
4454 : 7348 : && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
4455 : 791 : base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
4456 : : }
4457 : :
4458 : : /* The gather and scatter builtins need address of the form
4459 : : loop_invariant + vector * {1, 2, 4, 8}
4460 : : or
4461 : : loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
4462 : : Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
4463 : : of loop invariants/SSA_NAMEs defined in the loop, with casts,
4464 : : multiplications and additions in it. To get a vector, we need
4465 : : a single SSA_NAME that will be defined in the loop and will
4466 : : contain everything that is not loop invariant and that can be
4467 : : vectorized. The following code attempts to find such a preexisting
4468 : : SSA_NAME OFF and put the loop invariants into a tree BASE
4469 : : that can be gimplified before the loop. */
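 : : /* For example (hypothetical source, not from this file), in
 : :
 : : float g (float *base, int *idx, int n)
 : : {
 : : float sum = 0;
 : : for (int i = 0; i < n; i++)
 : : sum += base[idx[i]];
 : : return sum;
 : : }
 : :
 : : the address of each load is base + (sizetype) idx[i] * sizeof (float),
 : : i.e. a loop-invariant pointer plus a loop-defined SSA_NAME scaled by a
 : : small constant, which is the shape the walk below tries to recover. */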
4470 : 164147 : base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
4471 : : &punsignedp, &reversep, &pvolatilep);
4472 : 164147 : if (reversep)
4473 : : return false;
4474 : :
4475 : : /* PR 107346. Packed structs can have fields at offsets that are not
4476 : : multiples of BITS_PER_UNIT. Do not use gather/scatters in such cases. */
4477 : 164147 : if (!multiple_p (pbitpos, BITS_PER_UNIT))
4478 : : return false;
4479 : :
4480 : : /* We need to be able to form an address of the base, which for example
4481 : : isn't possible for hard registers. */
4482 : 164147 : if (may_be_nonaddressable_p (base))
4483 : : return false;
4484 : :
4485 : 164139 : poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
4486 : :
4487 : 164139 : if (TREE_CODE (base) == MEM_REF)
4488 : : {
4489 : 107958 : if (!integer_zerop (TREE_OPERAND (base, 1)))
4490 : : {
4491 : 1895 : if (off == NULL_TREE)
4492 : 1750 : off = wide_int_to_tree (sizetype, mem_ref_offset (base));
4493 : : else
4494 : 145 : off = size_binop (PLUS_EXPR, off,
4495 : : fold_convert (sizetype, TREE_OPERAND (base, 1)));
4496 : : }
4497 : 107958 : base = TREE_OPERAND (base, 0);
4498 : : }
4499 : : else
4500 : 56181 : base = build_fold_addr_expr (base);
4501 : :
4502 : 164139 : if (off == NULL_TREE)
4503 : 95020 : off = size_zero_node;
4504 : :
4505 : : /* If base is not loop invariant then either OFF is 0, in which case we
4506 : : start with just the constant offset in the loop invariant BASE and
4507 : : continue with base as OFF, or we give up.
4508 : : We could handle the latter case by gimplifying the addition of base + off
4509 : : into some SSA_NAME and using that as OFF, but for now punt. */
4510 : 164139 : if (!expr_invariant_in_loop_p (loop, base))
4511 : : {
4512 : 97265 : if (!integer_zerop (off))
4513 : : return false;
4514 : 95017 : off = base;
4515 : 95017 : base = size_int (pbytepos);
4516 : : }
4517 : : /* Otherwise put base + constant offset into the loop invariant BASE
4518 : : and continue with OFF. */
4519 : : else
4520 : : {
4521 : 66874 : base = fold_convert (sizetype, base);
4522 : 66874 : base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
4523 : : }
4524 : :
4525 : : /* OFF at this point may be either a SSA_NAME or some tree expression
4526 : : from get_inner_reference. Try to peel off loop invariants from it
4527 : : into BASE as long as possible. */
4528 : 161891 : STRIP_NOPS (off);
4529 : 539801 : while (offtype == NULL_TREE)
4530 : : {
4531 : 467431 : enum tree_code code;
4532 : 467431 : tree op0, op1, add = NULL_TREE;
4533 : :
4534 : 467431 : if (TREE_CODE (off) == SSA_NAME)
4535 : : {
4536 : 324214 : gimple *def_stmt = SSA_NAME_DEF_STMT (off);
4537 : :
4538 : 324214 : if (expr_invariant_in_loop_p (loop, off))
4539 : 16 : return false;
4540 : :
4541 : 324198 : if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
4542 : : break;
4543 : :
4544 : 304232 : op0 = gimple_assign_rhs1 (def_stmt);
4545 : 304232 : code = gimple_assign_rhs_code (def_stmt);
4546 : 304232 : op1 = gimple_assign_rhs2 (def_stmt);
4547 : : }
4548 : : else
4549 : : {
4550 : 143217 : if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
4551 : : return false;
4552 : 143217 : code = TREE_CODE (off);
4553 : 143217 : extract_ops_from_tree (off, &code, &op0, &op1);
4554 : : }
4555 : 447449 : switch (code)
4556 : : {
4557 : 126770 : case POINTER_PLUS_EXPR:
4558 : 126770 : case PLUS_EXPR:
4559 : 126770 : if (expr_invariant_in_loop_p (loop, op0))
4560 : : {
4561 : 81890 : add = op0;
4562 : 81890 : off = op1;
4563 : 117364 : do_add:
4564 : 117364 : add = fold_convert (sizetype, add);
4565 : 117364 : if (scale != 1)
4566 : 42373 : add = size_binop (MULT_EXPR, add, size_int (scale));
4567 : 117364 : base = size_binop (PLUS_EXPR, base, add);
4568 : 377910 : continue;
4569 : : }
4570 : 44880 : if (expr_invariant_in_loop_p (loop, op1))
4571 : : {
4572 : 35364 : add = op1;
4573 : 35364 : off = op0;
4574 : 35364 : goto do_add;
4575 : : }
4576 : : break;
4577 : 272 : case MINUS_EXPR:
4578 : 272 : if (expr_invariant_in_loop_p (loop, op1))
4579 : : {
4580 : 110 : add = fold_convert (sizetype, op1);
4581 : 110 : add = size_binop (MINUS_EXPR, size_zero_node, add);
4582 : 110 : off = op0;
4583 : 110 : goto do_add;
4584 : : }
4585 : : break;
4586 : 148658 : case MULT_EXPR:
4587 : 148658 : if (scale == 1 && tree_fits_shwi_p (op1))
4588 : : {
4589 : 121916 : int new_scale = tree_to_shwi (op1);
4590 : : /* Only treat this as a scaling operation if the target
4591 : : supports it for at least some offset type. */
4592 : 121916 : if (use_ifn_p
4593 : 0 : && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4594 : : masked_p, vectype, memory_type,
4595 : : signed_char_type_node,
4596 : : new_scale, &ifn,
4597 : : &offset_vectype,
4598 : : elsvals)
4599 : 121916 : && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4600 : : masked_p, vectype, memory_type,
4601 : : unsigned_char_type_node,
4602 : : new_scale, &ifn,
4603 : : &offset_vectype,
4604 : : elsvals))
4605 : : break;
4606 : 121916 : scale = new_scale;
4607 : 121916 : off = op0;
4608 : 121916 : continue;
4609 : 121916 : }
4610 : : break;
4611 : 0 : case SSA_NAME:
4612 : 0 : off = op0;
4613 : 0 : continue;
4614 : 141934 : CASE_CONVERT:
4615 : 283868 : if (!POINTER_TYPE_P (TREE_TYPE (op0))
4616 : 283868 : && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
4617 : : break;
4618 : :
4619 : : /* Don't include the conversion if the target is happy with
4620 : : the current offset type. */
4621 : 141934 : if (use_ifn_p
4622 : 0 : && TREE_CODE (off) == SSA_NAME
4623 : 0 : && !POINTER_TYPE_P (TREE_TYPE (off))
4624 : 141934 : && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
4625 : : masked_p, vectype, memory_type,
4626 : 0 : TREE_TYPE (off), scale, &ifn,
4627 : : &offset_vectype, elsvals))
4628 : : break;
4629 : :
4630 : 141934 : if (TYPE_PRECISION (TREE_TYPE (op0))
4631 : 141934 : == TYPE_PRECISION (TREE_TYPE (off)))
4632 : : {
4633 : 66260 : off = op0;
4634 : 66260 : continue;
4635 : : }
4636 : :
4637 : : /* Include the conversion if it is widening and we're using
4638 : : the IFN path, or the target can handle the offset type we
4639 : : are converting from, or the current offset size is not
4640 : : already the same as the data vector element size. */
4641 : 75674 : if ((TYPE_PRECISION (TREE_TYPE (op0))
4642 : 75674 : < TYPE_PRECISION (TREE_TYPE (off)))
4643 : 75674 : && (use_ifn_p
4644 : 75624 : || (DR_IS_READ (dr)
4645 : 43993 : ? (targetm.vectorize.builtin_gather
4646 : 43993 : && targetm.vectorize.builtin_gather (vectype,
4647 : 43993 : TREE_TYPE (op0),
4648 : : scale))
4649 : 31631 : : (targetm.vectorize.builtin_scatter
4650 : 31631 : && targetm.vectorize.builtin_scatter (vectype,
4651 : 31631 : TREE_TYPE (op0),
4652 : : scale)))
4653 : 74389 : || !operand_equal_p (TYPE_SIZE (TREE_TYPE (off)),
4654 : 74389 : TYPE_SIZE (TREE_TYPE (vectype)), 0)))
4655 : : {
4656 : 72370 : off = op0;
4657 : 72370 : offtype = TREE_TYPE (off);
4658 : 72370 : STRIP_NOPS (off);
4659 : 72370 : continue;
4660 : : }
4661 : : break;
4662 : : default:
4663 : : break;
4664 : 0 : }
4665 : : break;
4666 : : }
4667 : :
4668 : : /* If at the end OFF still isn't a SSA_NAME or isn't
4669 : : defined in the loop, punt. */
4670 : 161875 : if (TREE_CODE (off) != SSA_NAME
4671 : 161875 : || expr_invariant_in_loop_p (loop, off))
4672 : 3850 : return false;
4673 : :
4674 : 158025 : if (offtype == NULL_TREE)
4675 : 85733 : offtype = TREE_TYPE (off);
4676 : :
4677 : 158025 : if (use_ifn_p)
4678 : : {
4679 : 0 : if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
4680 : : vectype, memory_type, offtype, scale,
4681 : : &ifn, &offset_vectype, elsvals))
4682 : 0 : ifn = IFN_LAST;
4683 : : decl = NULL_TREE;
4684 : : }
4685 : : else
4686 : : {
4687 : 158025 : if (DR_IS_READ (dr))
4688 : : {
4689 : 90963 : if (targetm.vectorize.builtin_gather)
4690 : 90963 : decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
4691 : : }
4692 : : else
4693 : : {
4694 : 67062 : if (targetm.vectorize.builtin_scatter)
4695 : 67062 : decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
4696 : : }
4697 : 158025 : ifn = IFN_LAST;
4698 : : /* The offset vector type will be read from DECL when needed. */
4699 : 158025 : offset_vectype = NULL_TREE;
4700 : : }
4701 : :
4702 : 158025 : info->ifn = ifn;
4703 : 158025 : info->decl = decl;
4704 : 158025 : info->base = base;
4705 : 158025 : info->offset = off;
4706 : 158025 : info->offset_dt = vect_unknown_def_type;
4707 : 158025 : info->offset_vectype = offset_vectype;
4708 : 158025 : info->scale = scale;
4709 : 158025 : info->element_type = TREE_TYPE (vectype);
4710 : 158025 : info->memory_type = memory_type;
4711 : 158025 : return true;
4712 : : }
4713 : :
4714 : : /* Find the data references in STMT, analyze them with respect to LOOP and
4715 : : append them to DATAREFS. Return false if datarefs in this stmt cannot
4716 : : be handled. */
4717 : :
4718 : : opt_result
4719 : 30142299 : vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4720 : : vec<data_reference_p> *datarefs,
4721 : : vec<int> *dataref_groups, int group_id)
4722 : : {
4723 : : /* We can ignore clobbers for dataref analysis - they are removed during
4724 : : loop vectorization and BB vectorization checks dependences with a
4725 : : stmt walk. */
4726 : 30142299 : if (gimple_clobber_p (stmt))
4727 : 1195325 : return opt_result::success ();
4728 : :
4729 : 53936264 : if (gimple_has_volatile_ops (stmt))
4730 : 313929 : return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4731 : : stmt);
4732 : :
4733 : 28633045 : if (stmt_can_throw_internal (cfun, stmt))
4734 : 764514 : return opt_result::failure_at (stmt,
4735 : : "not vectorized:"
4736 : : " statement can throw an exception: %G",
4737 : : stmt);
4738 : :
4739 : 27868531 : auto_vec<data_reference_p, 2> refs;
4740 : 27868531 : opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4741 : 27868531 : if (!res)
4742 : 3416552 : return res;
4743 : :
4744 : 24451979 : if (refs.is_empty ())
4745 : 13936668 : return opt_result::success ();
4746 : :
4747 : 10515311 : if (refs.length () > 1)
4748 : : {
4749 : 1195887 : while (!refs.is_empty ())
4750 : 797555 : free_data_ref (refs.pop ());
4751 : 398332 : return opt_result::failure_at (stmt,
4752 : : "not vectorized: more than one "
4753 : : "data ref in stmt: %G", stmt);
4754 : : }
4755 : :
4756 : 10116979 : data_reference_p dr = refs.pop ();
4757 : 10116979 : if (gcall *call = dyn_cast <gcall *> (stmt))
4758 : 5253 : if (!gimple_call_internal_p (call)
4759 : 5253 : || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4760 : 1339 : && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4761 : : {
4762 : 2306 : free_data_ref (dr);
4763 : 2306 : return opt_result::failure_at (stmt,
4764 : : "not vectorized: dr in a call %G", stmt);
4765 : : }
4766 : :
4767 : 10114673 : if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4768 : 10114673 : && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4769 : : {
4770 : 47661 : free_data_ref (dr);
4771 : 47661 : return opt_result::failure_at (stmt,
4772 : : "not vectorized:"
4773 : : " statement is an unsupported"
4774 : : " bitfield access %G", stmt);
4775 : : }
4776 : :
4777 : 10067012 : if (DR_BASE_ADDRESS (dr)
4778 : 10033126 : && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4779 : : {
4780 : 877 : free_data_ref (dr);
4781 : 877 : return opt_result::failure_at (stmt,
4782 : : "not vectorized:"
4783 : : " base addr of dr is a constant\n");
4784 : : }
4785 : :
4786 : : /* Check whether this may be a SIMD lane access and adjust the
4787 : : DR to make it easier for us to handle it. */
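 : : /* Sketch of the kind of access meant here (the exact OpenMP lowering
 : : may differ): for
 : :
 : : void h (double *a, double *b, int n)
 : : {
 : : double t;
 : : #pragma omp simd lastprivate(t)
 : : for (int i = 0; i < n; i++)
 : : {
 : : t = a[i] + 1.0;
 : : b[i] = t * t;
 : : }
 : : }
 : :
 : : the privatized T can be lowered to an array indexed by
 : : .GOMP_SIMD_LANE (simduid), so the reference initially has no usable
 : : DR_BASE_ADDRESS/DR_STEP; the code below recognizes that form and
 : : rewrites the DR with a step equal to the element size. */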
4788 : 10066135 : if (loop
4789 : 454104 : && loop->simduid
4790 : 10714 : && (!DR_BASE_ADDRESS (dr)
4791 : 2963 : || !DR_OFFSET (dr)
4792 : 2963 : || !DR_INIT (dr)
4793 : 2963 : || !DR_STEP (dr)))
4794 : : {
4795 : 7751 : struct data_reference *newdr
4796 : 7751 : = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4797 : 7751 : DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4798 : 7751 : if (DR_BASE_ADDRESS (newdr)
4799 : 7751 : && DR_OFFSET (newdr)
4800 : 7751 : && DR_INIT (newdr)
4801 : 7751 : && DR_STEP (newdr)
4802 : 7751 : && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4803 : 15502 : && integer_zerop (DR_STEP (newdr)))
4804 : : {
4805 : 7751 : tree base_address = DR_BASE_ADDRESS (newdr);
4806 : 7751 : tree off = DR_OFFSET (newdr);
4807 : 7751 : tree step = ssize_int (1);
4808 : 7751 : if (integer_zerop (off)
4809 : 7751 : && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4810 : : {
4811 : 89 : off = TREE_OPERAND (base_address, 1);
4812 : 89 : base_address = TREE_OPERAND (base_address, 0);
4813 : : }
4814 : 7751 : STRIP_NOPS (off);
4815 : 7751 : if (TREE_CODE (off) == MULT_EXPR
4816 : 7751 : && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4817 : : {
4818 : 7501 : step = TREE_OPERAND (off, 1);
4819 : 7501 : off = TREE_OPERAND (off, 0);
4820 : 7501 : STRIP_NOPS (off);
4821 : : }
4822 : 544 : if (CONVERT_EXPR_P (off)
4823 : 7751 : && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4824 : 7207 : < TYPE_PRECISION (TREE_TYPE (off))))
4825 : 7207 : off = TREE_OPERAND (off, 0);
4826 : 7751 : if (TREE_CODE (off) == SSA_NAME)
4827 : : {
4828 : 7223 : gimple *def = SSA_NAME_DEF_STMT (off);
4829 : : /* Look through widening conversion. */
4830 : 7223 : if (is_gimple_assign (def)
4831 : 7223 : && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4832 : : {
4833 : 0 : tree rhs1 = gimple_assign_rhs1 (def);
4834 : 0 : if (TREE_CODE (rhs1) == SSA_NAME
4835 : 0 : && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4836 : 0 : && (TYPE_PRECISION (TREE_TYPE (off))
4837 : 0 : > TYPE_PRECISION (TREE_TYPE (rhs1))))
4838 : 0 : def = SSA_NAME_DEF_STMT (rhs1);
4839 : : }
4840 : 7223 : if (is_gimple_call (def)
4841 : 7084 : && gimple_call_internal_p (def)
4842 : 14307 : && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4843 : : {
4844 : 7084 : tree arg = gimple_call_arg (def, 0);
4845 : 7084 : tree reft = TREE_TYPE (DR_REF (newdr));
4846 : 7084 : gcc_assert (TREE_CODE (arg) == SSA_NAME);
4847 : 7084 : arg = SSA_NAME_VAR (arg);
4848 : 7084 : if (arg == loop->simduid
4849 : : /* For now. */
4850 : 7084 : && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4851 : : {
4852 : 7059 : DR_BASE_ADDRESS (newdr) = base_address;
4853 : 7059 : DR_OFFSET (newdr) = ssize_int (0);
4854 : 7059 : DR_STEP (newdr) = step;
4855 : 7059 : DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4856 : 7059 : DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4857 : : /* Mark as simd-lane access. */
4858 : 7059 : tree arg2 = gimple_call_arg (def, 1);
4859 : 7059 : newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
4860 : 7059 : free_data_ref (dr);
4861 : 7059 : datarefs->safe_push (newdr);
4862 : 7059 : if (dataref_groups)
4863 : 0 : dataref_groups->safe_push (group_id);
4864 : 7059 : return opt_result::success ();
4865 : : }
4866 : : }
4867 : : }
4868 : : }
4869 : 692 : free_data_ref (newdr);
4870 : : }
4871 : :
4872 : 10059076 : datarefs->safe_push (dr);
4873 : 10059076 : if (dataref_groups)
4874 : 9612031 : dataref_groups->safe_push (group_id);
4875 : 10059076 : return opt_result::success ();
4876 : 27868531 : }
4877 : :
4878 : : /* Function vect_analyze_data_refs.
4879 : :
4880 : : Find all the data references in the loop or basic block.
4881 : :
4882 : : The general structure of the analysis of data refs in the vectorizer is as
4883 : : follows:
4884 : : 1- vect_analyze_data_refs(loop/bb): call
4885 : : compute_data_dependences_for_loop/bb to find and analyze all data-refs
4886 : : in the loop/bb and their dependences.
4887 : : 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4888 : : 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4889 : : 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4890 : :
4891 : : */
4892 : :
4893 : : opt_result
4894 : 2659054 : vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4895 : : {
4896 : 2659054 : class loop *loop = NULL;
4897 : 2659054 : unsigned int i;
4898 : 2659054 : struct data_reference *dr;
4899 : 2659054 : tree scalar_type;
4900 : :
4901 : 2659054 : DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4902 : :
4903 : 2659054 : if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4904 : 349047 : loop = LOOP_VINFO_LOOP (loop_vinfo);
4905 : :
4906 : : /* Go through the data-refs, check that the analysis succeeded. Update
4907 : : pointer from stmt_vec_info struct to DR and vectype. */
4908 : :
4909 : 2659054 : vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4910 : 16496558 : FOR_EACH_VEC_ELT (datarefs, i, dr)
4911 : : {
4912 : 13887490 : enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4913 : 13887490 : poly_uint64 vf;
4914 : :
4915 : 13887490 : gcc_assert (DR_REF (dr));
4916 : 13887490 : stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4917 : 13887490 : gcc_assert (!stmt_info->dr_aux.dr);
4918 : 13887490 : stmt_info->dr_aux.dr = dr;
4919 : 13887490 : stmt_info->dr_aux.stmt = stmt_info;
4920 : :
4921 : : /* Check that analysis of the data-ref succeeded. */
4922 : 13887490 : if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4923 : 13844813 : || !DR_STEP (dr))
4924 : : {
4925 : 85354 : bool maybe_gather
4926 : 42677 : = DR_IS_READ (dr)
4927 : 42677 : && !TREE_THIS_VOLATILE (DR_REF (dr));
4928 : 85354 : bool maybe_scatter
4929 : : = DR_IS_WRITE (dr)
4930 : 42677 : && !TREE_THIS_VOLATILE (DR_REF (dr));
4931 : :
4932 : : /* If target supports vector gather loads or scatter stores,
4933 : : see if they can be used. */
4934 : 42677 : if (is_a <loop_vec_info> (vinfo)
4935 : 42677 : && !nested_in_vect_loop_p (loop, stmt_info))
4936 : : {
4937 : 39160 : if (maybe_gather || maybe_scatter)
4938 : : {
4939 : 39160 : if (maybe_gather)
4940 : : gatherscatter = GATHER;
4941 : : else
4942 : 12523 : gatherscatter = SCATTER;
4943 : : }
4944 : : }
4945 : :
4946 : 12523 : if (gatherscatter == SG_NONE)
4947 : : {
4948 : 3517 : if (dump_enabled_p ())
4949 : 5 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4950 : : "not vectorized: data ref analysis "
4951 : : "failed %G", stmt_info->stmt);
4952 : 3517 : if (is_a <bb_vec_info> (vinfo))
4953 : : {
4954 : : /* In BB vectorization the ref can still participate
4955 : : in dependence analysis, we just can't vectorize it. */
4956 : 2903 : STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4957 : 1573517 : continue;
4958 : : }
4959 : 614 : return opt_result::failure_at (stmt_info->stmt,
4960 : : "not vectorized:"
4961 : : " data ref analysis failed: %G",
4962 : : stmt_info->stmt);
4963 : : }
4964 : : }
4965 : :
4966 : : /* See if this was detected as SIMD lane access. */
4967 : 13883973 : if (dr->aux == (void *)-1
4968 : 13883973 : || dr->aux == (void *)-2
4969 : 13875030 : || dr->aux == (void *)-3
4970 : 13874190 : || dr->aux == (void *)-4)
4971 : : {
4972 : 10583 : if (nested_in_vect_loop_p (loop, stmt_info))
4973 : 0 : return opt_result::failure_at (stmt_info->stmt,
4974 : : "not vectorized:"
4975 : : " data ref analysis failed: %G",
4976 : : stmt_info->stmt);
4977 : 10583 : STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4978 : 10583 : = -(uintptr_t) dr->aux;
4979 : : }
4980 : :
4981 : 13883973 : tree base = get_base_address (DR_REF (dr));
4982 : 13883973 : if (base && VAR_P (base) && DECL_NONALIASED (base))
4983 : : {
4984 : 7399 : if (dump_enabled_p ())
4985 : 186 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4986 : : "not vectorized: base object not addressable "
4987 : : "for stmt: %G", stmt_info->stmt);
4988 : 7399 : if (is_a <bb_vec_info> (vinfo))
4989 : : {
4990 : : /* In BB vectorization the ref can still participate
4991 : : in dependence analysis, we just can't vectorize it. */
4992 : 7399 : STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4993 : 7399 : continue;
4994 : : }
4995 : 0 : return opt_result::failure_at (stmt_info->stmt,
4996 : : "not vectorized: base object not"
4997 : : " addressable for stmt: %G",
4998 : : stmt_info->stmt);
4999 : : }
5000 : :
5001 : 13876574 : if (is_a <loop_vec_info> (vinfo)
5002 : 813315 : && DR_STEP (dr)
5003 : 14650729 : && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
5004 : : {
5005 : 40868 : if (nested_in_vect_loop_p (loop, stmt_info))
5006 : 350 : return opt_result::failure_at (stmt_info->stmt,
5007 : : "not vectorized: "
5008 : : "not suitable for strided load %G",
5009 : : stmt_info->stmt);
5010 : 40518 : STMT_VINFO_STRIDED_P (stmt_info) = true;
5011 : : }
5012 : :
5013 : : /* Update DR field in stmt_vec_info struct. */
5014 : :
5015 : : /* If the dataref is in an inner-loop of the loop that is considered
5016 : : for vectorization, we also want to analyze the access relative to
5017 : : the outer-loop (DR contains information only relative to the
5018 : : inner-most enclosing loop). We do that by building a reference to the
5019 : : first location accessed by the inner-loop, and analyze it relative to
5020 : : the outer-loop. */
5021 : 13876224 : if (loop && nested_in_vect_loop_p (loop, stmt_info))
5022 : : {
5023 : : /* Build a reference to the first location accessed by the
5024 : : inner loop: *(BASE + INIT + OFFSET). By construction,
5025 : : this address must be invariant in the inner loop, so we
5026 : : can consider it as being used in the outer loop. */
5027 : 11019 : tree base = unshare_expr (DR_BASE_ADDRESS (dr));
5028 : 11019 : tree offset = unshare_expr (DR_OFFSET (dr));
5029 : 11019 : tree init = unshare_expr (DR_INIT (dr));
5030 : 11019 : tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
5031 : : init, offset);
5032 : 11019 : tree init_addr = fold_build_pointer_plus (base, init_offset);
5033 : 11019 : tree init_ref = build_fold_indirect_ref (init_addr);
5034 : :
5035 : 11019 : if (dump_enabled_p ())
5036 : 1142 : dump_printf_loc (MSG_NOTE, vect_location,
5037 : : "analyze in outer loop: %T\n", init_ref);
5038 : :
5039 : 11019 : opt_result res
5040 : 11019 : = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
5041 : 11019 : init_ref, loop, stmt_info->stmt);
5042 : 11019 : if (!res)
5043 : : /* dr_analyze_innermost already explained the failure. */
5044 : 154 : return res;
5045 : :
5046 : 10865 : if (dump_enabled_p ())
5047 : 1142 : dump_printf_loc (MSG_NOTE, vect_location,
5048 : : "\touter base_address: %T\n"
5049 : : "\touter offset from base address: %T\n"
5050 : : "\touter constant offset from base address: %T\n"
5051 : : "\touter step: %T\n"
5052 : : "\touter base alignment: %d\n\n"
5053 : : "\touter base misalignment: %d\n"
5054 : : "\touter offset alignment: %d\n"
5055 : : "\touter step alignment: %d\n",
5056 : : STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
5057 : : STMT_VINFO_DR_OFFSET (stmt_info),
5058 : : STMT_VINFO_DR_INIT (stmt_info),
5059 : : STMT_VINFO_DR_STEP (stmt_info),
5060 : : STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
5061 : : STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
5062 : : STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
5063 : : STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
5064 : : }
5065 : :
5066 : : /* Set vectype for STMT. */
5067 : 13876070 : scalar_type = TREE_TYPE (DR_REF (dr));
5068 : 13876070 : tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
5069 : 13876070 : if (!vectype)
5070 : : {
5071 : 1607795 : if (dump_enabled_p ())
5072 : : {
5073 : 1779 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5074 : : "not vectorized: no vectype for stmt: %G",
5075 : : stmt_info->stmt);
5076 : 1779 : dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
5077 : 1779 : dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
5078 : : scalar_type);
5079 : 1779 : dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
5080 : : }
5081 : :
5082 : 1607795 : if (is_a <bb_vec_info> (vinfo))
5083 : : {
5084 : : /* No vector type is fine, the ref can still participate
5085 : : in dependence analysis, we just can't vectorize it. */
5086 : 1563215 : STMT_VINFO_VECTORIZABLE (stmt_info) = false;
5087 : 1563215 : continue;
5088 : : }
5089 : 44580 : if (fatal)
5090 : 44580 : *fatal = false;
5091 : 44580 : return opt_result::failure_at (stmt_info->stmt,
5092 : : "not vectorized:"
5093 : : " no vectype for stmt: %G"
5094 : : " scalar_type: %T\n",
5095 : : stmt_info->stmt, scalar_type);
5096 : : }
5097 : : else
5098 : : {
5099 : 12268275 : if (dump_enabled_p ())
5100 : 81705 : dump_printf_loc (MSG_NOTE, vect_location,
5101 : : "got vectype for stmt: %G%T\n",
5102 : : stmt_info->stmt, vectype);
5103 : : }
5104 : :
5105 : : /* Adjust the minimal vectorization factor according to the
5106 : : vector type. */
5107 : 12268275 : vf = TYPE_VECTOR_SUBPARTS (vectype);
5108 : 12268275 : *min_vf = upper_bound (*min_vf, vf);
5109 : :
5110 : : /* Leave the BB vectorizer to pick the vector type later, based on
5111 : : the final dataref group size and SLP node size. */
5112 : 12268275 : if (is_a <loop_vec_info> (vinfo))
5113 : 768231 : STMT_VINFO_VECTYPE (stmt_info) = vectype;
5114 : :
5115 : 12268275 : if (gatherscatter != SG_NONE)
5116 : : {
5117 : 36726 : gather_scatter_info gs_info;
5118 : 36726 : if (!vect_check_gather_scatter (stmt_info,
5119 : : as_a <loop_vec_info> (vinfo),
5120 : : &gs_info)
5121 : 69892 : || !get_vectype_for_scalar_type (vinfo,
5122 : 33166 : TREE_TYPE (gs_info.offset)))
5123 : : {
5124 : 4288 : if (fatal)
5125 : 4288 : *fatal = false;
5126 : 4288 : return opt_result::failure_at
5127 : 4948 : (stmt_info->stmt,
5128 : : (gatherscatter == GATHER)
5129 : : ? "not vectorized: not suitable for gather load %G"
5130 : : : "not vectorized: not suitable for scatter store %G",
5131 : : stmt_info->stmt);
5132 : : }
5133 : 32438 : STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
5134 : : }
5135 : : }
5136 : :
5137 : : /* We used to stop processing and prune the list here. Verify we no
5138 : : longer need to. */
5139 : 4041601 : gcc_assert (i == datarefs.length ());
5140 : :
5141 : 2609068 : return opt_result::success ();
5142 : : }
5143 : :
5144 : :
5145 : : /* Function vect_get_new_vect_var.
5146 : :
5147 : : Returns a name for a new variable. The current naming scheme uses a
5148 : : prefix that depends on VAR_KIND ("vect", "stmp", "mask" or "vectp") for
5149 : : vectorizer-generated variables, and appends "_" followed by NAME if
5150 : : provided. */
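 : : /* For instance (assuming a scalar named "a"), vect_simple_var requests
 : : yield names like "vect_a", vect_pointer_var yields "vectp_a", and a
 : : NULL NAME yields just the bare prefix. */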
5151 : :
5152 : : tree
5153 : 1817872 : vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
5154 : : {
5155 : 1817872 : const char *prefix;
5156 : 1817872 : tree new_vect_var;
5157 : :
5158 : 1817872 : switch (var_kind)
5159 : : {
5160 : : case vect_simple_var:
5161 : : prefix = "vect";
5162 : : break;
5163 : 21762 : case vect_scalar_var:
5164 : 21762 : prefix = "stmp";
5165 : 21762 : break;
5166 : 13107 : case vect_mask_var:
5167 : 13107 : prefix = "mask";
5168 : 13107 : break;
5169 : 1341918 : case vect_pointer_var:
5170 : 1341918 : prefix = "vectp";
5171 : 1341918 : break;
5172 : 0 : default:
5173 : 0 : gcc_unreachable ();
5174 : : }
5175 : :
5176 : 1817872 : if (name)
5177 : : {
5178 : 1007550 : char* tmp = concat (prefix, "_", name, NULL);
5179 : 1007550 : new_vect_var = create_tmp_reg (type, tmp);
5180 : 1007550 : free (tmp);
5181 : : }
5182 : : else
5183 : 810322 : new_vect_var = create_tmp_reg (type, prefix);
5184 : :
5185 : 1817872 : return new_vect_var;
5186 : : }
5187 : :
5188 : : /* Like vect_get_new_vect_var but return an SSA name. */
5189 : :
5190 : : tree
5191 : 5949 : vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
5192 : : {
5193 : 5949 : const char *prefix;
5194 : 5949 : tree new_vect_var;
5195 : :
5196 : 5949 : switch (var_kind)
5197 : : {
5198 : : case vect_simple_var:
5199 : : prefix = "vect";
5200 : : break;
5201 : 313 : case vect_scalar_var:
5202 : 313 : prefix = "stmp";
5203 : 313 : break;
5204 : 0 : case vect_pointer_var:
5205 : 0 : prefix = "vectp";
5206 : 0 : break;
5207 : 0 : default:
5208 : 0 : gcc_unreachable ();
5209 : : }
5210 : :
5211 : 5949 : if (name)
5212 : : {
5213 : 5463 : char* tmp = concat (prefix, "_", name, NULL);
5214 : 5463 : new_vect_var = make_temp_ssa_name (type, NULL, tmp);
5215 : 5463 : free (tmp);
5216 : : }
5217 : : else
5218 : 486 : new_vect_var = make_temp_ssa_name (type, NULL, prefix);
5219 : :
5220 : 5949 : return new_vect_var;
5221 : : }
5222 : :
5223 : : /* Duplicate points-to info on NAME from DR_INFO. */
5224 : :
5225 : : static void
5226 : 261041 : vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
5227 : : {
5228 : 261041 : duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
5229 : : /* DR_PTR_INFO is for a base SSA name, not including constant or
5230 : : variable offsets in the ref so its alignment info does not apply. */
5231 : 261041 : mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
5232 : 261041 : }
5233 : :
5234 : : /* Function vect_create_addr_base_for_vector_ref.
5235 : :
5236 : : Create an expression that computes the address of the first memory location
5237 : : that will be accessed for a data reference.
5238 : :
5239 : : Input:
5240 : : STMT_INFO: The statement containing the data reference.
5241 : : NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
5242 : : OFFSET: Optional. If supplied, it is added to the initial address.
5243 : : LOOP: Specify relative to which loop-nest should the address be computed.
5244 : : For example, when the dataref is in an inner-loop nested in an
5245 : : outer-loop that is now being vectorized, LOOP can be either the
5246 : : outer-loop, or the inner-loop. The first memory location accessed
5247 : : by the following dataref ('in' points to short):
5248 : :
5249 : : for (i=0; i<N; i++)
5250 : : for (j=0; j<M; j++)
5251 : : s += in[i+j]
5252 : :
5253 : : is as follows:
5254 : : if LOOP=i_loop: &in (relative to i_loop)
5255 : : if LOOP=j_loop: &in+i*2B (relative to j_loop)
5256 : :
5257 : : Output:
5258 : : 1. Return an SSA_NAME whose value is the address of the memory location of
5259 : : the first vector of the data reference.
5260 : : 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
5261 : : these statement(s) which define the returned SSA_NAME.
5262 : :
5263 : : FORNOW: We are only handling array accesses with step 1. */
5264 : :
5265 : : tree
5266 : 671071 : vect_create_addr_base_for_vector_ref (vec_info *vinfo, stmt_vec_info stmt_info,
5267 : : gimple_seq *new_stmt_list,
5268 : : tree offset)
5269 : : {
5270 : 671071 : dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5271 : 671071 : struct data_reference *dr = dr_info->dr;
5272 : 671071 : const char *base_name;
5273 : 671071 : tree addr_base;
5274 : 671071 : tree dest;
5275 : 671071 : gimple_seq seq = NULL;
5276 : 671071 : tree vect_ptr_type;
5277 : 671071 : loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5278 : 671071 : innermost_loop_behavior *drb = vect_dr_behavior (vinfo, dr_info);
5279 : :
5280 : 671071 : tree data_ref_base = unshare_expr (drb->base_address);
5281 : 671071 : tree base_offset = unshare_expr (get_dr_vinfo_offset (vinfo, dr_info, true));
5282 : 671071 : tree init = unshare_expr (drb->init);
5283 : :
5284 : 671071 : if (loop_vinfo)
5285 : 116745 : base_name = get_name (data_ref_base);
5286 : : else
5287 : : {
5288 : 554326 : base_offset = ssize_int (0);
5289 : 554326 : init = ssize_int (0);
5290 : 554326 : base_name = get_name (DR_REF (dr));
5291 : : }
5292 : :
5293 : : /* Create base_offset */
5294 : 671071 : base_offset = size_binop (PLUS_EXPR,
5295 : : fold_convert (sizetype, base_offset),
5296 : : fold_convert (sizetype, init));
5297 : :
5298 : 671071 : if (offset)
5299 : : {
5300 : 3028 : offset = fold_convert (sizetype, offset);
5301 : 3028 : base_offset = fold_build2 (PLUS_EXPR, sizetype,
5302 : : base_offset, offset);
5303 : : }
5304 : :
5305 : : /* base + base_offset */
5306 : 671071 : if (loop_vinfo)
5307 : 116745 : addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
5308 : : else
5309 : 1108652 : addr_base = build1 (ADDR_EXPR,
5310 : 554326 : build_pointer_type (TREE_TYPE (DR_REF (dr))),
5311 : : /* Strip zero offset components since we don't need
5312 : : them and they can confuse late diagnostics if
5313 : : we CSE them wrongly. See PR106904 for example. */
5314 : : unshare_expr (strip_zero_offset_components
5315 : : (DR_REF (dr))));
5316 : :
5317 : 671071 : vect_ptr_type = build_pointer_type (TREE_TYPE (DR_REF (dr)));
5318 : 671071 : dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
5319 : 671071 : addr_base = force_gimple_operand (addr_base, &seq, true, dest);
5320 : 671071 : gimple_seq_add_seq (new_stmt_list, seq);
5321 : :
5322 : 671071 : if (DR_PTR_INFO (dr)
5323 : 161478 : && TREE_CODE (addr_base) == SSA_NAME
5324 : : /* We should only duplicate pointer info to newly created SSA names. */
5325 : 832105 : && SSA_NAME_VAR (addr_base) == dest)
5326 : : {
5327 : 136293 : gcc_assert (!SSA_NAME_PTR_INFO (addr_base));
5328 : 136293 : vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
5329 : : }
5330 : :
5331 : 671071 : if (dump_enabled_p ())
5332 : 23818 : dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
5333 : :
5334 : 671071 : return addr_base;
5335 : : }
5336 : :
5337 : :
5338 : : /* Function vect_create_data_ref_ptr.
5339 : :
5340 : : Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
5341 : : location accessed in the loop by STMT_INFO, along with the def-use update
5342 : : chain to appropriately advance the pointer through the loop iterations.
5343 : : Also set aliasing information for the pointer. This pointer is used by
5344 : : the callers to this function to create a memory reference expression for
5345 : : vector load/store access.
5346 : :
5347 : : Input:
5348 : : 1. STMT_INFO: a stmt that references memory. Expected to be of the form
5349 : : GIMPLE_ASSIGN <name, data-ref> or
5350 : : GIMPLE_ASSIGN <data-ref, name>.
5351 : : 2. AGGR_TYPE: the type of the reference, which should be either a vector
5352 : : or an array.
5353 : : 3. AT_LOOP: the loop where the vector memref is to be created.
5354 : : 4. OFFSET (optional): a byte offset to be added to the initial address
5355 : : accessed by the data-ref in STMT_INFO.
5356 : : 5. GSI: location where the new stmts are to be placed if there is no loop
5357 : : 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
5358 : : pointing to the initial address.
5359 : : 7. IV_STEP (optional, defaults to NULL): the amount that should be added
5360 : : to the IV during each iteration of the loop. NULL says to move
5361 : : by one copy of AGGR_TYPE up or down, depending on the step of the
5362 : : data reference.
5363 : :
5364 : : Output:
5365 : : 1. Declare a new ptr to vector_type, and have it point to the base of the
5366 : : data reference (initial address accessed by the data reference).
5367 : : For example, for vector of type V8HI, the following code is generated:
5368 : :
5369 : : v8hi *ap;
5370 : : ap = (v8hi *)initial_address;
5371 : :
5372 : : if OFFSET is not supplied:
5373 : : initial_address = &a[init];
5374 : : if OFFSET is supplied:
5375 : : initial_address = &a[init] + OFFSET;
5378 : :
5379 : : Return the initial_address in INITIAL_ADDRESS.
5380 : :
5381 : : 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
5382 : : update the pointer in each iteration of the loop.
5383 : :
5384 : : Return the increment stmt that updates the pointer in PTR_INCR.
5385 : :
5386 : : 3. Return the pointer. */
5387 : :
5388 : : tree
5389 : 670847 : vect_create_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
5390 : : tree aggr_type, class loop *at_loop, tree offset,
5391 : : tree *initial_address, gimple_stmt_iterator *gsi,
5392 : : gimple **ptr_incr, bool only_init,
5393 : : tree iv_step)
5394 : : {
5395 : 670847 : const char *base_name;
5396 : 670847 : loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5397 : 670847 : class loop *loop = NULL;
5398 : 670847 : bool nested_in_vect_loop = false;
5399 : 670847 : class loop *containing_loop = NULL;
5400 : 670847 : tree aggr_ptr_type;
5401 : 670847 : tree aggr_ptr;
5402 : 670847 : tree new_temp;
5403 : 670847 : gimple_seq new_stmt_list = NULL;
5404 : 670847 : edge pe = NULL;
5405 : 670847 : basic_block new_bb;
5406 : 670847 : tree aggr_ptr_init;
5407 : 670847 : dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5408 : 670847 : struct data_reference *dr = dr_info->dr;
5409 : 670847 : tree aptr;
5410 : 670847 : gimple_stmt_iterator incr_gsi;
5411 : 670847 : bool insert_after;
5412 : 670847 : tree indx_before_incr, indx_after_incr;
5413 : 670847 : gimple *incr;
5414 : 670847 : bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
5415 : :
5416 : 670847 : gcc_assert (iv_step != NULL_TREE
5417 : : || TREE_CODE (aggr_type) == ARRAY_TYPE
5418 : : || TREE_CODE (aggr_type) == VECTOR_TYPE);
5419 : :
5420 : 670847 : if (loop_vinfo)
5421 : : {
5422 : 116521 : loop = LOOP_VINFO_LOOP (loop_vinfo);
5423 : 116521 : nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5424 : 116521 : containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5425 : 116521 : pe = loop_preheader_edge (loop);
5426 : : }
5427 : : else
5428 : : {
5429 : 554326 : gcc_assert (bb_vinfo);
5430 : 554326 : only_init = true;
5431 : 554326 : *ptr_incr = NULL;
5432 : : }
5433 : :
5434 : : /* Create an expression for the first address accessed by this load
5435 : : in LOOP. */
5436 : 670847 : base_name = get_name (DR_BASE_ADDRESS (dr));
5437 : :
5438 : 670847 : if (dump_enabled_p ())
5439 : : {
5440 : 23729 : tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
5441 : 23729 : dump_printf_loc (MSG_NOTE, vect_location,
5442 : : "create %s-pointer variable to type: %T",
5443 : 23729 : get_tree_code_name (TREE_CODE (aggr_type)),
5444 : : aggr_type);
5445 : 23729 : if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
5446 : 13315 : dump_printf (MSG_NOTE, " vectorizing an array ref: ");
5447 : 10414 : else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
5448 : 0 : dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
5449 : 10414 : else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
5450 : 1537 : dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
5451 : : else
5452 : 8877 : dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
5453 : 23729 : dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
5454 : : }
5455 : :
5456 : : /* (1) Create the new aggregate-pointer variable.
5457 : : Vector and array types inherit the alias set of their component
5458 : : type by default so we need to use a ref-all pointer if the data
5459 : : reference does not conflict with the created aggregated data
5460 : : reference because it is not addressable. */
5461 : 670847 : bool need_ref_all = false;
5462 : 670847 : if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5463 : : get_alias_set (DR_REF (dr))))
5464 : : need_ref_all = true;
5465 : : /* Likewise for any of the data references in the stmt group. */
5466 : 567524 : else if (DR_GROUP_SIZE (stmt_info) > 1)
5467 : : {
5468 : 464482 : stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
5469 : 1263068 : do
5470 : : {
5471 : 1263068 : struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
5472 : 1263068 : if (!alias_sets_conflict_p (get_alias_set (aggr_type),
5473 : : get_alias_set (DR_REF (sdr))))
5474 : : {
5475 : : need_ref_all = true;
5476 : : break;
5477 : : }
5478 : 1262040 : sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
5479 : : }
5480 : 1262040 : while (sinfo);
5481 : : }
5482 : 670847 : aggr_ptr_type = build_pointer_type_for_mode (aggr_type, VOIDmode,
5483 : : need_ref_all);
5484 : 670847 : aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
5485 : :
5486 : :
5487 : : /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
5488 : : vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
5489 : : def-use update cycles for the pointer: one relative to the outer-loop
5490 : : (LOOP), which is what steps (3) and (4) below do. The other is relative
5491 : : to the inner-loop (which is the inner-most loop containing the dataref),
5492 : : and this is done by step (5) below.
5493 : :
5494 : : When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
5495 : : inner-most loop, and so steps (3),(4) work the same, and step (5) is
5496 : : redundant. Steps (3),(4) create the following:
5497 : :
5498 : : vp0 = &base_addr;
5499 : : LOOP: vp1 = phi(vp0,vp2)
5500 : : ...
5501 : : ...
5502 : : vp2 = vp1 + step
5503 : : goto LOOP
5504 : :
5505 : : If there is an inner-loop nested in loop, then step (5) will also be
5506 : : applied, and an additional update in the inner-loop will be created:
5507 : :
5508 : : vp0 = &base_addr;
5509 : : LOOP: vp1 = phi(vp0,vp2)
5510 : : ...
5511 : : inner: vp3 = phi(vp1,vp4)
5512 : : vp4 = vp3 + inner_step
5513 : : if () goto inner
5514 : : ...
5515 : : vp2 = vp1 + step
5516 : : if () goto LOOP */
5517 : :
5518 : : /* (2) Calculate the initial address of the aggregate-pointer, and set
5519 : : the aggregate-pointer to point to it before the loop. */
5520 : :
5521 : : /* Create: (&(base[init_val]+offset) in the loop preheader. */
5522 : :
5523 : 670847 : new_temp = vect_create_addr_base_for_vector_ref (vinfo,
5524 : : stmt_info, &new_stmt_list,
5525 : : offset);
5526 : 670847 : if (new_stmt_list)
5527 : : {
5528 : 154712 : if (pe)
5529 : : {
5530 : 51969 : new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
5531 : 51969 : gcc_assert (!new_bb);
5532 : : }
5533 : : else
5534 : 102743 : gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
5535 : : }
5536 : :
5537 : 670847 : *initial_address = new_temp;
5538 : 670847 : aggr_ptr_init = new_temp;
5539 : :
5540 : : /* (3) Handle the updating of the aggregate-pointer inside the loop.
5541 : : This is needed when ONLY_INIT is false, and also when AT_LOOP is the
5542 : : inner-loop nested in LOOP (during outer-loop vectorization). */
5543 : :
5544 : : /* No update in loop is required. */
5545 : 670847 : if (only_init && (!loop_vinfo || at_loop == loop))
5546 : : aptr = aggr_ptr_init;
5547 : : else
5548 : : {
5549 : : /* Accesses to invariant addresses should be handled specially
5550 : : by the caller. */
5551 : 116513 : tree step = vect_dr_behavior (vinfo, dr_info)->step;
5552 : 116513 : gcc_assert (!integer_zerop (step));
5553 : :
5554 : 116513 : if (iv_step == NULL_TREE)
5555 : : {
5556 : : /* The step of the aggregate pointer is the type size,
5557 : : negated for downward accesses. */
5558 : 0 : iv_step = TYPE_SIZE_UNIT (aggr_type);
5559 : 0 : if (tree_int_cst_sgn (step) == -1)
5560 : 0 : iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
5561 : : }
5562 : :
5563 : 116513 : standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5564 : :
5565 : 116513 : create_iv (aggr_ptr_init, PLUS_EXPR,
5566 : : fold_convert (aggr_ptr_type, iv_step),
5567 : : aggr_ptr, loop, &incr_gsi, insert_after,
5568 : : &indx_before_incr, &indx_after_incr);
5569 : 116513 : incr = gsi_stmt (incr_gsi);
5570 : :
5571 : : /* Copy the points-to information if it exists. */
5572 : 116513 : if (DR_PTR_INFO (dr))
5573 : : {
5574 : 62300 : vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5575 : 62300 : vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5576 : : }
5577 : 116513 : if (ptr_incr)
5578 : 116513 : *ptr_incr = incr;
5579 : :
5580 : 116513 : aptr = indx_before_incr;
5581 : : }
5582 : :
5583 : 670847 : if (!nested_in_vect_loop || only_init)
5584 : : return aptr;
5585 : :
5586 : :
5587 : : /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
5588 : : nested in LOOP, if one exists. */
5589 : :
5590 : 337 : gcc_assert (nested_in_vect_loop);
5591 : 337 : if (!only_init)
5592 : : {
5593 : 337 : standard_iv_increment_position (containing_loop, &incr_gsi,
5594 : : &insert_after);
5595 : 337 : create_iv (aptr, PLUS_EXPR, fold_convert (aggr_ptr_type, DR_STEP (dr)),
5596 : : aggr_ptr, containing_loop, &incr_gsi, insert_after,
5597 : : &indx_before_incr, &indx_after_incr);
5598 : 337 : incr = gsi_stmt (incr_gsi);
5599 : :
5600 : : /* Copy the points-to information if it exists. */
5601 : 337 : if (DR_PTR_INFO (dr))
5602 : : {
5603 : 74 : vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
5604 : 74 : vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
5605 : : }
5606 : 337 : if (ptr_incr)
5607 : 337 : *ptr_incr = incr;
5608 : :
5609 : 337 : return indx_before_incr;
5610 : : }
5611 : : else
5612 : : gcc_unreachable ();
5613 : : }
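/* Illustrative sketch (added commentary, not part of the GCC sources): the
   pointer IV built by steps (2) and (3) above corresponds roughly to the
   following scalar C loop, assuming a 16-byte AGGR_TYPE (e.g. a vector of
   four ints) and an upward, unit-stride access; names such as v4si_t are
   purely illustrative:

     char *vp = (char *) &base[init_val] + offset;   // (2) in the preheader
     for (;;)
       {
         ... access *(v4si_t *) vp ...               // the vectorized ref
         vp += 16;                                   // (3) bump by TYPE_SIZE_UNIT (aggr_type)
       }

   For downward accesses IV_STEP is negated, and for outer-loop
   vectorization step (5) layers a second IV, stepping by DR_STEP, inside
   the inner loop.  */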
5614 : :
5615 : :
5616 : : /* Function bump_vector_ptr
5617 : :
5618 : : Increment a pointer (to a vector type) by vector-size. If requested,
5619 : : i.e. if PTR-INCR is given, then also connect the new increment stmt
5620 : : to the existing def-use update-chain of the pointer, by modifying
5621 : : the PTR_INCR as illustrated below:
5622 : :
5623 : : The pointer def-use update-chain before this function:
5624 : : DATAREF_PTR = phi (p_0, p_2)
5625 : : ....
5626 : : PTR_INCR: p_2 = DATAREF_PTR + step
5627 : :
5628 : : The pointer def-use update-chain after this function:
5629 : : DATAREF_PTR = phi (p_0, p_2)
5630 : : ....
5631 : : NEW_DATAREF_PTR = DATAREF_PTR + BUMP
5632 : : ....
5633 : : PTR_INCR: p_2 = NEW_DATAREF_PTR + step
5634 : :
5635 : : Input:
5636 : : DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
5637 : : in the loop.
5638 : : PTR_INCR - optional. The stmt that updates the pointer in each iteration of
5639 : : the loop. The increment amount across iterations is expected
5640 : : to be vector_size.
5641 : : BSI - location where the new update stmt is to be placed.
5642 : : STMT_INFO - the original scalar memory-access stmt that is being vectorized.
5643 : : BUMP - optional. The offset by which to bump the pointer. If not given,
5644 : : the offset is assumed to be vector_size.
5645 : :
5646 : : Output: Return NEW_DATAREF_PTR as illustrated above.
5647 : :
5648 : : */
5649 : :
5650 : : tree
5651 : 206037 : bump_vector_ptr (vec_info *vinfo,
5652 : : tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
5653 : : stmt_vec_info stmt_info, tree bump)
5654 : : {
5655 : 206037 : struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5656 : 206037 : tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5657 : 206037 : tree update = TYPE_SIZE_UNIT (vectype);
5658 : 206037 : gimple *incr_stmt;
5659 : 206037 : ssa_op_iter iter;
5660 : 206037 : use_operand_p use_p;
5661 : 206037 : tree new_dataref_ptr;
5662 : :
5663 : 206037 : if (bump)
5664 : 206037 : update = bump;
5665 : :
5666 : 206037 : if (TREE_CODE (dataref_ptr) == SSA_NAME)
5667 : 95937 : new_dataref_ptr = copy_ssa_name (dataref_ptr);
5668 : 110100 : else if (is_gimple_min_invariant (dataref_ptr))
5669 : : /* When possible avoid emitting a separate increment stmt that will
5670 : : force the addressed object to be addressable. */
5671 : 220200 : return build1 (ADDR_EXPR, TREE_TYPE (dataref_ptr),
5672 : 110100 : fold_build2 (MEM_REF,
5673 : : TREE_TYPE (TREE_TYPE (dataref_ptr)),
5674 : : dataref_ptr,
5675 : 110100 : fold_convert (ptr_type_node, update)));
5676 : : else
5677 : 0 : new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
5678 : 95937 : incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
5679 : : dataref_ptr, update);
5680 : 95937 : vect_finish_stmt_generation (vinfo, stmt_info, incr_stmt, gsi);
5681 : : /* Fold the increment, avoiding excessive use-def chains that would
5682 : : otherwise cause compile-time issues for passes run before the next
5683 : : forwprop pass (which would do this folding as well). */
5684 : 95937 : gimple_stmt_iterator fold_gsi = gsi_for_stmt (incr_stmt);
5685 : 95937 : if (fold_stmt (&fold_gsi, follow_all_ssa_edges))
5686 : : {
5687 : 63216 : incr_stmt = gsi_stmt (fold_gsi);
5688 : 63216 : update_stmt (incr_stmt);
5689 : : }
5690 : :
5691 : : /* Copy the points-to information if it exists. */
5692 : 95937 : if (DR_PTR_INFO (dr))
5693 : : {
5694 : 53376 : duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
5695 : 53376 : mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5696 : : }
5697 : :
5698 : 95937 : if (!ptr_incr)
5699 : : return new_dataref_ptr;
5700 : :
5701 : : /* Update the vector-pointer's cross-iteration increment. */
5702 : 76162 : FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5703 : : {
5704 : 38081 : tree use = USE_FROM_PTR (use_p);
5705 : :
5706 : 38081 : if (use == dataref_ptr)
5707 : 38081 : SET_USE (use_p, new_dataref_ptr);
5708 : : else
5709 : 0 : gcc_assert (operand_equal_p (use, update, 0));
5710 : : }
5711 : :
5712 : : return new_dataref_ptr;
5713 : : }
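/* Illustrative sketch (added commentary, not part of the GCC sources): for
   an invariant DATAREF_PTR the bump above is folded into the address
   expression instead of emitting a new stmt, conceptually

     new_dataref_ptr = &MEM[dataref_ptr + bump];   // ADDR_EXPR of a MEM_REF

   whereas for an SSA_NAME pointer a real increment

     new_dataref_ptr = dataref_ptr + bump;         // POINTER_PLUS_EXPR

   is emitted and PTR_INCR is rewired to use NEW_DATAREF_PTR, as shown in
   the def-use chains in the function comment.  */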
5714 : :
5715 : :
5716 : : /* Copy memory reference info such as base/clique from the SRC reference
5717 : : to the DEST MEM_REF. */
5718 : :
5719 : : void
5720 : 887892 : vect_copy_ref_info (tree dest, tree src)
5721 : : {
5722 : 887892 : if (TREE_CODE (dest) != MEM_REF)
5723 : : return;
5724 : :
5725 : : tree src_base = src;
5726 : 1938068 : while (handled_component_p (src_base))
5727 : 1050701 : src_base = TREE_OPERAND (src_base, 0);
5728 : 887367 : if (TREE_CODE (src_base) != MEM_REF
5729 : 887367 : && TREE_CODE (src_base) != TARGET_MEM_REF)
5730 : : return;
5731 : :
5732 : 408561 : MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5733 : 408561 : MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5734 : : }
5735 : :
5736 : :
5737 : : /* Function vect_create_destination_var.
5738 : :
5739 : : Create a new temporary of type VECTYPE. */
5740 : :
5741 : : tree
5742 : 459628 : vect_create_destination_var (tree scalar_dest, tree vectype)
5743 : : {
5744 : 459628 : tree vec_dest;
5745 : 459628 : const char *name;
5746 : 459628 : char *new_name;
5747 : 459628 : tree type;
5748 : 459628 : enum vect_var_kind kind;
5749 : :
5750 : 941018 : kind = vectype
5751 : 897494 : ? VECTOR_BOOLEAN_TYPE_P (vectype)
5752 : 437866 : ? vect_mask_var
5753 : : : vect_simple_var
5754 : : : vect_scalar_var;
5755 : 21762 : type = vectype ? vectype : TREE_TYPE (scalar_dest);
5756 : :
5757 : 459628 : gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5758 : :
5759 : 459628 : name = get_name (scalar_dest);
5760 : 459628 : if (name)
5761 : 164856 : new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5762 : : else
5763 : 294772 : new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5764 : 459628 : vec_dest = vect_get_new_vect_var (type, kind, new_name);
5765 : 459628 : free (new_name);
5766 : :
5767 : 459628 : return vec_dest;
5768 : : }
5769 : :
5770 : : /* Function vect_grouped_store_supported.
5771 : :
5772 : : Returns TRUE if interleave high and interleave low permutations
5773 : : are supported, and FALSE otherwise. */
5774 : :
5775 : : bool
5776 : 4035 : vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5777 : : {
5778 : 4035 : machine_mode mode = TYPE_MODE (vectype);
5779 : :
5780 : : /* vect_permute_store_chain requires the group size to be equal to 3 or
5781 : : be a power of two. */
5782 : 4035 : if (count != 3 && exact_log2 (count) == -1)
5783 : : {
5784 : 580 : if (dump_enabled_p ())
5785 : 9 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5786 : : "the size of the group of accesses"
5787 : : " is not a power of 2 or not equal to 3\n");
5788 : 580 : return false;
5789 : : }
5790 : :
5791 : : /* Check that the permutation is supported. */
5792 : 3455 : if (VECTOR_MODE_P (mode))
5793 : : {
5794 : 3455 : unsigned int i;
5795 : 3455 : if (count == 3)
5796 : : {
5797 : 1708 : unsigned int j0 = 0, j1 = 0, j2 = 0;
5798 : 1708 : unsigned int i, j;
5799 : :
5800 : 1708 : unsigned int nelt;
5801 : 3416 : if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5802 : : {
5803 : : if (dump_enabled_p ())
5804 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5805 : : "cannot handle groups of 3 stores for"
5806 : : " variable-length vectors\n");
5807 : : return false;
5808 : : }
5809 : :
5810 : 1708 : vec_perm_builder sel (nelt, nelt, 1);
5811 : 1708 : sel.quick_grow (nelt);
5812 : 1708 : vec_perm_indices indices;
5813 : 6625 : for (j = 0; j < 3; j++)
5814 : : {
5815 : 4986 : int nelt0 = ((3 - j) * nelt) % 3;
5816 : 4986 : int nelt1 = ((3 - j) * nelt + 1) % 3;
5817 : 4986 : int nelt2 = ((3 - j) * nelt + 2) % 3;
5818 : 16894 : for (i = 0; i < nelt; i++)
5819 : : {
5820 : 11908 : if (3 * i + nelt0 < nelt)
5821 : 4004 : sel[3 * i + nelt0] = j0++;
5822 : 11908 : if (3 * i + nelt1 < nelt)
5823 : 3969 : sel[3 * i + nelt1] = nelt + j1++;
5824 : 11908 : if (3 * i + nelt2 < nelt)
5825 : 3935 : sel[3 * i + nelt2] = 0;
5826 : : }
5827 : 4986 : indices.new_vector (sel, 2, nelt);
5828 : 4986 : if (!can_vec_perm_const_p (mode, mode, indices))
5829 : : {
5830 : 60 : if (dump_enabled_p ())
5831 : 37 : dump_printf (MSG_MISSED_OPTIMIZATION,
5832 : : "permutation op not supported by target.\n");
5833 : 60 : return false;
5834 : : }
5835 : :
5836 : 16314 : for (i = 0; i < nelt; i++)
5837 : : {
5838 : 11388 : if (3 * i + nelt0 < nelt)
5839 : 3802 : sel[3 * i + nelt0] = 3 * i + nelt0;
5840 : 11388 : if (3 * i + nelt1 < nelt)
5841 : 3793 : sel[3 * i + nelt1] = 3 * i + nelt1;
5842 : 11388 : if (3 * i + nelt2 < nelt)
5843 : 3793 : sel[3 * i + nelt2] = nelt + j2++;
5844 : : }
5845 : 4926 : indices.new_vector (sel, 2, nelt);
5846 : 4926 : if (!can_vec_perm_const_p (mode, mode, indices))
5847 : : {
5848 : 9 : if (dump_enabled_p ())
5849 : 9 : dump_printf (MSG_MISSED_OPTIMIZATION,
5850 : : "permutation op not supported by target.\n");
5851 : 9 : return false;
5852 : : }
5853 : : }
5854 : : return true;
5855 : 1708 : }
5856 : : else
5857 : : {
5858 : : /* If length is not equal to 3 then only power of 2 is supported. */
5859 : 1747 : gcc_assert (pow2p_hwi (count));
5860 : 3494 : poly_uint64 nelt = GET_MODE_NUNITS (mode);
5861 : :
5862 : : /* The encoding has 2 interleaved stepped patterns. */
5863 : 3494 : if(!multiple_p (nelt, 2))
5864 : 1681 : return false;
5865 : 1747 : vec_perm_builder sel (nelt, 2, 3);
5866 : 1747 : sel.quick_grow (6);
5867 : 8735 : for (i = 0; i < 3; i++)
5868 : : {
5869 : 5241 : sel[i * 2] = i;
5870 : 5241 : sel[i * 2 + 1] = i + nelt;
5871 : : }
5872 : 1747 : vec_perm_indices indices (sel, 2, nelt);
5873 : 1747 : if (can_vec_perm_const_p (mode, mode, indices))
5874 : : {
5875 : 11767 : for (i = 0; i < 6; i++)
5876 : 10086 : sel[i] += exact_div (nelt, 2);
5877 : 1681 : indices.new_vector (sel, 2, nelt);
5878 : 1681 : if (can_vec_perm_const_p (mode, mode, indices))
5879 : 1681 : return true;
5880 : : }
5881 : 1747 : }
5882 : : }
5883 : :
5884 : 66 : if (dump_enabled_p ())
5885 : 5 : dump_printf (MSG_MISSED_OPTIMIZATION,
5886 : : "permutation op not supported by target.\n");
5887 : : return false;
5888 : : }
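/* Illustrative example (added commentary, not part of the GCC sources): for
   the power-of-two case with nelt == 4 the two selectors checked above are
   the classic interleave patterns

     high: { 0, 4, 1, 5 }   // first halves of both inputs, interleaved
     low:  { 2, 6, 3, 7 }   // second halves of both inputs, interleaved

   where indices 0..3 select from the first input vector and 4..7 from the
   second.  The low selector is the high selector with nelt/2 added to every
   element, which is exactly what the loop above constructs in its
   two-pattern encoding.  */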
5889 : :
5890 : : /* Return FN if vec_{mask_,mask_len_}store_lanes is available for COUNT vectors
5891 : : of type VECTYPE. MASKED_P says whether the masked form is needed. */
5892 : :
5893 : : internal_fn
5894 : 26949 : vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5895 : : bool masked_p)
5896 : : {
5897 : 26949 : if (vect_lanes_optab_supported_p ("vec_mask_len_store_lanes",
5898 : : vec_mask_len_store_lanes_optab, vectype,
5899 : : count))
5900 : : return IFN_MASK_LEN_STORE_LANES;
5901 : 26949 : else if (masked_p)
5902 : : {
5903 : 184 : if (vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5904 : : vec_mask_store_lanes_optab, vectype,
5905 : : count))
5906 : : return IFN_MASK_STORE_LANES;
5907 : : }
5908 : : else
5909 : : {
5910 : 26765 : if (vect_lanes_optab_supported_p ("vec_store_lanes",
5911 : : vec_store_lanes_optab, vectype, count))
5912 : : return IFN_STORE_LANES;
5913 : : }
5914 : : return IFN_LAST;
5915 : : }
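/* Illustrative sketch (added commentary, not part of the GCC sources):
   semantically a store-lanes operation writes COUNT vectors to memory
   interleaved, so the element order matches the original scalar stores.
   A scalar model for COUNT == 2 with int elements:

     static void
     store_lanes_2 (int *dst, const int *v0, const int *v1, int nelt)
     {
       for (int i = 0; i < nelt; i++)
         {
           dst[2 * i]     = v0[i];
           dst[2 * i + 1] = v1[i];
         }
     }

   When one of the optabs above is available the vectorizer can use it
   instead of the explicit permutations built by vect_permute_store_chain.  */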
5916 : :
5917 : :
5918 : : /* Function vect_permute_store_chain.
5919 : :
5920 : : Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5921 : : a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5922 : : the data correctly for the stores. Return the final references for stores
5923 : : in RESULT_CHAIN.
5924 : :
5925 : : E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5926 : : The input is 4 vectors each containing 8 elements. We assign a number to
5927 : : each element, the input sequence is:
5928 : :
5929 : : 1st vec: 0 1 2 3 4 5 6 7
5930 : : 2nd vec: 8 9 10 11 12 13 14 15
5931 : : 3rd vec: 16 17 18 19 20 21 22 23
5932 : : 4th vec: 24 25 26 27 28 29 30 31
5933 : :
5934 : : The output sequence should be:
5935 : :
5936 : : 1st vec: 0 8 16 24 1 9 17 25
5937 : : 2nd vec: 2 10 18 26 3 11 19 27
5938 : : 3rd vec: 4 12 20 28 5 13 21 29
5939 : : 4th vec: 6 14 22 30 7 15 23 31
5940 : :
5941 : : i.e., we interleave the contents of the four vectors in their order.
5942 : :
5943 : : We use interleave_high/low instructions to create such output. The input of
5944 : : each interleave_high/low operation is two vectors:
5945 : : 1st vec 2nd vec
5946 : : 0 1 2 3 4 5 6 7
5947 : : the even elements of the result vector are obtained left-to-right from the
5948 : : high/low elements of the first vector. The odd elements of the result are
5949 : : obtained left-to-right from the high/low elements of the second vector.
5950 : : The output of interleave_high will be: 0 4 1 5
5951 : : and of interleave_low: 2 6 3 7
5952 : :
5953 : :
5954 : : The permutation is done in log LENGTH stages. In each stage interleave_high
5955 : : and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5956 : : where the first argument is taken from the first half of DR_CHAIN and the
5957 : : second argument from its second half.
5958 : : In our example,
5959 : :
5960 : : I1: interleave_high (1st vec, 3rd vec)
5961 : : I2: interleave_low (1st vec, 3rd vec)
5962 : : I3: interleave_high (2nd vec, 4th vec)
5963 : : I4: interleave_low (2nd vec, 4th vec)
5964 : :
5965 : : The output for the first stage is:
5966 : :
5967 : : I1: 0 16 1 17 2 18 3 19
5968 : : I2: 4 20 5 21 6 22 7 23
5969 : : I3: 8 24 9 25 10 26 11 27
5970 : : I4: 12 28 13 29 14 30 15 31
5971 : :
5972 : : The output of the second stage, i.e. the final result is:
5973 : :
5974 : : I1: 0 8 16 24 1 9 17 25
5975 : : I2: 2 10 18 26 3 11 19 27
5976 : : I3: 4 12 20 28 5 13 21 29
5977 : : I4: 6 14 22 30 7 15 23 31. */
5978 : :
5979 : : void
5980 : 0 : vect_permute_store_chain (vec_info *vinfo, vec<tree> &dr_chain,
5981 : : unsigned int length,
5982 : : stmt_vec_info stmt_info,
5983 : : gimple_stmt_iterator *gsi,
5984 : : vec<tree> *result_chain)
5985 : : {
5986 : 0 : tree vect1, vect2, high, low;
5987 : 0 : gimple *perm_stmt;
5988 : 0 : tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5989 : 0 : tree perm_mask_low, perm_mask_high;
5990 : 0 : tree data_ref;
5991 : 0 : tree perm3_mask_low, perm3_mask_high;
5992 : 0 : unsigned int i, j, n, log_length = exact_log2 (length);
5993 : :
5994 : 0 : result_chain->quick_grow (length);
5995 : 0 : memcpy (result_chain->address (), dr_chain.address (),
5996 : : length * sizeof (tree));
5997 : :
5998 : 0 : if (length == 3)
5999 : : {
6000 : : /* vect_grouped_store_supported ensures that this is constant. */
6001 : 0 : unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6002 : 0 : unsigned int j0 = 0, j1 = 0, j2 = 0;
6003 : :
6004 : 0 : vec_perm_builder sel (nelt, nelt, 1);
6005 : 0 : sel.quick_grow (nelt);
6006 : 0 : vec_perm_indices indices;
6007 : 0 : for (j = 0; j < 3; j++)
6008 : : {
6009 : 0 : int nelt0 = ((3 - j) * nelt) % 3;
6010 : 0 : int nelt1 = ((3 - j) * nelt + 1) % 3;
6011 : 0 : int nelt2 = ((3 - j) * nelt + 2) % 3;
6012 : :
6013 : 0 : for (i = 0; i < nelt; i++)
6014 : : {
6015 : 0 : if (3 * i + nelt0 < nelt)
6016 : 0 : sel[3 * i + nelt0] = j0++;
6017 : 0 : if (3 * i + nelt1 < nelt)
6018 : 0 : sel[3 * i + nelt1] = nelt + j1++;
6019 : 0 : if (3 * i + nelt2 < nelt)
6020 : 0 : sel[3 * i + nelt2] = 0;
6021 : : }
6022 : 0 : indices.new_vector (sel, 2, nelt);
6023 : 0 : perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
6024 : :
6025 : 0 : for (i = 0; i < nelt; i++)
6026 : : {
6027 : 0 : if (3 * i + nelt0 < nelt)
6028 : 0 : sel[3 * i + nelt0] = 3 * i + nelt0;
6029 : 0 : if (3 * i + nelt1 < nelt)
6030 : 0 : sel[3 * i + nelt1] = 3 * i + nelt1;
6031 : 0 : if (3 * i + nelt2 < nelt)
6032 : 0 : sel[3 * i + nelt2] = nelt + j2++;
6033 : : }
6034 : 0 : indices.new_vector (sel, 2, nelt);
6035 : 0 : perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
6036 : :
6037 : 0 : vect1 = dr_chain[0];
6038 : 0 : vect2 = dr_chain[1];
6039 : :
6040 : : /* Create interleaving stmt:
6041 : : low = VEC_PERM_EXPR <vect1, vect2,
6042 : : {j, nelt, *, j + 1, nelt + j + 1, *,
6043 : : j + 2, nelt + j + 2, *, ...}> */
6044 : 0 : data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
6045 : 0 : perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
6046 : : vect2, perm3_mask_low);
6047 : 0 : vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6048 : :
6049 : 0 : vect1 = data_ref;
6050 : 0 : vect2 = dr_chain[2];
6051 : : /* Create interleaving stmt:
6052 : : low = VEC_PERM_EXPR <vect1, vect2,
6053 : : {0, 1, nelt + j, 3, 4, nelt + j + 1,
6054 : : 6, 7, nelt + j + 2, ...}> */
6055 : 0 : data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
6056 : 0 : perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
6057 : : vect2, perm3_mask_high);
6058 : 0 : vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6059 : 0 : (*result_chain)[j] = data_ref;
6060 : : }
6061 : 0 : }
6062 : : else
6063 : : {
6064 : : /* If length is not equal to 3 then only power of 2 is supported. */
6065 : 0 : gcc_assert (pow2p_hwi (length));
6066 : :
6067 : : /* The encoding has 2 interleaved stepped patterns. */
6068 : 0 : poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
6069 : 0 : vec_perm_builder sel (nelt, 2, 3);
6070 : 0 : sel.quick_grow (6);
6071 : 0 : for (i = 0; i < 3; i++)
6072 : : {
6073 : 0 : sel[i * 2] = i;
6074 : 0 : sel[i * 2 + 1] = i + nelt;
6075 : : }
6076 : 0 : vec_perm_indices indices (sel, 2, nelt);
6077 : 0 : perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
6078 : :
6079 : 0 : for (i = 0; i < 6; i++)
6080 : 0 : sel[i] += exact_div (nelt, 2);
6081 : 0 : indices.new_vector (sel, 2, nelt);
6082 : 0 : perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
6083 : :
6084 : 0 : for (i = 0, n = log_length; i < n; i++)
6085 : : {
6086 : 0 : for (j = 0; j < length/2; j++)
6087 : : {
6088 : 0 : vect1 = dr_chain[j];
6089 : 0 : vect2 = dr_chain[j+length/2];
6090 : :
6091 : : /* Create interleaving stmt:
6092 : : high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
6093 : : ...}> */
6094 : 0 : high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
6095 : 0 : perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
6096 : : vect2, perm_mask_high);
6097 : 0 : vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6098 : 0 : (*result_chain)[2*j] = high;
6099 : :
6100 : : /* Create interleaving stmt:
6101 : : low = VEC_PERM_EXPR <vect1, vect2,
6102 : : {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
6103 : : ...}> */
6104 : 0 : low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
6105 : 0 : perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
6106 : : vect2, perm_mask_low);
6107 : 0 : vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6108 : 0 : (*result_chain)[2*j+1] = low;
6109 : : }
6110 : 0 : memcpy (dr_chain.address (), result_chain->address (),
6111 : : length * sizeof (tree));
6112 : : }
6113 : 0 : }
6114 : 0 : }
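/* Illustrative sketch (added commentary, not part of the GCC sources): a
   scalar model of the power-of-two path above.  Each stage zips chain[j]
   with chain[j + length/2], just as the j-loop does:

     static void
     zip (const int *a, const int *b, int *hi, int *lo, int nelt)
     {
       for (int i = 0; i < nelt / 2; i++)
         {
           hi[2 * i]     = a[i];              // interleave_high
           hi[2 * i + 1] = b[i];
           lo[2 * i]     = a[nelt / 2 + i];   // interleave_low
           lo[2 * i + 1] = b[nelt / 2 + i];
         }
     }

   Applying this log2 (length) times yields the fully interleaved order
   shown in the example at the head of the function.  */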
6115 : :
6116 : : /* Function vect_setup_realignment
6117 : :
6118 : : This function is called when vectorizing an unaligned load using
6119 : : the dr_explicit_realign[_optimized] scheme.
6120 : : This function generates the following code at the loop prolog:
6121 : :
6122 : : p = initial_addr;
6123 : : x msq_init = *(floor(p)); # prolog load
6124 : : realignment_token = call target_builtin;
6125 : : loop:
6126 : : x msq = phi (msq_init, ---)
6127 : :
6128 : : The stmts marked with x are generated only for the case of
6129 : : dr_explicit_realign_optimized.
6130 : :
6131 : : The code above sets up a new (vector) pointer, pointing to the first
6132 : : location accessed by STMT_INFO, and a "floor-aligned" load using that
6133 : : pointer. It also generates code to compute the "realignment-token"
6134 : : (if the relevant target hook was defined), and creates a phi-node at the
6135 : : loop-header bb whose arguments are the result of the prolog-load (created
6136 : : by this function) and the result of a load that takes place in the loop
6137 : : (to be created by the caller to this function).
6138 : :
6139 : : For the case of dr_explicit_realign_optimized:
6140 : : The caller to this function uses the phi-result (msq) to create the
6141 : : realignment code inside the loop, and sets up the missing phi argument,
6142 : : as follows:
6143 : : loop:
6144 : : msq = phi (msq_init, lsq)
6145 : : lsq = *(floor(p')); # load in loop
6146 : : result = realign_load (msq, lsq, realignment_token);
6147 : :
6148 : : For the case of dr_explicit_realign:
6149 : : loop:
6150 : : msq = *(floor(p)); # load in loop
6151 : : p' = p + (VS-1);
6152 : : lsq = *(floor(p')); # load in loop
6153 : : result = realign_load (msq, lsq, realignment_token);
6154 : :
6155 : : Input:
6156 : : STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
6157 : : a memory location that may be unaligned.
6158 : : BSI - place where new code is to be inserted.
6159 : : ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
6160 : : is used.
6161 : :
6162 : : Output:
6163 : : REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
6164 : : target hook, if defined.
6165 : : Return value - the result of the loop-header phi node. */
6166 : :
6167 : : tree
6168 : 0 : vect_setup_realignment (vec_info *vinfo, stmt_vec_info stmt_info,
6169 : : gimple_stmt_iterator *gsi, tree *realignment_token,
6170 : : enum dr_alignment_support alignment_support_scheme,
6171 : : tree init_addr,
6172 : : class loop **at_loop)
6173 : : {
6174 : 0 : tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6175 : 0 : loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6176 : 0 : dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
6177 : 0 : struct data_reference *dr = dr_info->dr;
6178 : 0 : class loop *loop = NULL;
6179 : 0 : edge pe = NULL;
6180 : 0 : tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
6181 : 0 : tree vec_dest;
6182 : 0 : gimple *inc;
6183 : 0 : tree ptr;
6184 : 0 : tree data_ref;
6185 : 0 : basic_block new_bb;
6186 : 0 : tree msq_init = NULL_TREE;
6187 : 0 : tree new_temp;
6188 : 0 : gphi *phi_stmt;
6189 : 0 : tree msq = NULL_TREE;
6190 : 0 : gimple_seq stmts = NULL;
6191 : 0 : bool compute_in_loop = false;
6192 : 0 : bool nested_in_vect_loop = false;
6193 : 0 : class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
6194 : 0 : class loop *loop_for_initial_load = NULL;
6195 : :
6196 : 0 : if (loop_vinfo)
6197 : : {
6198 : 0 : loop = LOOP_VINFO_LOOP (loop_vinfo);
6199 : 0 : nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
6200 : : }
6201 : :
6202 : 0 : gcc_assert (alignment_support_scheme == dr_explicit_realign
6203 : : || alignment_support_scheme == dr_explicit_realign_optimized);
6204 : :
6205 : : /* We need to generate three things:
6206 : : 1. the misalignment computation
6207 : : 2. the extra vector load (for the optimized realignment scheme).
6208 : : 3. the phi node for the two vectors from which the realignment is
6209 : : done (for the optimized realignment scheme). */
6210 : :
6211 : : /* 1. Determine where to generate the misalignment computation.
6212 : :
6213 : : If INIT_ADDR is NULL_TREE, this indicates that the misalignment
6214 : : calculation will be generated by this function, outside the loop (in the
6215 : : preheader). Otherwise, INIT_ADDR had already been computed for us by the
6216 : : caller, inside the loop.
6217 : :
6218 : : Background: If the misalignment remains fixed throughout the iterations of
6219 : : the loop, then both realignment schemes are applicable, and also the
6220 : : misalignment computation can be done outside LOOP. This is because we are
6221 : : vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
6222 : : are a multiple of VS (the Vector Size), and therefore the misalignment in
6223 : : different vectorized LOOP iterations is always the same.
6224 : : The problem arises only if the memory access is in an inner-loop nested
6225 : : inside LOOP, which is now being vectorized using outer-loop vectorization.
6226 : : This is the only case when the misalignment of the memory access may not
6227 : : remain fixed throughout the iterations of the inner-loop (as explained in
6228 : : detail in vect_supportable_dr_alignment). In this case, not only is the
6229 : : optimized realignment scheme not applicable, but also the misalignment
6230 : : computation (and generation of the realignment token that is passed to
6231 : : REALIGN_LOAD) have to be done inside the loop.
6232 : :
6233 : : In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
6234 : : or not, which in turn determines if the misalignment is computed inside
6235 : : the inner-loop, or outside LOOP. */
6236 : :
6237 : 0 : if (init_addr != NULL_TREE || !loop_vinfo)
6238 : : {
6239 : 0 : compute_in_loop = true;
6240 : 0 : gcc_assert (alignment_support_scheme == dr_explicit_realign);
6241 : : }
6242 : :
6243 : :
6244 : : /* 2. Determine where to generate the extra vector load.
6245 : :
6246 : : For the optimized realignment scheme, instead of generating two vector
6247 : : loads in each iteration, we generate a single extra vector load in the
6248 : : preheader of the loop, and in each iteration reuse the result of the
6249 : : vector load from the previous iteration. In case the memory access is in
6250 : : an inner-loop nested inside LOOP, which is now being vectorized using
6251 : : outer-loop vectorization, we need to determine whether this initial vector
6252 : : load should be generated at the preheader of the inner-loop, or can be
6253 : : generated at the preheader of LOOP. If the memory access has no evolution
6254 : : in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
6255 : : to be generated inside LOOP (in the preheader of the inner-loop). */
6256 : :
6257 : 0 : if (nested_in_vect_loop)
6258 : : {
6259 : 0 : tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
6260 : 0 : bool invariant_in_outerloop =
6261 : 0 : (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
6262 : 0 : loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
6263 : : }
6264 : : else
6265 : : loop_for_initial_load = loop;
6266 : 0 : if (at_loop)
6267 : 0 : *at_loop = loop_for_initial_load;
6268 : :
6269 : 0 : tree vuse = NULL_TREE;
6270 : 0 : if (loop_for_initial_load)
6271 : : {
6272 : 0 : pe = loop_preheader_edge (loop_for_initial_load);
6273 : 0 : if (gphi *vphi = get_virtual_phi (loop_for_initial_load->header))
6274 : 0 : vuse = PHI_ARG_DEF_FROM_EDGE (vphi, pe);
6275 : : }
6276 : 0 : if (!vuse)
6277 : 0 : vuse = gimple_vuse (gsi_stmt (*gsi));
6278 : :
6279 : : /* 3. For the case of the optimized realignment, create the first vector
6280 : : load at the loop preheader. */
6281 : :
6282 : 0 : if (alignment_support_scheme == dr_explicit_realign_optimized)
6283 : : {
6284 : : /* Create msq_init = *(floor(p1)) in the loop preheader */
6285 : 0 : gassign *new_stmt;
6286 : :
6287 : 0 : gcc_assert (!compute_in_loop);
6288 : 0 : vec_dest = vect_create_destination_var (scalar_dest, vectype);
6289 : 0 : ptr = vect_create_data_ref_ptr (vinfo, stmt_info, vectype,
6290 : : loop_for_initial_load, NULL_TREE,
6291 : : &init_addr, NULL, &inc, true);
6292 : 0 : if (TREE_CODE (ptr) == SSA_NAME)
6293 : 0 : new_temp = copy_ssa_name (ptr);
6294 : : else
6295 : 0 : new_temp = make_ssa_name (TREE_TYPE (ptr));
6296 : 0 : poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
6297 : 0 : tree type = TREE_TYPE (ptr);
6298 : 0 : new_stmt = gimple_build_assign
6299 : 0 : (new_temp, BIT_AND_EXPR, ptr,
6300 : 0 : fold_build2 (MINUS_EXPR, type,
6301 : : build_int_cst (type, 0),
6302 : : build_int_cst (type, align)));
6303 : 0 : new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
6304 : 0 : gcc_assert (!new_bb);
6305 : 0 : data_ref
6306 : 0 : = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
6307 : 0 : build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
6308 : 0 : vect_copy_ref_info (data_ref, DR_REF (dr));
6309 : 0 : new_stmt = gimple_build_assign (vec_dest, data_ref);
6310 : 0 : new_temp = make_ssa_name (vec_dest, new_stmt);
6311 : 0 : gimple_assign_set_lhs (new_stmt, new_temp);
6312 : 0 : gimple_set_vuse (new_stmt, vuse);
6313 : 0 : if (pe)
6314 : : {
6315 : 0 : new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
6316 : 0 : gcc_assert (!new_bb);
6317 : : }
6318 : : else
6319 : 0 : gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6320 : :
6321 : 0 : msq_init = gimple_assign_lhs (new_stmt);
6322 : : }
6323 : :
6324 : : /* 4. Create realignment token using a target builtin, if available.
6325 : : It is done either inside the containing loop, or before LOOP (as
6326 : : determined above). */
6327 : :
6328 : 0 : if (targetm.vectorize.builtin_mask_for_load)
6329 : : {
6330 : 0 : gcall *new_stmt;
6331 : 0 : tree builtin_decl;
6332 : :
6333 : : /* Compute INIT_ADDR - the initial address accessed by this memref. */
6334 : 0 : if (!init_addr)
6335 : : {
6336 : : /* Generate the INIT_ADDR computation outside LOOP. */
6337 : 0 : init_addr = vect_create_addr_base_for_vector_ref (vinfo,
6338 : : stmt_info, &stmts,
6339 : : NULL_TREE);
6340 : 0 : if (loop)
6341 : : {
6342 : 0 : pe = loop_preheader_edge (loop);
6343 : 0 : new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6344 : 0 : gcc_assert (!new_bb);
6345 : : }
6346 : : else
6347 : 0 : gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
6348 : : }
6349 : :
6350 : 0 : builtin_decl = targetm.vectorize.builtin_mask_for_load ();
6351 : 0 : new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
6352 : 0 : vec_dest =
6353 : 0 : vect_create_destination_var (scalar_dest,
6354 : : gimple_call_return_type (new_stmt));
6355 : 0 : new_temp = make_ssa_name (vec_dest, new_stmt);
6356 : 0 : gimple_call_set_lhs (new_stmt, new_temp);
6357 : :
6358 : 0 : if (compute_in_loop)
6359 : 0 : gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6360 : : else
6361 : : {
6362 : : /* Generate the misalignment computation outside LOOP. */
6363 : 0 : pe = loop_preheader_edge (loop);
6364 : 0 : new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
6365 : 0 : gcc_assert (!new_bb);
6366 : : }
6367 : :
6368 : 0 : *realignment_token = gimple_call_lhs (new_stmt);
6369 : :
6370 : : /* The result of the CALL_EXPR to this builtin is determined from
6371 : : the value of the parameter and no global variables are touched
6372 : : which makes the builtin a "const" function. Requiring the
6373 : : builtin to have the "const" attribute makes it unnecessary
6374 : : to call mark_call_clobbered. */
6375 : 0 : gcc_assert (TREE_READONLY (builtin_decl));
6376 : : }
6377 : :
6378 : 0 : if (alignment_support_scheme == dr_explicit_realign)
6379 : : return msq;
6380 : :
6381 : 0 : gcc_assert (!compute_in_loop);
6382 : 0 : gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
6383 : :
6384 : :
6385 : : /* 5. Create msq = phi <msq_init, lsq> in loop */
6386 : :
6387 : 0 : pe = loop_preheader_edge (containing_loop);
6388 : 0 : vec_dest = vect_create_destination_var (scalar_dest, vectype);
6389 : 0 : msq = make_ssa_name (vec_dest);
6390 : 0 : phi_stmt = create_phi_node (msq, containing_loop->header);
6391 : 0 : add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
6392 : :
6393 : 0 : return msq;
6394 : : }
6395 : :
6396 : :
6397 : : /* Function vect_grouped_load_supported.
6398 : :
6399 : : COUNT is the size of the load group (the number of statements plus the
6400 : : number of gaps). SINGLE_ELEMENT_P is true if there is actually
6401 : : only one statement, with a gap of COUNT - 1.
6402 : :
6403 : : Returns true if a suitable permute exists. */
6404 : :
6405 : : bool
6406 : 3151 : vect_grouped_load_supported (tree vectype, bool single_element_p,
6407 : : unsigned HOST_WIDE_INT count)
6408 : : {
6409 : 3151 : machine_mode mode = TYPE_MODE (vectype);
6410 : :
6411 : : /* If this is single-element interleaving with an element distance
6412 : : that leaves unused vector loads around, punt - we would at least
6413 : : create very sub-optimal code in that case (and blow up memory,
6414 : : see PR65518). */
6415 : 3151 : if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
6416 : : {
6417 : 69 : if (dump_enabled_p ())
6418 : 13 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6419 : : "single-element interleaving not supported "
6420 : : "for non-adjacent vector loads\n");
6421 : 69 : return false;
6422 : : }
6423 : :
6424 : : /* vect_permute_load_chain requires the group size to be equal to 3 or
6425 : : be a power of two. */
6426 : 3082 : if (count != 3 && exact_log2 (count) == -1)
6427 : : {
6428 : 441 : if (dump_enabled_p ())
6429 : 14 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6430 : : "the size of the group of accesses"
6431 : : " is not a power of 2 or not equal to 3\n");
6432 : 441 : return false;
6433 : : }
6434 : :
6435 : : /* Check that the permutation is supported. */
6436 : 2641 : if (VECTOR_MODE_P (mode))
6437 : : {
6438 : 2641 : unsigned int i, j;
6439 : 2641 : if (count == 3)
6440 : : {
6441 : 1392 : unsigned int nelt;
6442 : 2784 : if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
6443 : : {
6444 : : if (dump_enabled_p ())
6445 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6446 : : "cannot handle groups of 3 loads for"
6447 : : " variable-length vectors\n");
6448 : : return false;
6449 : : }
6450 : :
6451 : 1392 : vec_perm_builder sel (nelt, nelt, 1);
6452 : 1392 : sel.quick_grow (nelt);
6453 : 1392 : vec_perm_indices indices;
6454 : 1392 : unsigned int k;
6455 : 5532 : for (k = 0; k < 3; k++)
6456 : : {
6457 : 14397 : for (i = 0; i < nelt; i++)
6458 : 10245 : if (3 * i + k < 2 * nelt)
6459 : 6835 : sel[i] = 3 * i + k;
6460 : : else
6461 : 3410 : sel[i] = 0;
6462 : 4152 : indices.new_vector (sel, 2, nelt);
6463 : 4152 : if (!can_vec_perm_const_p (mode, mode, indices))
6464 : : {
6465 : 12 : if (dump_enabled_p ())
6466 : 3 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6467 : : "shuffle of 3 loads is not supported by"
6468 : : " target\n");
6469 : 12 : return false;
6470 : : }
6471 : 14268 : for (i = 0, j = 0; i < nelt; i++)
6472 : 10128 : if (3 * i + k < 2 * nelt)
6473 : 6752 : sel[i] = i;
6474 : : else
6475 : 3376 : sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6476 : 4140 : indices.new_vector (sel, 2, nelt);
6477 : 4140 : if (!can_vec_perm_const_p (mode, mode, indices))
6478 : : {
6479 : 0 : if (dump_enabled_p ())
6480 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6481 : : "shuffle of 3 loads is not supported by"
6482 : : " target\n");
6483 : 0 : return false;
6484 : : }
6485 : : }
6486 : : return true;
6487 : 1392 : }
6488 : : else
6489 : : {
6490 : : /* If length is not equal to 3 then only power of 2 is supported. */
6491 : 1249 : gcc_assert (pow2p_hwi (count));
6492 : 2498 : poly_uint64 nelt = GET_MODE_NUNITS (mode);
6493 : :
6494 : : /* The encoding has a single stepped pattern. */
6495 : 1249 : vec_perm_builder sel (nelt, 1, 3);
6496 : 1249 : sel.quick_grow (3);
6497 : 6245 : for (i = 0; i < 3; i++)
6498 : 3747 : sel[i] = i * 2;
6499 : 1249 : vec_perm_indices indices (sel, 2, nelt);
6500 : 1249 : if (can_vec_perm_const_p (mode, mode, indices))
6501 : : {
6502 : 4976 : for (i = 0; i < 3; i++)
6503 : 3732 : sel[i] = i * 2 + 1;
6504 : 1244 : indices.new_vector (sel, 2, nelt);
6505 : 1244 : if (can_vec_perm_const_p (mode, mode, indices))
6506 : 1244 : return true;
6507 : : }
6508 : 1249 : }
6509 : : }
6510 : :
6511 : 5 : if (dump_enabled_p ())
6512 : 4 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6513 : : "extract even/odd not supported by target\n");
6514 : : return false;
6515 : : }
6516 : :
6517 : : /* Return FN if vec_{mask_,mask_len_}load_lanes is available for COUNT vectors
6518 : : of type VECTYPE. MASKED_P says whether the masked form is needed.
6519 : : If it is available and ELSVALS is nonzero store the possible else values
6520 : : in the vector it points to. */
6521 : :
6522 : : internal_fn
6523 : 88326 : vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
6524 : : bool masked_p, vec<int> *elsvals)
6525 : : {
6526 : 88326 : if (vect_lanes_optab_supported_p ("vec_mask_len_load_lanes",
6527 : : vec_mask_len_load_lanes_optab, vectype,
6528 : : count, elsvals))
6529 : : return IFN_MASK_LEN_LOAD_LANES;
6530 : 88326 : else if (masked_p)
6531 : : {
6532 : 0 : if (vect_lanes_optab_supported_p ("vec_mask_load_lanes",
6533 : : vec_mask_load_lanes_optab, vectype,
6534 : : count, elsvals))
6535 : : return IFN_MASK_LOAD_LANES;
6536 : : }
6537 : : else
6538 : : {
6539 : 88326 : if (vect_lanes_optab_supported_p ("vec_load_lanes", vec_load_lanes_optab,
6540 : : vectype, count, elsvals))
6541 : : return IFN_LOAD_LANES;
6542 : : }
6543 : : return IFN_LAST;
6544 : : }
6545 : :
6546 : : /* Function vect_permute_load_chain.
6547 : :
6548 : : Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
6549 : : a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
6550 : : the input data correctly. Return the final references for loads in
6551 : : RESULT_CHAIN.
6552 : :
6553 : : E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
6554 : : The input is 4 vectors each containing 8 elements. We assign a number to each
6555 : : element, the input sequence is:
6556 : :
6557 : : 1st vec: 0 1 2 3 4 5 6 7
6558 : : 2nd vec: 8 9 10 11 12 13 14 15
6559 : : 3rd vec: 16 17 18 19 20 21 22 23
6560 : : 4th vec: 24 25 26 27 28 29 30 31
6561 : :
6562 : : The output sequence should be:
6563 : :
6564 : : 1st vec: 0 4 8 12 16 20 24 28
6565 : : 2nd vec: 1 5 9 13 17 21 25 29
6566 : : 3rd vec: 2 6 10 14 18 22 26 30
6567 : : 4th vec: 3 7 11 15 19 23 27 31
6568 : :
6569 : : i.e., the first output vector should contain the first elements of each
6570 : : interleaving group, etc.
6571 : :
6572 : : We use extract_even/odd instructions to create such output. The input of
6573 : : each extract_even/odd operation is two vectors
6574 : : 1st vec 2nd vec
6575 : : 0 1 2 3 4 5 6 7
6576 : :
6577 : : and the output is the vector of extracted even/odd elements. The output of
6578 : : extract_even will be: 0 2 4 6
6579 : : and of extract_odd: 1 3 5 7
6580 : :
6581 : :
6582 : : The permutation is done in log LENGTH stages. In each stage extract_even
6583 : : and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
6584 : : their order. In our example,
6585 : :
6586 : : E1: extract_even (1st vec, 2nd vec)
6587 : : E2: extract_odd (1st vec, 2nd vec)
6588 : : E3: extract_even (3rd vec, 4th vec)
6589 : : E4: extract_odd (3rd vec, 4th vec)
6590 : :
6591 : : The output for the first stage will be:
6592 : :
6593 : : E1: 0 2 4 6 8 10 12 14
6594 : : E2: 1 3 5 7 9 11 13 15
6595 : : E3: 16 18 20 22 24 26 28 30
6596 : : E4: 17 19 21 23 25 27 29 31
6597 : :
6598 : : In order to proceed and create the correct sequence for the next stage (or
6599 : : for the correct output, if the second stage is the last one, as in our
6600 : : example), we first put the output of the extract_even operation and then the
6601 : : output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
6602 : : The input for the second stage is:
6603 : :
6604 : : 1st vec (E1): 0 2 4 6 8 10 12 14
6605 : : 2nd vec (E3): 16 18 20 22 24 26 28 30
6606 : : 3rd vec (E2): 1 3 5 7 9 11 13 15
6607 : : 4th vec (E4): 17 19 21 23 25 27 29 31
6608 : :
6609 : : The output of the second stage:
6610 : :
6611 : : E1: 0 4 8 12 16 20 24 28
6612 : : E2: 2 6 10 14 18 22 26 30
6613 : : E3: 1 5 9 13 17 21 25 29
6614 : : E4: 3 7 11 15 19 23 27 31
6615 : :
6616 : : And RESULT_CHAIN after reordering:
6617 : :
6618 : : 1st vec (E1): 0 4 8 12 16 20 24 28
6619 : : 2nd vec (E3): 1 5 9 13 17 21 25 29
6620 : : 3rd vec (E2): 2 6 10 14 18 22 26 30
6621 : : 4th vec (E4): 3 7 11 15 19 23 27 31. */
6622 : :
6623 : : static void
6624 : 0 : vect_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6625 : : unsigned int length,
6626 : : stmt_vec_info stmt_info,
6627 : : gimple_stmt_iterator *gsi,
6628 : : vec<tree> *result_chain)
6629 : : {
6630 : 0 : tree data_ref, first_vect, second_vect;
6631 : 0 : tree perm_mask_even, perm_mask_odd;
6632 : 0 : tree perm3_mask_low, perm3_mask_high;
6633 : 0 : gimple *perm_stmt;
6634 : 0 : tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6635 : 0 : unsigned int i, j, log_length = exact_log2 (length);
6636 : :
6637 : 0 : result_chain->quick_grow (length);
6638 : 0 : memcpy (result_chain->address (), dr_chain.address (),
6639 : : length * sizeof (tree));
6640 : :
6641 : 0 : if (length == 3)
6642 : : {
6643 : : /* vect_grouped_load_supported ensures that this is constant. */
6644 : 0 : unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6645 : 0 : unsigned int k;
6646 : :
6647 : 0 : vec_perm_builder sel (nelt, nelt, 1);
6648 : 0 : sel.quick_grow (nelt);
6649 : 0 : vec_perm_indices indices;
6650 : 0 : for (k = 0; k < 3; k++)
6651 : : {
6652 : 0 : for (i = 0; i < nelt; i++)
6653 : 0 : if (3 * i + k < 2 * nelt)
6654 : 0 : sel[i] = 3 * i + k;
6655 : : else
6656 : 0 : sel[i] = 0;
6657 : 0 : indices.new_vector (sel, 2, nelt);
6658 : 0 : perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
6659 : :
6660 : 0 : for (i = 0, j = 0; i < nelt; i++)
6661 : 0 : if (3 * i + k < 2 * nelt)
6662 : 0 : sel[i] = i;
6663 : : else
6664 : 0 : sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
6665 : 0 : indices.new_vector (sel, 2, nelt);
6666 : 0 : perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
6667 : :
6668 : 0 : first_vect = dr_chain[0];
6669 : 0 : second_vect = dr_chain[1];
6670 : :
6671 : : /* Create interleaving stmt (low part of):
6672 : : low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
6673 : : ...}> */
6674 : 0 : data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
6675 : 0 : perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6676 : : second_vect, perm3_mask_low);
6677 : 0 : vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6678 : :
6679 : : /* Create interleaving stmt (high part of):
6680 : : high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
6681 : : ...}> */
6682 : 0 : first_vect = data_ref;
6683 : 0 : second_vect = dr_chain[2];
6684 : 0 : data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
6685 : 0 : perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
6686 : : second_vect, perm3_mask_high);
6687 : 0 : vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6688 : 0 : (*result_chain)[k] = data_ref;
6689 : : }
6690 : 0 : }
6691 : : else
6692 : : {
6693 : : /* If length is not equal to 3 then only power of 2 is supported. */
6694 : 0 : gcc_assert (pow2p_hwi (length));
6695 : :
6696 : : /* The encoding has a single stepped pattern. */
6697 : 0 : poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
6698 : 0 : vec_perm_builder sel (nelt, 1, 3);
6699 : 0 : sel.quick_grow (3);
6700 : 0 : for (i = 0; i < 3; ++i)
6701 : 0 : sel[i] = i * 2;
6702 : 0 : vec_perm_indices indices (sel, 2, nelt);
6703 : 0 : perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
6704 : :
6705 : 0 : for (i = 0; i < 3; ++i)
6706 : 0 : sel[i] = i * 2 + 1;
6707 : 0 : indices.new_vector (sel, 2, nelt);
6708 : 0 : perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
6709 : :
6710 : 0 : for (i = 0; i < log_length; i++)
6711 : : {
6712 : 0 : for (j = 0; j < length; j += 2)
6713 : : {
6714 : 0 : first_vect = dr_chain[j];
6715 : 0 : second_vect = dr_chain[j+1];
6716 : :
6717 : : /* data_ref = permute_even (first_data_ref, second_data_ref); */
6718 : 0 : data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
6719 : 0 : perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6720 : : first_vect, second_vect,
6721 : : perm_mask_even);
6722 : 0 : vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6723 : 0 : (*result_chain)[j/2] = data_ref;
6724 : :
6725 : : /* data_ref = permute_odd (first_data_ref, second_data_ref); */
6726 : 0 : data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
6727 : 0 : perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6728 : : first_vect, second_vect,
6729 : : perm_mask_odd);
6730 : 0 : vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6731 : 0 : (*result_chain)[j/2+length/2] = data_ref;
6732 : : }
6733 : 0 : memcpy (dr_chain.address (), result_chain->address (),
6734 : : length * sizeof (tree));
6735 : : }
6736 : 0 : }
6737 : 0 : }
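/* Illustrative sketch (added commentary, not part of the GCC sources): a
   scalar model of the power-of-two path above.  Each stage extracts the
   even and the odd elements of the concatenation of two adjacent vectors
   in the chain:

     static void
     unzip (const int *a, const int *b, int *even, int *odd, int nelt)
     {
       for (int i = 0; i < nelt; i++)
         {
           int e = 2 * i;                                      // even index
           even[i] = e < nelt ? a[e] : b[e - nelt];            // extract_even
           odd[i] = e + 1 < nelt ? a[e + 1] : b[e + 1 - nelt]; // extract_odd
         }
     }

   Repeating this log2 (length) times, with the evens placed in the first
   half of RESULT_CHAIN and the odds in the second half, de-interleaves the
   loads as in the example at the head of the function.  */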
6738 : :
6739 : : /* Function vect_shift_permute_load_chain.
6740 : :
6741 : : Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6742 : : sequence of stmts to reorder the input data accordingly.
6743 : : Return the final references for loads in RESULT_CHAIN.
6744 : : Return true if successed, false otherwise.
6745 : : Return true if successful, false otherwise.
6746 : : E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6747 : : The input is 3 vectors each containing 8 elements. We assign a
6748 : : number to each element, the input sequence is:
6749 : :
6750 : : 1st vec: 0 1 2 3 4 5 6 7
6751 : : 2nd vec: 8 9 10 11 12 13 14 15
6752 : : 3rd vec: 16 17 18 19 20 21 22 23
6753 : :
6754 : : The output sequence should be:
6755 : :
6756 : : 1st vec: 0 3 6 9 12 15 18 21
6757 : : 2nd vec: 1 4 7 10 13 16 19 22
6758 : : 3rd vec: 2 5 8 11 14 17 20 23
6759 : :
6760 : : We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6761 : :
6762 : : First we shuffle all 3 vectors to get correct elements order:
6763 : :
6764 : : 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6765 : : 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6766 : : 3rd vec: (16 19 22) (17 20 23) (18 21)
6767 : :
6768 : : Next we unite and shift vector 3 times:
6769 : :
6770 : : 1st step:
6771 : : shift right by 6 the concatenation of:
6772 : : "1st vec" and "2nd vec"
6773 : : ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6774 : : "2nd vec" and "3rd vec"
6775 : : ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6776 : : "3rd vec" and "1st vec"
6777 : : (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6778 : : | New vectors |
6779 : :
6780 : : So that now new vectors are:
6781 : :
6782 : : 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6783 : : 2nd vec: (10 13) (16 19 22) (17 20 23)
6784 : : 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6785 : :
6786 : : 2nd step:
6787 : : shift right by 5 the concatenation of:
6788 : : "1st vec" and "3rd vec"
6789 : : ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6790 : : "2nd vec" and "1st vec"
6791 : : (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6792 : : "3rd vec" and "2nd vec"
6793 : : (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6794 : : | New vectors |
6795 : :
6796 : : So that now new vectors are:
6797 : :
6798 : : 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6799 : : 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6800 : : 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6801 : :
6802 : : 3rd step:
6803 : : shift right by 5 the concatenation of:
6804 : : "1st vec" and "1st vec"
6805 : : ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6806 : : shift right by 3 the concatenation of:
6807 : : "2nd vec" and "2nd vec"
6808 : : (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6809 : : | New vectors |
6810 : :
6811 : : So that now all vectors are READY:
6812 : : 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6813 : : 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6814 : : 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6815 : :
6816 : : This algorithm is faster than the one in vect_permute_load_chain if:
6817 : : 1. "shift of a concatenation" is faster than general permutation.
6818 : : This is usually so.
6819 : : 2. The TARGET machine can't execute vector instructions in parallel.
6820 : : This is because each step of the algorithm depends on the previous one.
6821 : : The algorithm in vect_permute_load_chain is much more parallel.
6822 : :
6823 : : The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6824 : : */
6825 : :
6826 : : static bool
6827 : 0 : vect_shift_permute_load_chain (vec_info *vinfo, vec<tree> dr_chain,
6828 : : unsigned int length,
6829 : : stmt_vec_info stmt_info,
6830 : : gimple_stmt_iterator *gsi,
6831 : : vec<tree> *result_chain)
6832 : : {
6833 : 0 : tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6834 : 0 : tree perm2_mask1, perm2_mask2, perm3_mask;
6835 : 0 : tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6836 : 0 : gimple *perm_stmt;
6837 : :
6838 : 0 : tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6839 : 0 : machine_mode vmode = TYPE_MODE (vectype);
6840 : 0 : unsigned int i;
6841 : 0 : loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6842 : :
6843 : 0 : unsigned HOST_WIDE_INT nelt, vf;
6844 : 0 : if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6845 : 0 : || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6846 : : /* Not supported for variable-length vectors. */
6847 : : return false;
6848 : :
6849 : 0 : vec_perm_builder sel (nelt, nelt, 1);
6850 : 0 : sel.quick_grow (nelt);
6851 : :
6852 : 0 : result_chain->quick_grow (length);
6853 : 0 : memcpy (result_chain->address (), dr_chain.address (),
6854 : 0 : length * sizeof (tree));
6855 : :
6856 : 0 : if (pow2p_hwi (length) && vf > 4)
6857 : : {
6858 : 0 : unsigned int j, log_length = exact_log2 (length);
6859 : 0 : for (i = 0; i < nelt / 2; ++i)
6860 : 0 : sel[i] = i * 2;
6861 : 0 : for (i = 0; i < nelt / 2; ++i)
6862 : 0 : sel[nelt / 2 + i] = i * 2 + 1;
6863 : 0 : vec_perm_indices indices (sel, 2, nelt);
6864 : 0 : if (!can_vec_perm_const_p (vmode, vmode, indices))
6865 : : {
6866 : 0 : if (dump_enabled_p ())
6867 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6868 : : "shuffle of 2 fields structure is not \
6869 : : supported by target\n");
6870 : 0 : return false;
6871 : : }
6872 : 0 : perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6873 : :
6874 : 0 : for (i = 0; i < nelt / 2; ++i)
6875 : 0 : sel[i] = i * 2 + 1;
6876 : 0 : for (i = 0; i < nelt / 2; ++i)
6877 : 0 : sel[nelt / 2 + i] = i * 2;
6878 : 0 : indices.new_vector (sel, 2, nelt);
6879 : 0 : if (!can_vec_perm_const_p (vmode, vmode, indices))
6880 : : {
6881 : 0 : if (dump_enabled_p ())
6882 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6883 : : "shuffle of 2 fields structure is not \
6884 : : supported by target\n");
6885 : 0 : return false;
6886 : : }
6887 : 0 : perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6888 : :
6889 : : /* Generating permutation constant to shift all elements.
6890 : : For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6891 : 0 : for (i = 0; i < nelt; i++)
6892 : 0 : sel[i] = nelt / 2 + i;
6893 : 0 : indices.new_vector (sel, 2, nelt);
6894 : 0 : if (!can_vec_perm_const_p (vmode, vmode, indices))
6895 : : {
6896 : 0 : if (dump_enabled_p ())
6897 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6898 : : "shift permutation is not supported by target\n");
6899 : 0 : return false;
6900 : : }
6901 : 0 : shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6902 : :
6903 : : /* Generating permutation constant to select vector from 2.
6904 : : For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6905 : 0 : for (i = 0; i < nelt / 2; i++)
6906 : 0 : sel[i] = i;
6907 : 0 : for (i = nelt / 2; i < nelt; i++)
6908 : 0 : sel[i] = nelt + i;
6909 : 0 : indices.new_vector (sel, 2, nelt);
6910 : 0 : if (!can_vec_perm_const_p (vmode, vmode, indices))
6911 : : {
6912 : 0 : if (dump_enabled_p ())
6913 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6914 : : "select is not supported by target\n");
6915 : 0 : return false;
6916 : : }
6917 : 0 : select_mask = vect_gen_perm_mask_checked (vectype, indices);
6918 : :
6919 : 0 : for (i = 0; i < log_length; i++)
6920 : : {
6921 : 0 : for (j = 0; j < length; j += 2)
6922 : : {
6923 : 0 : first_vect = dr_chain[j];
6924 : 0 : second_vect = dr_chain[j + 1];
6925 : :
6926 : 0 : data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6927 : 0 : perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6928 : : first_vect, first_vect,
6929 : : perm2_mask1);
6930 : 0 : vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6931 : 0 : vect[0] = data_ref;
6932 : :
6933 : 0 : data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6934 : 0 : perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6935 : : second_vect, second_vect,
6936 : : perm2_mask2);
6937 : 0 : vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6938 : 0 : vect[1] = data_ref;
6939 : :
6940 : 0 : data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6941 : 0 : perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6942 : : vect[0], vect[1], shift1_mask);
6943 : 0 : vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6944 : 0 : (*result_chain)[j/2 + length/2] = data_ref;
6945 : :
6946 : 0 : data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6947 : 0 : perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6948 : : vect[0], vect[1], select_mask);
6949 : 0 : vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
6950 : 0 : (*result_chain)[j/2] = data_ref;
6951 : : }
6952 : 0 : memcpy (dr_chain.address (), result_chain->address (),
6953 : : length * sizeof (tree));
6954 : : }
6955 : : return true;
6956 : 0 : }
6957 : 0 : if (length == 3 && vf > 2)
6958 : : {
6959 : : unsigned int k = 0, l = 0;
6960 : :
6961 : : /* Generating permutation constant to get all elements in right order.
6962 : : For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6963 : 0 : for (i = 0; i < nelt; i++)
6964 : : {
6965 : 0 : if (3 * k + (l % 3) >= nelt)
6966 : : {
6967 : 0 : k = 0;
6968 : 0 : l += (3 - (nelt % 3));
6969 : : }
6970 : 0 : sel[i] = 3 * k + (l % 3);
6971 : 0 : k++;
6972 : : }
6973 : 0 : vec_perm_indices indices (sel, 2, nelt);
6974 : 0 : if (!can_vec_perm_const_p (vmode, vmode, indices))
6975 : : {
6976 : 0 : if (dump_enabled_p ())
6977 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6978 : : "shuffle of 3 fields structure is not \
6979 : : supported by target\n");
6980 : 0 : return false;
6981 : : }
6982 : 0 : perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6983 : :
6984 : : /* Generating permutation constant to shift all elements.
6985 : : For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6986 : 0 : for (i = 0; i < nelt; i++)
6987 : 0 : sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6988 : 0 : indices.new_vector (sel, 2, nelt);
6989 : 0 : if (!can_vec_perm_const_p (vmode, vmode, indices))
6990 : : {
6991 : 0 : if (dump_enabled_p ())
6992 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6993 : : "shift permutation is not supported by target\n");
6994 : 0 : return false;
6995 : : }
6996 : 0 : shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6997 : :
6998 : : /* Generating permutation constant to shift all elements.
6999 : : For vector length 8 it is {5 6 7 8 9 10 11 12}. */
7000 : 0 : for (i = 0; i < nelt; i++)
7001 : 0 : sel[i] = 2 * (nelt / 3) + 1 + i;
7002 : 0 : indices.new_vector (sel, 2, nelt);
7003 : 0 : if (!can_vec_perm_const_p (vmode, vmode, indices))
7004 : : {
7005 : 0 : if (dump_enabled_p ())
7006 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7007 : : "shift permutation is not supported by target\n");
7008 : 0 : return false;
7009 : : }
7010 : 0 : shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
7011 : :
7012 : : /* Generating permutation constant to shift all elements.
7013 : : For vector length 8 it is {3 4 5 6 7 8 9 10}. */
7014 : 0 : for (i = 0; i < nelt; i++)
7015 : 0 : sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
7016 : 0 : indices.new_vector (sel, 2, nelt);
7017 : 0 : if (!can_vec_perm_const_p (vmode, vmode, indices))
7018 : : {
7019 : 0 : if (dump_enabled_p ())
7020 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7021 : : "shift permutation is not supported by target\n");
7022 : 0 : return false;
7023 : : }
7024 : 0 : shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
7025 : :
7026 : : /* Generating permutation constant to shift all elements.
7027 : : For vector length 8 it is {5 6 7 8 9 10 11 12}. */
7028 : 0 : for (i = 0; i < nelt; i++)
7029 : 0 : sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
7030 : 0 : indices.new_vector (sel, 2, nelt);
7031 : 0 : if (!can_vec_perm_const_p (vmode, vmode, indices))
7032 : : {
7033 : 0 : if (dump_enabled_p ())
7034 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7035 : : "shift permutation is not supported by target\n");
7036 : 0 : return false;
7037 : : }
7038 : 0 : shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
7039 : :
7040 : 0 : for (k = 0; k < 3; k++)
7041 : : {
7042 : 0 : data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
7043 : 0 : perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
7044 : 0 : dr_chain[k], dr_chain[k],
7045 : : perm3_mask);
7046 : 0 : vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
7047 : 0 : vect[k] = data_ref;
7048 : : }
7049 : :
7050 : 0 : for (k = 0; k < 3; k++)
7051 : : {
7052 : 0 : data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
7053 : 0 : perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
7054 : 0 : vect[k % 3], vect[(k + 1) % 3],
7055 : : shift1_mask);
7056 : 0 : vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
7057 : 0 : vect_shift[k] = data_ref;
7058 : : }
7059 : :
7060 : 0 : for (k = 0; k < 3; k++)
7061 : : {
7062 : 0 : data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
7063 : 0 : perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
7064 : 0 : vect_shift[(4 - k) % 3],
7065 : 0 : vect_shift[(3 - k) % 3],
7066 : : shift2_mask);
7067 : 0 : vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
7068 : 0 : vect[k] = data_ref;
7069 : : }
7070 : :
7071 : 0 : (*result_chain)[3 - (nelt % 3)] = vect[2];
7072 : :
7073 : 0 : data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
7074 : 0 : perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
7075 : : vect[0], shift3_mask);
7076 : 0 : vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
7077 : 0 : (*result_chain)[nelt % 3] = data_ref;
7078 : :
7079 : 0 : data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
7080 : 0 : perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
7081 : : vect[1], shift4_mask);
7082 : 0 : vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
7083 : 0 : (*result_chain)[0] = data_ref;
7084 : 0 : return true;
7085 : 0 : }
7086 : : return false;
7087 : 0 : }
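
To make the selector formulas in vect_shift_permute_load_chain easier to check against the comments, here is a standalone sketch (plain C++ using only <cstdio>; it is not GCC code and assumes nelt = 8, the element count the comments use) that recomputes the selectors built by the 2-field and 3-field branches above and prints them.

#include <cstdio>

/* Print one selector vector, e.g. "3-field perm3 {0 3 6 1 4 7 2 5}".  */
static void
print_sel (const char *name, const unsigned *sel, unsigned nelt)
{
  std::printf ("%-16s{", name);
  for (unsigned i = 0; i < nelt; i++)
    std::printf ("%s%u", i ? " " : "", sel[i]);
  std::printf ("}\n");
}

int
main ()
{
  const unsigned nelt = 8;   /* Assumed element count, as in the comments.  */
  unsigned sel[nelt];
  unsigned i, k, l;

  /* length == 2 branch: shift selector, {4 5 6 7 8 9 10 11}.  */
  for (i = 0; i < nelt; i++)
    sel[i] = nelt / 2 + i;
  print_sel ("2-field shift", sel, nelt);

  /* length == 2 branch: select-from-two selector, {0 1 2 3 12 13 14 15}.  */
  for (i = 0; i < nelt / 2; i++)
    sel[i] = i;
  for (i = nelt / 2; i < nelt; i++)
    sel[i] = nelt + i;
  print_sel ("2-field select", sel, nelt);

  /* length == 3 branch: field-regrouping selector, {0 3 6 1 4 7 2 5}.  */
  for (i = 0, k = 0, l = 0; i < nelt; i++)
    {
      if (3 * k + (l % 3) >= nelt)
	{
	  k = 0;
	  l += 3 - (nelt % 3);
	}
      sel[i] = 3 * k + (l % 3);
      k++;
    }
  print_sel ("3-field perm3", sel, nelt);

  /* length == 3 branch: the four shift selectors.  */
  for (i = 0; i < nelt; i++)
    sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;       /* {6 .. 13} */
  print_sel ("3-field shift1", sel, nelt);

  for (i = 0; i < nelt; i++)
    sel[i] = 2 * (nelt / 3) + 1 + i;                /* {5 .. 12} */
  print_sel ("3-field shift2", sel, nelt);

  for (i = 0; i < nelt; i++)
    sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;       /* {3 .. 10} */
  print_sel ("3-field shift3", sel, nelt);

  for (i = 0; i < nelt; i++)
    sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;   /* {5 .. 12} */
  print_sel ("3-field shift4", sel, nelt);
  return 0;
}

Running it reproduces the vectors quoted in the comments, e.g. {0 3 6 1 4 7 2 5} for the 3-field regrouping selector.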
7088 : :
7089 : : /* Function vect_transform_grouped_load.
7090 : :
7091 : : Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
7092 : : to perform their permutation and ascribe the resulting vectorized statements to
7093 : : the scalar statements.
7094 : : */
7095 : :
7096 : : void
7097 : 0 : vect_transform_grouped_load (vec_info *vinfo, stmt_vec_info stmt_info,
7098 : : vec<tree> dr_chain,
7099 : : int size, gimple_stmt_iterator *gsi)
7100 : : {
7101 : 0 : machine_mode mode;
7102 : 0 : vec<tree> result_chain = vNULL;
7103 : :
7104 : : /* DR_CHAIN contains the input data-refs that are part of the interleaving.
7105 : : RESULT_CHAIN is the output of vect_permute_load_chain; it contains the
7106 : : permuted vectors that are ready for vector computation. */
7107 : 0 : result_chain.create (size);
7108 : :
7109 : : /* If the reassociation width for the vector type is 2 or greater, the target
7110 : : machine can execute 2 or more vector instructions in parallel. Otherwise
7111 : : try to get the chain for the load group using vect_shift_permute_load_chain. */
7112 : 0 : mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
7113 : 0 : if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
7114 : 0 : || pow2p_hwi (size)
7115 : 0 : || !vect_shift_permute_load_chain (vinfo, dr_chain, size, stmt_info,
7116 : : gsi, &result_chain))
7117 : 0 : vect_permute_load_chain (vinfo, dr_chain,
7118 : : size, stmt_info, gsi, &result_chain);
7119 : 0 : vect_record_grouped_load_vectors (vinfo, stmt_info, result_chain);
7120 : 0 : result_chain.release ();
7121 : 0 : }
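
Because of the short-circuit in the condition above, vect_shift_permute_load_chain is only attempted when the target cannot issue two permutes in parallel and the group size is not a power of two; when it succeeds, the generic permute chain is skipped. The sketch below restates that choice in isolation; choose_load_chain_strategy, its parameters, and the sample values are hypothetical illustrations, not GCC API.

#include <cstdio>

/* Return true when x is a power of two (x > 0).  */
static bool
pow2p (unsigned x)
{
  return x && (x & (x - 1)) == 0;
}

/* Hypothetical restatement of the choice made above: use the generic permute
   chain unless the target is permute-serial, the group size is not a power of
   two, and the shift-based strategy reports success.  */
static const char *
choose_load_chain_strategy (int reassoc_width, unsigned group_size,
			    bool shift_strategy_ok)
{
  if (reassoc_width > 1 || pow2p (group_size) || !shift_strategy_ok)
    return "vect_permute_load_chain";
  return "vect_shift_permute_load_chain";
}

int
main ()
{
  std::printf ("%s\n", choose_load_chain_strategy (1, 3, true));  /* shift-based */
  std::printf ("%s\n", choose_load_chain_strategy (2, 3, true));  /* generic */
  std::printf ("%s\n", choose_load_chain_strategy (1, 4, true));  /* generic (pow2) */
  return 0;
}

Note that in the real code the shift strategy is invoked as part of evaluating the condition, so its result chain is already populated when it succeeds; the sketch only models the outcome of that call as a boolean input.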
7122 : :
7123 : : /* RESULT_CHAIN contains the output of a group of grouped loads that were
7124 : : generated as part of the vectorization of STMT_INFO. Assign the statement
7125 : : for each vector to the associated scalar statement. */
7126 : :
7127 : : void
7128 : 0 : vect_record_grouped_load_vectors (vec_info *, stmt_vec_info stmt_info,
7129 : : vec<tree> result_chain)
7130 : : {
7131 : 0 : stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
7132 : 0 : unsigned int i, gap_count;
7133 : 0 : tree tmp_data_ref;
7134 : :
7135 : : /* Put a permuted data-ref in the VECTORIZED_STMT field.
7136 : : Since we scan the chain starting from its first node, their order
7137 : : corresponds to the order of data-refs in RESULT_CHAIN. */
7138 : 0 : stmt_vec_info next_stmt_info = first_stmt_info;
7139 : 0 : gap_count = 1;
7140 : 0 : FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
7141 : : {
7142 : 0 : if (!next_stmt_info)
7143 : : break;
7144 : :
7145 : : /* Skip the gaps. Loads created for the gaps will be removed later by the
7146 : : dead code elimination pass. No need to check for the first stmt in
7147 : : the group, since it always exists.
7148 : : DR_GROUP_GAP is the number of steps in elements from the previous
7149 : : access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
7150 : : correspond to the gaps. */
7151 : 0 : if (next_stmt_info != first_stmt_info
7152 : 0 : && gap_count < DR_GROUP_GAP (next_stmt_info))
7153 : : {
7154 : 0 : gap_count++;
7155 : 0 : continue;
7156 : : }
7157 : :
7158 : : /* ??? The following needs cleanup after the removal of
7159 : : DR_GROUP_SAME_DR_STMT. */
7160 : 0 : if (next_stmt_info)
7161 : : {
7162 : 0 : gimple *new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
7163 : : /* We assume that if VEC_STMT is not NULL, this is a case of multiple
7164 : : copies, and we put the new vector statement last. */
7165 : 0 : STMT_VINFO_VEC_STMTS (next_stmt_info).safe_push (new_stmt);
7166 : :
7167 : 0 : next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
7168 : 0 : gap_count = 1;
7169 : : }
7170 : : }
7171 : 0 : }
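
The gap-skipping walk above is easiest to follow on concrete data. The following standalone sketch (hypothetical group and chain sizes, not GCC code) mimics the loop: gap[] stands in for DR_GROUP_GAP, and one entry of the chain falls on a gap and therefore gets no scalar statement.

#include <cstdio>

int
main ()
{
  /* Hypothetical group of 3 scalar loads; gap[i] plays the role of
     DR_GROUP_GAP: the distance in elements from the previous group member.
     gap[0] is never consulted because the first member is never skipped.  */
  const unsigned gap[] = { 1, 2, 1 };
  const unsigned group_size = 3;

  /* Pretend the result chain holds 4 permuted vectors: one per element
     position covered by the group, including the gap between s0 and s1.  */
  const unsigned chain_len = 4;

  unsigned next = 0;       /* Index of the group member waiting for a vector.  */
  unsigned gap_count = 1;  /* Mirrors the gap_count bookkeeping above.  */

  for (unsigned i = 0; i < chain_len; i++)
    {
      if (next >= group_size)
	break;
      if (next != 0 && gap_count < gap[next])
	{
	  /* This chain entry fills a gap; no scalar stmt receives it.  */
	  std::printf ("chain[%u]: gap, skipped\n", i);
	  gap_count++;
	  continue;
	}
      std::printf ("chain[%u]: recorded for group member s%u\n", i, next);
      next++;
      gap_count = 1;
    }
  return 0;
}

With these values it records chain[0] for s0, skips chain[1] as a gap, and records chain[2] and chain[3] for s1 and s2.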
7172 : :
7173 : : /* Function vect_can_force_dr_alignment_p.
7174 : :
7175 : : Return whether the alignment of DECL can be forced to an ALIGNMENT-bit
7176 : : boundary. */
7177 : :
7178 : : bool
7179 : 585623 : vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
7180 : : {
7181 : 585623 : if (!VAR_P (decl))
7182 : : return false;
7183 : :
7184 : 191973 : if (decl_in_symtab_p (decl)
7185 : 191973 : && !symtab_node::get (decl)->can_increase_alignment_p ())
7186 : : return false;
7187 : :
7188 : 183475 : if (TREE_STATIC (decl))
7189 : 5797 : return (known_le (alignment,
7190 : 5797 : (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
7191 : : else
7192 : 177678 : return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
7193 : : }
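
For illustration only, the decision above can be restated as a small predicate: only variables qualify, symbols whose alignment cannot be raised are rejected, and the requested alignment is then bounded by the object-file limit for statics or the stack limit for locals. decl_facts, can_force_alignment, and the two limit constants below are hypothetical stand-ins; the real bounds are the target-defined MAX_OFILE_ALIGNMENT and MAX_STACK_ALIGNMENT macros.

/* Hypothetical limit values; the real ones depend on the target.  */
static const unsigned long long kOfileAlignLimit = 0x10000;
static const unsigned long long kStackAlignLimit = 256;

/* Facts about a declaration, mirroring the checks made above.  */
struct decl_facts
{
  bool is_var;                  /* VAR_P */
  bool in_symtab;               /* decl_in_symtab_p */
  bool can_increase_alignment;  /* symtab_node::can_increase_alignment_p */
  bool is_static;               /* TREE_STATIC */
};

static bool
can_force_alignment (const decl_facts &d, unsigned long long alignment)
{
  if (!d.is_var)
    return false;
  if (d.in_symtab && !d.can_increase_alignment)
    return false;
  return alignment <= (d.is_static ? kOfileAlignLimit : kStackAlignLimit);
}

int
main ()
{
  decl_facts local_var  = { true, false, false, false };  /* automatic variable */
  decl_facts global_var = { true, true,  true,  true  };  /* static in the symtab */
  return (can_force_alignment (local_var, 128)      /* within the stack limit */
	  && can_force_alignment (global_var, 4096)) ? 0 : 1;
}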
7194 : :
7195 : : /* Return whether the data reference DR_INFO is supported with respect to its
7196 : : alignment.
7197 : : If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even if
7198 : : it is aligned, i.e., check if it is possible to vectorize it with different
7199 : : alignment. */
7200 : :
7201 : : enum dr_alignment_support
7202 : 2853893 : vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
7203 : : tree vectype, int misalignment)
7204 : : {
7205 : 2853893 : data_reference *dr = dr_info->dr;
7206 : 2853893 : stmt_vec_info stmt_info = dr_info->stmt;
7207 : 2853893 : machine_mode mode = TYPE_MODE (vectype);
7208 : 2853893 : loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7209 : 2853893 : class loop *vect_loop = NULL;
7210 : 2853893 : bool nested_in_vect_loop = false;
7211 : :
7212 : 2853893 : if (misalignment == 0)
7213 : : return dr_aligned;
7214 : 1593265 : else if (dr_info->need_peeling_for_alignment)
7215 : : return dr_unaligned_unsupported;
7216 : :
7217 : : /* For now assume all conditional loads/stores support unaligned
7218 : : access without any special code. */
7219 : 1418398 : if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
7220 : 7791 : if (gimple_call_internal_p (stmt)
7221 : 7791 : && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
7222 : 3053 : || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
7223 : : return dr_unaligned_supported;
7224 : :
7225 : 1410607 : if (loop_vinfo)
|