Line data Source code
1 : /* Statement Analysis and Transformation for Vectorization
2 : Copyright (C) 2003-2026 Free Software Foundation, Inc.
3 : Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 : and Ira Rosen <irar@il.ibm.com>
5 :
6 : This file is part of GCC.
7 :
8 : GCC is free software; you can redistribute it and/or modify it under
9 : the terms of the GNU General Public License as published by the Free
10 : Software Foundation; either version 3, or (at your option) any later
11 : version.
12 :
13 : GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 : WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 : FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 : for more details.
17 :
18 : You should have received a copy of the GNU General Public License
19 : along with GCC; see the file COPYING3. If not see
20 : <http://www.gnu.org/licenses/>. */
21 :
22 : #include "config.h"
23 : #include "system.h"
24 : #include "coretypes.h"
25 : #include "backend.h"
26 : #include "target.h"
27 : #include "rtl.h"
28 : #include "tree.h"
29 : #include "gimple.h"
30 : #include "ssa.h"
31 : #include "optabs-tree.h"
32 : #include "insn-config.h"
33 : #include "recog.h" /* FIXME: for insn_data */
34 : #include "cgraph.h"
35 : #include "dumpfile.h"
36 : #include "alias.h"
37 : #include "fold-const.h"
38 : #include "stor-layout.h"
39 : #include "tree-eh.h"
40 : #include "gimplify.h"
41 : #include "gimple-iterator.h"
42 : #include "gimplify-me.h"
43 : #include "tree-cfg.h"
44 : #include "tree-ssa-loop-manip.h"
45 : #include "cfgloop.h"
46 : #include "explow.h"
47 : #include "tree-ssa-loop.h"
48 : #include "tree-scalar-evolution.h"
49 : #include "tree-vectorizer.h"
50 : #include "builtins.h"
51 : #include "internal-fn.h"
52 : #include "tree-vector-builder.h"
53 : #include "vec-perm-indices.h"
54 : #include "gimple-range.h"
55 : #include "tree-ssa-loop-niter.h"
56 : #include "gimple-fold.h"
57 : #include "regs.h"
58 : #include "attribs.h"
59 : #include "optabs-libfuncs.h"
60 : #include "tree-dfa.h"
61 :
62 : /* For lang_hooks.types.type_for_mode. */
63 : #include "langhooks.h"
64 :
65 : static tree vector_vector_composition_type (tree, poly_uint64, tree *,
66 : bool = false);
67 :
68 : /* Return TRUE iff the statement of STMT_INFO is in the inner loop of the
69 : loop being vectorized, i.e. the loop_father of its basic block is LOOP->inner. Return FALSE when VINFO is not a loop_vec_info. */
70 : bool
71 5813199 : stmt_in_inner_loop_p (vec_info *vinfo, class _stmt_vec_info *stmt_info)
72 : {
73 5813199 : gimple *stmt = STMT_VINFO_STMT (stmt_info);
74 5813199 : basic_block bb = gimple_bb (stmt);
75 5813199 : loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
76 2761581 : class loop* loop;
77 : /* The dyn_cast yields NULL when VINFO is not a loop_vec_info; there is then no loop to compare against. */
78 2761581 : if (!loop_vinfo)
79 : return false;
80 :
81 2761581 : loop = LOOP_VINFO_LOOP (loop_vinfo);
82 : /* Only the loop directly referenced by LOOP->inner is compared. */
83 2761581 : return (bb->loop_father == loop->inner);
84 : }
85 :
86 : /* Record the cost of COUNT statements of kind KIND for STMT_INFO / NODE by
87 : pushing a stmt_info_for_cost entry onto BODY_COST_VEC for later processing.
88 : Return a preliminary estimate of the cost: builtin_vectorization_cost (KIND, VECTYPE, MISALIGN) * COUNT. */
89 :
90 : unsigned
91 8828558 : record_stmt_cost (stmt_vector_for_cost *body_cost_vec, int count,
92 : enum vect_cost_for_stmt kind,
93 : stmt_vec_info stmt_info, slp_tree node,
94 : tree vectype, int misalign,
95 : enum vect_cost_model_location where)
96 : {
97 8828558 : if ((kind == vector_load || kind == unaligned_load)
98 1582005 : && (stmt_info && STMT_VINFO_GATHER_SCATTER_P (stmt_info)))
99 : kind = vector_gather_load;
100 8828558 : if ((kind == vector_store || kind == unaligned_store)
101 1020042 : && (stmt_info && STMT_VINFO_GATHER_SCATTER_P (stmt_info)))
102 8828558 : kind = vector_scatter_store;
103 : /* From here on KIND is the possibly re-classified gather/scatter kind; it is both queued and used for the estimate. */
104 8828558 : stmt_info_for_cost si
105 8828558 : = { count, kind, where, stmt_info, node, vectype, misalign };
106 8828558 : body_cost_vec->safe_push (si);
107 :
108 8828558 : return (unsigned)
109 8828558 : (builtin_vectorization_cost (kind, vectype, misalign) * count);
110 : }
111 : /* Overload of record_stmt_cost taking only STMT_INFO: forwards to the full overload with a NULL SLP node. */
112 : unsigned
113 3977342 : record_stmt_cost (stmt_vector_for_cost *body_cost_vec, int count,
114 : enum vect_cost_for_stmt kind, stmt_vec_info stmt_info,
115 : tree vectype, int misalign,
116 : enum vect_cost_model_location where)
117 : {
118 3977342 : return record_stmt_cost (body_cost_vec, count, kind, stmt_info, NULL,
119 3977342 : vectype, misalign, where);
120 : }
121 : /* Overload of record_stmt_cost taking only the SLP NODE: forwards to the full overload using SLP_TREE_REPRESENTATIVE (NODE) as the stmt_info. */
122 : unsigned
123 1787066 : record_stmt_cost (stmt_vector_for_cost *body_cost_vec, int count,
124 : enum vect_cost_for_stmt kind, slp_tree node,
125 : tree vectype, int misalign,
126 : enum vect_cost_model_location where)
127 : {
128 1787066 : return record_stmt_cost (body_cost_vec, count, kind,
129 : SLP_TREE_REPRESENTATIVE (node), node,
130 1787066 : vectype, misalign, where);
131 : }
132 : /* Overload of record_stmt_cost for costs without a statement, SLP node or vector type. KIND must be cond_branch_taken, cond_branch_not_taken or scalar_stmt (asserted). */
133 : unsigned
134 0 : record_stmt_cost (stmt_vector_for_cost *body_cost_vec, int count,
135 : enum vect_cost_for_stmt kind,
136 : enum vect_cost_model_location where)
137 : {
138 0 : gcc_assert (kind == cond_branch_taken || kind == cond_branch_not_taken
139 : || kind == scalar_stmt);
140 0 : return record_stmt_cost (body_cost_vec, count, kind, NULL, NULL,
141 0 : NULL_TREE, 0, where);
142 : }
143 :
144 : /* Return a new temporary variable of type ELEM_TYPE[NELEMS], created with create_tmp_var under the name "vect_array". */
145 :
146 : static tree
147 0 : create_vector_array (tree elem_type, unsigned HOST_WIDE_INT nelems)
148 : {
149 0 : return create_tmp_var (build_array_type_nelts (elem_type, nelems),
150 0 : "vect_array");
151 : }
152 :
153 : /* ARRAY is an array of vectors created by create_vector_array.
154 : Return an SSA_NAME for the vector in index N. The reference
155 : is part of the vectorization of STMT_INFO and the vector is associated
156 : with scalar destination SCALAR_DEST. The generated statements are emitted through vect_finish_stmt_generation at GSI.
157 : If we need to ensure that inactive elements are set to zero,
158 : NEED_ZEROING is true, MASK contains the loop mask to be used; the result is then VEC_COND_EXPR <MASK, loaded vector, MASK_LOAD_ELSE_ZERO value>. */
159 :
160 : static tree
161 0 : read_vector_array (vec_info *vinfo,
162 : stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
163 : tree scalar_dest, tree array, unsigned HOST_WIDE_INT n,
164 : bool need_zeroing, tree mask)
165 : {
166 0 : tree vect_type, vect, vect_name, tmp, tmp_name, array_ref;
167 0 : gimple *new_stmt;
168 : /* The element type of ARRAY is the vector type being read. */
169 0 : gcc_assert (TREE_CODE (TREE_TYPE (array)) == ARRAY_TYPE);
170 0 : vect_type = TREE_TYPE (TREE_TYPE (array));
171 0 : tmp = vect_create_destination_var (scalar_dest, vect_type);
172 0 : vect = vect_create_destination_var (scalar_dest, vect_type);
173 0 : array_ref = build4 (ARRAY_REF, vect_type, array,
174 0 : build_int_cst (size_type_node, n),
175 : NULL_TREE, NULL_TREE);
176 : /* Load ARRAY[N]. NOTE(review): the SSA name is created from VECT although the assignment was built with TMP as its LHS; both have type VECT_TYPE - confirm this is intended. */
177 0 : new_stmt = gimple_build_assign (tmp, array_ref);
178 0 : tmp_name = make_ssa_name (vect, new_stmt);
179 0 : gimple_assign_set_lhs (new_stmt, tmp_name);
180 0 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
181 : /* Optionally select between the loaded value and the zero "else" value under MASK. */
182 0 : if (need_zeroing)
183 : {
184 0 : tree vec_els = vect_get_mask_load_else (MASK_LOAD_ELSE_ZERO,
185 : vect_type);
186 0 : vect_name = make_ssa_name (vect, new_stmt);
187 0 : new_stmt
188 0 : = gimple_build_assign (vect_name, VEC_COND_EXPR,
189 : mask, tmp_name, vec_els);
190 0 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
191 : }
192 : else
193 : vect_name = tmp_name;
194 :
195 0 : return vect_name;
196 : }
197 :
198 : /* ARRAY is an array of vectors created by create_vector_array.
199 : Emit code to store SSA_NAME VECT in index N of the array; the element reference is built with the type of VECT.
200 : The store is part of the vectorization of STMT_INFO and is emitted through vect_finish_stmt_generation at GSI. */
201 :
202 : static void
203 0 : write_vector_array (vec_info *vinfo,
204 : stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
205 : tree vect, tree array, unsigned HOST_WIDE_INT n)
206 : {
207 0 : tree array_ref;
208 0 : gimple *new_stmt;
209 : /* Build ARRAY[N] with a size_type_node index. */
210 0 : array_ref = build4 (ARRAY_REF, TREE_TYPE (vect), array,
211 0 : build_int_cst (size_type_node, n),
212 : NULL_TREE, NULL_TREE);
213 :
214 0 : new_stmt = gimple_build_assign (array_ref, vect);
215 0 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
216 0 : }
217 :
218 : /* PTR is a pointer to an array of type TYPE. Return a representation
219 : of *PTR: a MEM_REF of type TYPE whose offset operand is a zero constant of type ALIAS_PTR_TYPE.
220 : As a side effect the pointer info of PTR is updated to the alignment of TYPE. */
221 :
222 : static tree
223 0 : create_array_ref (tree type, tree ptr, tree alias_ptr_type)
224 : {
225 0 : tree mem_ref;
226 :
227 0 : mem_ref = build2 (MEM_REF, type, ptr, build_int_cst (alias_ptr_type, 0));
228 : /* Arrays have the same alignment as their type. */
229 0 : set_ptr_info_alignment (get_ptr_info (ptr), TYPE_ALIGN_UNIT (type), 0);
230 0 : return mem_ref;
231 : }
232 :
233 : /* Add a clobber of variable VAR to the vectorization of STMT_INFO: build a clobber of VAR's type and assign it to VAR.
234 : Emit the clobber before *GSI (emission is delegated to vect_finish_stmt_generation). */
235 :
236 : static void
237 15 : vect_clobber_variable (vec_info *vinfo, stmt_vec_info stmt_info,
238 : gimple_stmt_iterator *gsi, tree var)
239 : {
240 15 : tree clobber = build_clobber (TREE_TYPE (var));
241 15 : gimple *new_stmt = gimple_build_assign (var, clobber);
242 15 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
243 15 : }
244 :
245 : /* Utility functions used by vect_mark_stmts_to_be_vectorized. */
246 :
247 : /* Function vect_mark_relevant.
248 :
249 : Mark STMT_INFO (or, when it is the original stmt of a pattern, its related pattern stmt) with relevance RELEVANT and liveness LIVE_P, and add it to WORKLIST unless neither its relevance nor its liveness changed. Relevance is only ever raised and liveness is only ever set, never cleared. */
250 :
251 : static void
252 3208957 : vect_mark_relevant (vec<stmt_vec_info> *worklist, stmt_vec_info stmt_info,
253 : enum vect_relevant relevant, bool live_p)
254 : {
255 3208957 : enum vect_relevant save_relevant = STMT_VINFO_RELEVANT (stmt_info);
256 3208957 : bool save_live_p = STMT_VINFO_LIVE_P (stmt_info);
257 :
258 3208957 : if (dump_enabled_p ())
259 164206 : dump_printf_loc (MSG_NOTE, vect_location,
260 : "mark relevant %d, live %d: %G", relevant, live_p,
261 : stmt_info->stmt);
262 :
263 : /* If this stmt is an original stmt in a pattern, we might need to mark its
264 : related pattern stmt instead of the original stmt. However, such stmts
265 : may have their own uses that are not in any pattern, in such cases the
266 : stmt itself should be marked. */
267 3208957 : if (STMT_VINFO_IN_PATTERN_P (stmt_info))
268 : {
269 : /* This is the last stmt in a sequence that was detected as a
270 : pattern that can potentially be vectorized. Don't mark the stmt
271 : as relevant/live because it's not going to be vectorized.
272 : Instead mark the pattern-stmt that replaces it. */
273 :
274 242378 : if (dump_enabled_p ())
275 2811 : dump_printf_loc (MSG_NOTE, vect_location,
276 : "last stmt in pattern. don't mark"
277 : " relevant/live.\n");
278 : /* Switch to the pattern stmt and re-read the saved state from it, so the change detection below compares against the stmt that is actually marked. */
279 242378 : stmt_vec_info old_stmt_info = stmt_info;
280 242378 : stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
281 242378 : gcc_assert (STMT_VINFO_RELATED_STMT (stmt_info) == old_stmt_info);
282 242378 : save_relevant = STMT_VINFO_RELEVANT (stmt_info);
283 242378 : save_live_p = STMT_VINFO_LIVE_P (stmt_info);
284 : /* A live pattern stmt is never left at vect_unused_in_scope. */
285 242378 : if (live_p && relevant == vect_unused_in_scope)
286 : {
287 110 : if (dump_enabled_p ())
288 10 : dump_printf_loc (MSG_NOTE, vect_location,
289 : "vec_stmt_relevant_p: forcing live pattern stmt "
290 : "relevant.\n");
291 : relevant = vect_used_only_live;
292 : }
293 :
294 242378 : if (dump_enabled_p ())
295 2811 : dump_printf_loc (MSG_NOTE, vect_location,
296 : "mark relevant %d, live %d: %G", relevant, live_p,
297 : stmt_info->stmt);
298 : }
299 : /* Merge the new marking: liveness is OR-ed in and relevance only increases. */
300 3208957 : STMT_VINFO_LIVE_P (stmt_info) |= live_p;
301 3208957 : if (relevant > STMT_VINFO_RELEVANT (stmt_info))
302 2878197 : STMT_VINFO_RELEVANT (stmt_info) = relevant;
303 : /* Nothing changed: do not queue the stmt again. */
304 3208957 : if (STMT_VINFO_RELEVANT (stmt_info) == save_relevant
305 330760 : && STMT_VINFO_LIVE_P (stmt_info) == save_live_p)
306 : {
307 330045 : if (dump_enabled_p ())
308 19616 : dump_printf_loc (MSG_NOTE, vect_location,
309 : "already marked relevant/live.\n");
310 330045 : return;
311 : }
312 :
313 2878912 : worklist->safe_push (stmt_info);
314 : }
315 :
316 :
317 : /* Function is_simple_and_all_uses_invariant
318 :
319 : Return true if STMT_INFO is an assignment (gassign) and every SSA use operand of it is a simple use, per vect_is_simple_use, whose definition type is vect_external_def or vect_constant_def. Return false otherwise, including when the stmt is not a gassign. */
320 :
321 : bool
322 248134 : is_simple_and_all_uses_invariant (stmt_vec_info stmt_info,
323 : loop_vec_info loop_vinfo)
324 : {
325 248134 : tree op;
326 248134 : ssa_op_iter iter;
327 :
328 442052 : gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
329 194743 : if (!stmt)
330 : return false;
331 : /* Check each SSA use operand of the assignment in turn; the first non-simple or non-invariant one decides. */
332 202298 : FOR_EACH_SSA_TREE_OPERAND (op, stmt, iter, SSA_OP_USE)
333 : {
334 201473 : enum vect_def_type dt = vect_uninitialized_def;
335 :
336 201473 : if (!vect_is_simple_use (op, loop_vinfo, &dt))
337 : {
338 5326 : if (dump_enabled_p ())
339 16 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
340 : "use not simple.\n");
341 193918 : return false;
342 : }
343 :
344 196147 : if (dt != vect_external_def && dt != vect_constant_def)
345 : return false;
346 : }
347 : return true;
348 : }
349 :
350 : /* Function vect_stmt_relevant_p.
351 :
352 : Return true if STMT_INFO, in the loop that is represented by LOOP_VINFO,
353 : is "relevant for vectorization".
354 : On return *RELEVANT holds the computed relevance and *LIVE_P whether the stmt has a non-debug use outside the loop; the result is *LIVE_P || *RELEVANT.
355 : A stmt is considered "relevant for vectorization" if:
356 : - it has uses outside the loop.
357 : - it has vdefs (it alters memory).
358 : - control stmts in the loop (except for the exit condition).
359 : Side effects on LOOP_VINFO: a stmt with a vdef, no data reference and no SSA defs is pushed onto LOOP_VINFO_ALTERNATE_DEFS, and a use outside the loop sets LOOP_VINFO_EARLY_BRK_NEEDS_EPILOG.
360 : CHECKME: what other side effects would the vectorizer allow? */
361 :
362 : static bool
363 5159901 : vect_stmt_relevant_p (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
364 : enum vect_relevant *relevant, bool *live_p)
365 : {
366 5159901 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
367 5159901 : ssa_op_iter op_iter;
368 5159901 : imm_use_iterator imm_iter;
369 5159901 : use_operand_p use_p;
370 5159901 : def_operand_p def_p;
371 :
372 5159901 : *relevant = vect_unused_in_scope;
373 5159901 : *live_p = false;
374 :
375 : /* cond stmt other than loop exit cond. */
376 5159901 : gimple *stmt = STMT_VINFO_STMT (stmt_info);
377 5159901 : if (is_ctrl_stmt (stmt)
378 605599 : && LOOP_VINFO_LOOP_IV_COND (loop_vinfo) != stmt
379 5388870 : && (!loop->inner || gimple_bb (stmt)->loop_father == loop))
380 226984 : *relevant = vect_used_in_scope;
381 :
382 : /* changing memory. */
383 5159901 : if (gimple_code (stmt_info->stmt) != GIMPLE_PHI)
384 4279790 : if (gimple_vdef (stmt_info->stmt)
385 3674191 : && !gimple_clobber_p (stmt_info->stmt))
386 : {
387 369431 : if (dump_enabled_p ())
388 28039 : dump_printf_loc (MSG_NOTE, vect_location,
389 : "vec_stmt_relevant_p: stmt has vdefs.\n");
390 369431 : *relevant = vect_used_in_scope;
391 369431 : if (! STMT_VINFO_DATA_REF (stmt_info)
392 369431 : && zero_ssa_operands (stmt_info->stmt, SSA_OP_DEF))
393 20 : LOOP_VINFO_ALTERNATE_DEFS (loop_vinfo).safe_push (stmt_info);
394 : }
395 :
396 : /* uses outside the loop. */
397 14498192 : FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
398 : {
399 15359063 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter, DEF_FROM_PTR (def_p))
400 : {
401 7002283 : basic_block bb = gimple_bb (USE_STMT (use_p));
402 7002283 : if (!flow_bb_inside_loop_p (loop, bb))
403 : {
404 262889 : if (is_gimple_debug (USE_STMT (use_p)))
405 1093 : continue;
406 :
407 261796 : if (dump_enabled_p ())
408 5987 : dump_printf_loc (MSG_NOTE, vect_location,
409 : "vec_stmt_relevant_p: used out of loop.\n");
410 :
411 : /* We expect all such uses to be in the loop exit phis
412 : (because of loop closed form) */
413 261796 : gcc_assert (gimple_code (USE_STMT (use_p)) == GIMPLE_PHI);
414 :
415 261796 : *live_p = true;
416 261796 : LOOP_VINFO_EARLY_BRK_NEEDS_EPILOG (loop_vinfo) = true;
417 : }
418 4178390 : }
419 : }
420 : /* A live stmt that is otherwise unused in the loop is marked vect_used_only_live, unless it is an assignment whose operands are all external or constant. */
421 248136 : if (*live_p && *relevant == vect_unused_in_scope
422 5408035 : && !is_simple_and_all_uses_invariant (stmt_info, loop_vinfo))
423 : {
424 247309 : if (dump_enabled_p ())
425 5843 : dump_printf_loc (MSG_NOTE, vect_location,
426 : "vec_stmt_relevant_p: stmt live but not relevant.\n");
427 247309 : *relevant = vect_used_only_live;
428 : }
429 :
430 5159901 : return (*live_p || *relevant);
431 : }
432 :
433 :
434 : /* Function exist_non_indexing_operands_for_use_p
435 :
436 : USE is one of the uses attached to STMT_INFO. Check if USE is
437 : used in STMT_INFO for anything other than indexing an array. Return true if so, false if USE is only used for addressing. */
438 :
439 : static bool
440 4301722 : exist_non_indexing_operands_for_use_p (tree use, stmt_vec_info stmt_info)
441 : {
442 4301722 : tree operand;
443 :
444 : /* USE corresponds to some operand in STMT. If there is no data
445 : reference in STMT, then any operand that corresponds to USE
446 : is not indexing an array. */
447 4301722 : if (!STMT_VINFO_DATA_REF (stmt_info))
448 : return true;
449 :
450 : /* STMT has a data_ref. FORNOW this means that it is of one of
451 : the following forms:
452 : -1- ARRAY_REF = var
453 : -2- var = ARRAY_REF
454 : (This should have been verified in analyze_data_refs).
455 :
456 : 'var' in the second case corresponds to a def, not a use,
457 : so USE cannot correspond to any operands that are not used
458 : for array indexing.
459 :
460 : Therefore, all we need to check is if STMT falls into the
461 : first case, and whether var corresponds to USE. */
462 : /* Stmts that are not plain copies are handled first: for an internal-function call, USE counts as a non-indexing operand when it is the mask, the else value, the stored value or, for gather/scatter functions, call argument 1. */
463 1469152 : gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
464 1450946 : if (!assign || !gimple_assign_copy_p (assign))
465 : {
466 787144 : gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
467 18206 : if (call && gimple_call_internal_p (call))
468 : {
469 18206 : internal_fn ifn = gimple_call_internal_fn (call);
470 18206 : int mask_index = internal_fn_mask_index (ifn);
471 18206 : if (mask_index >= 0
472 18206 : && use == gimple_call_arg (call, mask_index))
473 : return true;
474 11817 : int els_index = internal_fn_else_index (ifn);
475 11817 : if (els_index >= 0
476 11817 : && use == gimple_call_arg (call, els_index))
477 : return true;
478 10312 : int stored_value_index = internal_fn_stored_value_index (ifn);
479 10312 : if (stored_value_index >= 0
480 10312 : && use == gimple_call_arg (call, stored_value_index))
481 : return true;
482 8090 : if (internal_gather_scatter_fn_p (ifn)
483 8090 : && use == gimple_call_arg (call, 1))
484 : return true;
485 : }
486 777028 : return false;
487 : }
488 : /* Plain copy: only form -1- (non-SSA LHS, SSA_NAME RHS) can use USE as the stored value. */
489 682008 : if (TREE_CODE (gimple_assign_lhs (assign)) == SSA_NAME)
490 : return false;
491 682008 : operand = gimple_assign_rhs1 (assign);
492 682008 : if (TREE_CODE (operand) != SSA_NAME)
493 : return false;
494 :
495 590609 : if (operand == use)
496 : return true;
497 :
498 : return false;
499 : }
500 :
501 :
502 : /*
503 : Function process_use.
504 :
505 : Inputs:
506 : - a USE in STMT_VINFO in a loop represented by LOOP_VINFO
507 : - RELEVANT - enum value to be set in the STMT_VINFO of the stmt
508 : that defined USE. This is done by calling vect_mark_relevant and passing it
509 : the WORKLIST (to add DEF_STMT to the WORKLIST in case it is relevant).
510 : - FORCE is true if exist_non_indexing_operands_for_use_p check shouldn't
511 : be performed.
512 :
513 : Outputs:
514 : Generally, RELEVANT is used to define the relevance info of the
515 : DEF_STMT of this USE, while its liveness is left unchanged
516 : (vect_mark_relevant is called with live_p false):
517 : STMT_VINFO_RELEVANT (DEF_stmt_vinfo) <-- relevant
518 : Exceptions:
519 : - case 1: If USE is used only for address computations (e.g. array indexing),
520 : which does not need to be directly vectorized, then the liveness/relevance
521 : of the respective DEF_STMT is left unchanged.
522 : - case 2: If STMT_VINFO is a reduction phi and DEF_STMT is a reduction stmt
523 : in the same loop nest, DEF_STMT is marked with RELEVANT and forced live.
524 : - case 3: If DEF_STMT and STMT_VINFO are in different nests, then
525 : "relevant" will be modified accordingly.
526 : - an induction PHI's use on the loop latch edge is not marked at all.
527 : Return opt_result::success () if everything is as expected and an opt_result failure (unsupported use) otherwise. */
528 :
529 : static opt_result
530 4357685 : process_use (stmt_vec_info stmt_vinfo, tree use, loop_vec_info loop_vinfo,
531 : enum vect_relevant relevant, vec<stmt_vec_info> *worklist,
532 : bool force)
533 : {
534 4357685 : stmt_vec_info dstmt_vinfo;
535 4357685 : enum vect_def_type dt;
536 :
537 : /* case 1: we are only interested in uses that need to be vectorized. Uses
538 : that are used for address computation are not considered relevant. */
539 4357685 : if (!force && !exist_non_indexing_operands_for_use_p (use, stmt_vinfo))
540 1180077 : return opt_result::success ();
541 :
542 3177608 : if (!vect_is_simple_use (use, loop_vinfo, &dt, &dstmt_vinfo))
543 34802 : return opt_result::failure_at (stmt_vinfo->stmt,
544 : "not vectorized:"
545 : " unsupported use in stmt.\n");
546 : /* No defining stmt_vec_info was found for USE, so there is nothing to mark. */
547 3142806 : if (!dstmt_vinfo)
548 590743 : return opt_result::success ();
549 :
550 2552063 : basic_block def_bb = gimple_bb (dstmt_vinfo->stmt);
551 2552063 : basic_block bb = gimple_bb (stmt_vinfo->stmt);
552 :
553 : /* case 2: A reduction phi (STMT) defined by a reduction stmt (DSTMT_VINFO).
554 : We have to force the stmt live since the epilogue loop needs it to
555 : continue computing the reduction. */
556 2552063 : if (gimple_code (stmt_vinfo->stmt) == GIMPLE_PHI
557 268774 : && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def
558 84837 : && gimple_code (dstmt_vinfo->stmt) != GIMPLE_PHI
559 84837 : && STMT_VINFO_DEF_TYPE (dstmt_vinfo) == vect_reduction_def
560 2636900 : && bb->loop_father == def_bb->loop_father)
561 : {
562 84837 : if (dump_enabled_p ())
563 3930 : dump_printf_loc (MSG_NOTE, vect_location,
564 : "reduc-stmt defining reduc-phi in the same nest.\n");
565 84837 : vect_mark_relevant (worklist, dstmt_vinfo, relevant, true);
566 84837 : return opt_result::success ();
567 : }
568 :
569 : /* case 3a: outer-loop stmt defining an inner-loop stmt:
570 : outer-loop-header-bb:
571 : d = dstmt_vinfo
572 : inner-loop:
573 : stmt # use (d)
574 : outer-loop-tail-bb:
575 : ... */
576 2467226 : if (flow_loop_nested_p (def_bb->loop_father, bb->loop_father))
577 : {
578 2237 : if (dump_enabled_p ())
579 321 : dump_printf_loc (MSG_NOTE, vect_location,
580 : "outer-loop def-stmt defining inner-loop stmt.\n");
581 : /* Translate the inner-loop relevance of the use into the relevance seen by the outer-loop definition. */
582 2237 : switch (relevant)
583 : {
584 0 : case vect_unused_in_scope:
585 0 : relevant = (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_nested_cycle) ?
586 : vect_used_in_scope : vect_unused_in_scope;
587 : break;
588 :
589 776 : case vect_used_in_outer_by_reduction:
590 776 : gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) != vect_reduction_def);
591 : relevant = vect_used_by_reduction;
592 : break;
593 :
594 1181 : case vect_used_in_outer:
595 1181 : gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) != vect_reduction_def);
596 : relevant = vect_used_in_scope;
597 : break;
598 :
599 : case vect_used_in_scope:
600 : break;
601 :
602 0 : default:
603 0 : gcc_unreachable ();
604 : }
605 : }
606 :
607 : /* case 3b: inner-loop stmt defining an outer-loop stmt:
608 : outer-loop-header-bb:
609 : ...
610 : inner-loop:
611 : d = dstmt_vinfo
612 : outer-loop-tail-bb (or outer-loop-exit-bb in double reduction):
613 : stmt # use (d) */
614 2464989 : else if (flow_loop_nested_p (bb->loop_father, def_bb->loop_father))
615 : {
616 2100 : if (dump_enabled_p ())
617 626 : dump_printf_loc (MSG_NOTE, vect_location,
618 : "inner-loop def-stmt defining outer-loop stmt.\n");
619 : /* Translate the outer-loop relevance of the use into the "used in outer" kinds recorded on the inner-loop definition. */
620 2100 : switch (relevant)
621 : {
622 0 : case vect_unused_in_scope:
623 0 : relevant = (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def
624 0 : || STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_double_reduction_def) ?
625 : vect_used_in_outer_by_reduction : vect_unused_in_scope;
626 : break;
627 :
628 : case vect_used_by_reduction:
629 : case vect_used_only_live:
630 : relevant = vect_used_in_outer_by_reduction;
631 : break;
632 :
633 : case vect_used_in_scope:
634 2289659 : relevant = vect_used_in_outer;
635 : break;
636 :
637 0 : default:
638 0 : gcc_unreachable ();
639 : }
640 : }
641 : /* We are also not interested in uses on loop PHI backedges that are
642 : inductions. Otherwise we'll needlessly vectorize the IV increment
643 : and cause hybrid SLP for SLP inductions. */
644 2462889 : else if (gimple_code (stmt_vinfo->stmt) == GIMPLE_PHI
645 180603 : && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_induction_def
646 2640456 : && (PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
647 : loop_latch_edge (bb->loop_father))
648 : == use))
649 : {
650 177567 : if (dump_enabled_p ())
651 4868 : dump_printf_loc (MSG_NOTE, vect_location,
652 : "induction value on backedge.\n");
653 177567 : return opt_result::success ();
654 : }
655 : /* General case: propagate the (possibly adjusted) relevance without changing liveness. */
656 2289659 : vect_mark_relevant (worklist, dstmt_vinfo, relevant, false);
657 2289659 : return opt_result::success ();
658 : }
659 :
660 :
661 : /* Function vect_mark_stmts_to_be_vectorized.
662 :
663 : Not all stmts in the loop need to be vectorized. For example:
664 :
665 : for i...
666 : for j...
667 : 1. T0 = i + j
668 : 2. T1 = a[T0]
669 :
670 : 3. j = j + 1
671 :
672 : Stmt 1 and 3 do not need to be vectorized, because loop control and
673 : addressing of vectorized data-refs are handled differently.
674 :
675 : This pass detects such stmts. It returns opt_result::success () or a failure describing why the loop is not vectorized; *FATAL (when FATAL is non-null) is set to false only when the failure comes from processing a gather/scatter offset. */
676 :
677 : opt_result
678 432442 : vect_mark_stmts_to_be_vectorized (loop_vec_info loop_vinfo, bool *fatal)
679 : {
680 432442 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
681 432442 : basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
682 432442 : unsigned int nbbs = loop->num_nodes;
683 432442 : gimple_stmt_iterator si;
684 432442 : unsigned int i;
685 432442 : basic_block bb;
686 432442 : bool live_p;
687 432442 : enum vect_relevant relevant;
688 :
689 432442 : DUMP_VECT_SCOPE ("vect_mark_stmts_to_be_vectorized");
690 :
691 432442 : auto_vec<stmt_vec_info, 64> worklist;
692 :
693 : /* 1. Init worklist. */
694 1464245 : for (i = 0; i < nbbs; i++)
695 : {
696 1042035 : bb = bbs[i];
697 2142236 : for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
698 : {
699 2220578 : if (virtual_operand_p (gimple_phi_result (gsi_stmt (si))))
700 230178 : continue;
701 880111 : stmt_vec_info phi_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
702 880111 : if (dump_enabled_p ())
703 41647 : dump_printf_loc (MSG_NOTE, vect_location, "init: phi relevant? %G",
704 : phi_info->stmt);
705 :
706 880111 : if (vect_stmt_relevant_p (phi_info, loop_vinfo, &relevant, &live_p))
707 : {
708 44509 : if (STMT_VINFO_DEF_TYPE (phi_info) == vect_unknown_def_type)
709 10088 : return opt_result::failure_at
710 10088 : (*si, "not vectorized: unhandled relevant PHI: %G", *si);
711 34421 : vect_mark_relevant (&worklist, phi_info, relevant, live_p);
712 : }
713 : }
714 8252656 : for (si = gsi_after_labels (bb); !gsi_end_p (si); gsi_next (&si))
715 : {
716 7220853 : gimple *stmt = gsi_stmt (si);
717 7220853 : if (is_gimple_debug (stmt))
718 2940919 : continue;
719 4279934 : stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
720 4279934 : if (dump_enabled_p ())
721 223156 : dump_printf_loc (MSG_NOTE, vect_location,
722 : "init: stmt relevant? %G", stmt);
723 : /* Stmts without a LHS are only accepted when they are conditions or calls. */
724 4279934 : if (gimple_get_lhs (stmt) == NULL_TREE
725 611898 : && !is_a <gcond *> (stmt)
726 4286233 : && !is_a <gcall *> (stmt))
727 144 : return opt_result::failure_at
728 144 : (stmt, "not vectorized: irregular stmt: %G", stmt);
729 :
730 4279790 : if (vect_stmt_relevant_p (stmt_info, loop_vinfo, &relevant, &live_p))
731 800040 : vect_mark_relevant (&worklist, stmt_info, relevant, live_p);
732 : }
733 : }
734 :
735 : /* 2. Process_worklist */
736 3177624 : while (worklist.length () > 0)
737 : {
738 2790218 : use_operand_p use_p;
739 2790218 : ssa_op_iter iter;
740 :
741 2790218 : stmt_vec_info stmt_vinfo = worklist.pop ();
742 2790218 : if (dump_enabled_p ())
743 143950 : dump_printf_loc (MSG_NOTE, vect_location,
744 : "worklist: examine stmt: %G", stmt_vinfo->stmt);
745 :
746 : /* Examine the USEs of STMT. For each USE, mark the stmt that defines it
747 : (DEF_STMT) as relevant/irrelevant according to the relevance property
748 : of STMT. */
749 2790218 : relevant = STMT_VINFO_RELEVANT (stmt_vinfo);
750 :
751 : /* Generally, the relevance property of STMT (in STMT_VINFO_RELEVANT) is
752 : propagated as is to the DEF_STMTs of its USEs.
753 :
754 : One exception is when STMT has been identified as defining a reduction
755 : variable; in this case we set the relevance to vect_used_by_reduction.
756 : This is because we distinguish between two kinds of relevant stmts -
757 : those that are used by a reduction computation, and those that are
758 : (also) used by a regular computation. This allows us later on to
759 : identify stmts that are used solely by a reduction, and therefore the
760 : order of the results that they produce does not have to be kept. */
761 : /* The switch below only validates RELEVANT against the def type of STMT and fails on unsupported combinations; it does not modify RELEVANT. */
762 2790218 : switch (STMT_VINFO_DEF_TYPE (stmt_vinfo))
763 : {
764 171769 : case vect_reduction_def:
765 171769 : gcc_assert (relevant != vect_unused_in_scope);
766 171769 : if (relevant != vect_unused_in_scope
767 171769 : && relevant != vect_used_in_scope
768 171769 : && relevant != vect_used_by_reduction
769 171769 : && relevant != vect_used_only_live)
770 0 : return opt_result::failure_at
771 0 : (stmt_vinfo->stmt, "unsupported use of reduction.\n");
772 : break;
773 :
774 2209 : case vect_nested_cycle:
775 2209 : if (relevant != vect_unused_in_scope
776 2209 : && relevant != vect_used_in_outer_by_reduction
777 1614 : && relevant != vect_used_in_outer)
778 2 : return opt_result::failure_at
779 2 : (stmt_vinfo->stmt, "unsupported use of nested cycle.\n");
780 : break;
781 :
782 1209 : case vect_double_reduction_def:
783 1209 : if (relevant != vect_unused_in_scope
784 1209 : && relevant != vect_used_by_reduction
785 409 : && relevant != vect_used_only_live)
786 0 : return opt_result::failure_at
787 0 : (stmt_vinfo->stmt, "unsupported use of double reduction.\n");
788 : break;
789 :
790 : default:
791 : break;
792 : }
793 :
794 2790216 : if (is_pattern_stmt_p (stmt_vinfo))
795 : {
796 : /* Pattern statements are not inserted into the code, so
797 : FOR_EACH_PHI_OR_STMT_USE optimizes their operands out, and we
798 : have to scan the RHS or function arguments instead. */
799 624764 : if (gassign *assign = dyn_cast <gassign *> (stmt_vinfo->stmt))
800 : {
801 406979 : enum tree_code rhs_code = gimple_assign_rhs_code (assign);
802 406979 : tree op = gimple_assign_rhs1 (assign);
803 : /* I indexes the first operand still to be scanned; an embedded COND_EXPR comparison is processed separately and then skipped. */
804 406979 : i = 1;
805 406979 : if (rhs_code == COND_EXPR && COMPARISON_CLASS_P (op))
806 : {
807 0 : opt_result res
808 0 : = process_use (stmt_vinfo, TREE_OPERAND (op, 0),
809 : loop_vinfo, relevant, &worklist, false);
810 0 : if (!res)
811 0 : return res;
812 0 : res = process_use (stmt_vinfo, TREE_OPERAND (op, 1),
813 : loop_vinfo, relevant, &worklist, false);
814 0 : if (!res)
815 0 : return res;
816 : i = 2;
817 : }
818 1171658 : for (; i < gimple_num_ops (assign); i++)
819 : {
820 768433 : op = gimple_op (assign, i);
821 768433 : if (TREE_CODE (op) == SSA_NAME)
822 : {
823 584446 : opt_result res
824 584446 : = process_use (stmt_vinfo, op, loop_vinfo, relevant,
825 : &worklist, false);
826 584446 : if (!res)
827 3754 : return res;
828 : }
829 : }
830 : }
831 217785 : else if (gcond *cond = dyn_cast <gcond *> (stmt_vinfo->stmt))
832 : {
833 211158 : tree_code rhs_code = gimple_cond_code (cond);
834 211158 : gcc_assert (TREE_CODE_CLASS (rhs_code) == tcc_comparison);
835 211158 : opt_result res
836 211158 : = process_use (stmt_vinfo, gimple_cond_lhs (cond),
837 : loop_vinfo, relevant, &worklist, false);
838 211158 : if (!res)
839 34804 : return res;
840 211158 : res = process_use (stmt_vinfo, gimple_cond_rhs (cond),
841 : loop_vinfo, relevant, &worklist, false);
842 211158 : if (!res)
843 0 : return res;
844 : }
845 6627 : else if (gcall *call = dyn_cast <gcall *> (stmt_vinfo->stmt))
846 : {
847 31687 : for (i = 0; i < gimple_call_num_args (call); i++)
848 : {
849 25060 : tree arg = gimple_call_arg (call, i);
850 25060 : opt_result res
851 25060 : = process_use (stmt_vinfo, arg, loop_vinfo, relevant,
852 : &worklist, false);
853 25060 : if (!res)
854 0 : return res;
855 : }
856 : }
857 : else
858 0 : gcc_unreachable ();
859 : }
860 : else
861 7582949 : FOR_EACH_PHI_OR_STMT_USE (use_p, stmt_vinfo->stmt, iter, SSA_OP_USE)
862 : {
863 3269900 : tree op = USE_FROM_PTR (use_p);
864 3269900 : opt_result res
865 3269900 : = process_use (stmt_vinfo, op, loop_vinfo, relevant,
866 : &worklist, false);
867 3269900 : if (!res)
868 17855 : return res;
869 : }
870 : /* The offset of a gather/scatter is processed with FORCE set, i.e. without the non-indexing-operand check. */
871 2768607 : if (STMT_VINFO_GATHER_SCATTER_P (stmt_vinfo))
872 : {
873 55963 : gather_scatter_info gs_info;
874 55963 : if (!vect_check_gather_scatter (stmt_vinfo,
875 : STMT_VINFO_VECTYPE (stmt_vinfo),
876 : loop_vinfo, &gs_info))
877 0 : gcc_unreachable ();
878 55963 : opt_result res
879 55963 : = process_use (stmt_vinfo, gs_info.offset, loop_vinfo, relevant,
880 : &worklist, true);
881 55963 : if (!res)
882 : {
883 13193 : if (fatal)
884 13193 : *fatal = false;
885 13193 : return res;
886 : }
887 : }
888 : } /* while worklist */
889 :
890 387406 : return opt_result::success ();
891 432442 : }
892 :
893 : /* Function vect_model_simple_cost.
894 :
895 : Models cost for simple operations, i.e. those that only emit N operations
896 : of the same KIND. */
897 :
898 : static void
899 773633 : vect_model_simple_cost (vec_info *vinfo, int n, slp_tree node,
900 : stmt_vector_for_cost *cost_vec,
901 : vect_cost_for_stmt kind = vector_stmt)
902 : {
903 773633 : int inside_cost = 0, prologue_cost = 0;
904 :
905 773633 : gcc_assert (cost_vec != NULL);
906 :
907 773633 : n *= vect_get_num_copies (vinfo, node);
908 :
909 : /* Pass the inside-of-loop statements to the target-specific cost model. */
910 773633 : inside_cost += record_stmt_cost (cost_vec, n, kind, node, 0, vect_body);
911 :
912 773633 : if (dump_enabled_p ())
913 33237 : dump_printf_loc (MSG_NOTE, vect_location,
914 : "vect_model_simple_cost: inside_cost = %d, "
915 : "prologue_cost = %d .\n", inside_cost, prologue_cost);
916 773633 : }
917 :
918 :
919 : /* Model cost for type demotion and promotion operations. PWR is
920 : normally zero for single-step promotions and demotions. It will be
921 : one if two-step promotion/demotion is required, and so on. NCOPIES
922 : is the number of vector results (and thus number of instructions)
923 : for the narrowest end of the operation chain. Each additional
924 : step doubles the number of instructions required. If WIDEN_ARITH
925 : is true the stmt is doing widening arithmetic. */
926 :
927 : static void
928 68423 : vect_model_promotion_demotion_cost (slp_tree slp_node,
929 : unsigned int ncopies, int pwr,
930 : stmt_vector_for_cost *cost_vec,
931 : bool widen_arith)
932 : {
933 68423 : int i;
934 68423 : int inside_cost = 0, prologue_cost = 0;
935 :
936 159665 : for (i = 0; i < pwr + 1; i++)
937 : {
938 180766 : inside_cost += record_stmt_cost (cost_vec, ncopies,
939 : widen_arith
940 : ? vector_stmt : vec_promote_demote,
941 : slp_node, 0, vect_body);
942 91242 : ncopies *= 2;
943 : }
944 :
945 68423 : if (dump_enabled_p ())
946 6384 : dump_printf_loc (MSG_NOTE, vect_location,
947 : "vect_model_promotion_demotion_cost: inside_cost = %d, "
948 : "prologue_cost = %d .\n", inside_cost, prologue_cost);
949 68423 : }
950 :
951 : /* Returns true if the current function returns DECL. */
952 :
953 : static bool
954 557674 : cfun_returns (tree decl)
955 : {
956 557674 : edge_iterator ei;
957 557674 : edge e;
958 1098083 : FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
959 : {
960 1104162 : greturn *ret = safe_dyn_cast <greturn *> (*gsi_last_bb (e->src));
961 552081 : if (!ret)
962 0 : continue;
963 552081 : if (gimple_return_retval (ret) == decl)
964 : return true;
965 : /* We often end up with an aggregate copy to the result decl,
966 : handle that case as well. First skip intermediate clobbers
967 : though. */
968 : gimple *def = ret;
969 1666124 : do
970 : {
971 3332248 : def = SSA_NAME_DEF_STMT (gimple_vuse (def));
972 : }
973 1666124 : while (gimple_clobber_p (def));
974 541110 : if (is_a <gassign *> (def)
975 61468 : && gimple_assign_lhs (def) == gimple_return_retval (ret)
976 548166 : && gimple_assign_rhs1 (def) == decl)
977 : return true;
978 : }
979 : return false;
980 : }
981 :
982 : /* Calculate cost of DR's memory access. */
983 : void
984 1010004 : vect_get_store_cost (vec_info *, stmt_vec_info stmt_info, slp_tree slp_node,
985 : int ncopies, dr_alignment_support alignment_support_scheme,
986 : int misalignment,
987 : unsigned int *inside_cost,
988 : stmt_vector_for_cost *body_cost_vec)
989 : {
990 1010004 : tree vectype
991 1010004 : = slp_node ? SLP_TREE_VECTYPE (slp_node) : STMT_VINFO_VECTYPE (stmt_info);
992 1010004 : switch (alignment_support_scheme)
993 : {
994 552072 : case dr_aligned:
995 552072 : {
996 552072 : *inside_cost += record_stmt_cost (body_cost_vec, ncopies,
997 : vector_store, stmt_info, slp_node,
998 : vectype, 0, vect_body);
999 :
1000 552072 : if (dump_enabled_p ())
1001 14515 : dump_printf_loc (MSG_NOTE, vect_location,
1002 : "vect_model_store_cost: aligned.\n");
1003 : break;
1004 : }
1005 :
1006 457932 : case dr_unaligned_supported:
1007 457932 : {
1008 : /* Here, we assign an additional cost for the unaligned store. */
1009 457932 : *inside_cost += record_stmt_cost (body_cost_vec, ncopies,
1010 : unaligned_store, stmt_info, slp_node,
1011 : vectype, misalignment, vect_body);
1012 457932 : if (dump_enabled_p ())
1013 12920 : dump_printf_loc (MSG_NOTE, vect_location,
1014 : "vect_model_store_cost: unaligned supported by "
1015 : "hardware.\n");
1016 : break;
1017 : }
1018 :
1019 0 : case dr_unaligned_unsupported:
1020 0 : {
1021 0 : *inside_cost = VECT_MAX_COST;
1022 :
1023 0 : if (dump_enabled_p ())
1024 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1025 : "vect_model_store_cost: unsupported access.\n");
1026 : break;
1027 : }
1028 :
1029 0 : default:
1030 0 : gcc_unreachable ();
1031 : }
1032 1010004 : }
1033 :
1034 : /* Calculate cost of DR's memory access. */
1035 : void
1036 923672 : vect_get_load_cost (vec_info *, stmt_vec_info stmt_info, slp_tree slp_node,
1037 : int ncopies, dr_alignment_support alignment_support_scheme,
1038 : int misalignment,
1039 : bool add_realign_cost, unsigned int *inside_cost,
1040 : unsigned int *prologue_cost,
1041 : stmt_vector_for_cost *prologue_cost_vec,
1042 : stmt_vector_for_cost *body_cost_vec,
1043 : bool record_prologue_costs)
1044 : {
1045 923672 : tree vectype
1046 923672 : = slp_node ? SLP_TREE_VECTYPE (slp_node) : STMT_VINFO_VECTYPE (stmt_info);
1047 923672 : switch (alignment_support_scheme)
1048 : {
1049 525419 : case dr_aligned:
1050 525419 : {
1051 525419 : *inside_cost += record_stmt_cost (body_cost_vec, ncopies, vector_load,
1052 : stmt_info, slp_node, vectype,
1053 : 0, vect_body);
1054 :
1055 525419 : if (dump_enabled_p ())
1056 18859 : dump_printf_loc (MSG_NOTE, vect_location,
1057 : "vect_model_load_cost: aligned.\n");
1058 :
1059 : break;
1060 : }
1061 342294 : case dr_unaligned_supported:
1062 342294 : {
1063 : /* Here, we assign an additional cost for the unaligned load. */
1064 342294 : *inside_cost += record_stmt_cost (body_cost_vec, ncopies,
1065 : unaligned_load, stmt_info, slp_node,
1066 : vectype, misalignment, vect_body);
1067 :
1068 342294 : if (dump_enabled_p ())
1069 22307 : dump_printf_loc (MSG_NOTE, vect_location,
1070 : "vect_model_load_cost: unaligned supported by "
1071 : "hardware.\n");
1072 :
1073 : break;
1074 : }
1075 0 : case dr_explicit_realign:
1076 0 : {
1077 0 : *inside_cost += record_stmt_cost (body_cost_vec, ncopies * 2,
1078 : vector_load, stmt_info, slp_node,
1079 : vectype, 0, vect_body);
1080 0 : *inside_cost += record_stmt_cost (body_cost_vec, ncopies,
1081 : vec_perm, stmt_info, slp_node,
1082 : vectype, 0, vect_body);
1083 :
1084 : /* FIXME: If the misalignment remains fixed across the iterations of
1085 : the containing loop, the following cost should be added to the
1086 : prologue costs. */
1087 0 : if (targetm.vectorize.builtin_mask_for_load)
1088 0 : *inside_cost += record_stmt_cost (body_cost_vec, 1, vector_stmt,
1089 : stmt_info, slp_node, vectype,
1090 : 0, vect_body);
1091 :
1092 0 : if (dump_enabled_p ())
1093 0 : dump_printf_loc (MSG_NOTE, vect_location,
1094 : "vect_model_load_cost: explicit realign\n");
1095 :
1096 : break;
1097 : }
1098 0 : case dr_explicit_realign_optimized:
1099 0 : {
1100 0 : if (dump_enabled_p ())
1101 0 : dump_printf_loc (MSG_NOTE, vect_location,
1102 : "vect_model_load_cost: unaligned software "
1103 : "pipelined.\n");
1104 :
1105 : /* Unaligned software pipeline has a load of an address, an initial
1106 : load, and possibly a mask operation to "prime" the loop. However,
1107 : if this is an access in a group of loads, which provide grouped
1108 : access, then the above cost should only be considered for one
1109 : access in the group. Inside the loop, there is a load op
1110 : and a realignment op. */
1111 :
1112 0 : if (add_realign_cost && record_prologue_costs)
1113 : {
1114 0 : *prologue_cost += record_stmt_cost (prologue_cost_vec, 2,
1115 : vector_stmt, stmt_info,
1116 : slp_node, vectype,
1117 : 0, vect_prologue);
1118 0 : if (targetm.vectorize.builtin_mask_for_load)
1119 0 : *prologue_cost += record_stmt_cost (prologue_cost_vec, 1,
1120 : vector_stmt, stmt_info,
1121 : slp_node, vectype,
1122 : 0, vect_prologue);
1123 : }
1124 :
1125 0 : *inside_cost += record_stmt_cost (body_cost_vec, ncopies, vector_load,
1126 : stmt_info, slp_node, vectype,
1127 : 0, vect_body);
1128 0 : *inside_cost += record_stmt_cost (body_cost_vec, ncopies, vec_perm,
1129 : stmt_info, slp_node, vectype,
1130 : 0, vect_body);
1131 :
1132 0 : if (dump_enabled_p ())
1133 0 : dump_printf_loc (MSG_NOTE, vect_location,
1134 : "vect_model_load_cost: explicit realign optimized"
1135 : "\n");
1136 :
1137 : break;
1138 : }
1139 :
1140 55959 : case dr_unaligned_unsupported:
1141 55959 : {
1142 55959 : *inside_cost = VECT_MAX_COST;
1143 :
1144 55959 : if (dump_enabled_p ())
1145 104 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1146 : "vect_model_load_cost: unsupported access.\n");
1147 : break;
1148 : }
1149 :
1150 0 : default:
1151 0 : gcc_unreachable ();
1152 : }
1153 923672 : }
1154 :
1155 : /* Insert the new stmt NEW_STMT at *GSI or at the appropriate place in
1156 : the loop preheader for the vectorized stmt STMT_VINFO. */
1157 :
1158 : static void
1159 6645 : vect_init_vector_1 (vec_info *vinfo, stmt_vec_info stmt_vinfo, gimple *new_stmt,
1160 : gimple_stmt_iterator *gsi)
1161 : {
1162 6645 : if (gsi)
1163 3346 : vect_finish_stmt_generation (vinfo, stmt_vinfo, new_stmt, gsi);
1164 : else
1165 3299 : vinfo->insert_on_entry (stmt_vinfo, new_stmt);
1166 :
1167 6645 : if (dump_enabled_p ())
1168 1815 : dump_printf_loc (MSG_NOTE, vect_location,
1169 : "created new init_stmt: %G", new_stmt);
1170 6645 : }
1171 :
1172 : /* Function vect_init_vector.
1173 :
1174 : Insert a new stmt (INIT_STMT) that initializes a new variable of type
1175 : TYPE with the value VAL. If TYPE is a vector type and VAL does not have
1176 : vector type a vector with all elements equal to VAL is created first.
1177 : Place the initialization at GSI if it is not NULL. Otherwise, place the
1178 : initialization at the loop preheader.
1179 : Return the DEF of INIT_STMT.
1180 : It will be used in the vectorization of STMT_INFO. */
1181 :
1182 : tree
1183 4926 : vect_init_vector (vec_info *vinfo, stmt_vec_info stmt_info, tree val, tree type,
1184 : gimple_stmt_iterator *gsi)
1185 : {
1186 4926 : gimple *init_stmt;
1187 4926 : tree new_temp;
1188 :
1189 : /* We abuse this function to push sth to a SSA name with initial 'val'. */
1190 4926 : if (! useless_type_conversion_p (type, TREE_TYPE (val)))
1191 : {
1192 1335 : gcc_assert (VECTOR_TYPE_P (type));
1193 1335 : if (! types_compatible_p (TREE_TYPE (type), TREE_TYPE (val)))
1194 : {
1195 : /* Scalar boolean value should be transformed into
1196 : all zeros or all ones value before building a vector. */
1197 11 : if (VECTOR_BOOLEAN_TYPE_P (type))
1198 : {
1199 3 : tree true_val = build_all_ones_cst (TREE_TYPE (type));
1200 3 : tree false_val = build_zero_cst (TREE_TYPE (type));
1201 :
1202 3 : if (CONSTANT_CLASS_P (val))
1203 0 : val = integer_zerop (val) ? false_val : true_val;
1204 : else
1205 : {
1206 3 : new_temp = make_ssa_name (TREE_TYPE (type));
1207 3 : init_stmt = gimple_build_assign (new_temp, COND_EXPR,
1208 : val, true_val, false_val);
1209 3 : vect_init_vector_1 (vinfo, stmt_info, init_stmt, gsi);
1210 3 : val = new_temp;
1211 : }
1212 : }
1213 : else
1214 : {
1215 8 : gimple_seq stmts = NULL;
1216 8 : if (! INTEGRAL_TYPE_P (TREE_TYPE (val)))
1217 8 : val = gimple_build (&stmts, VIEW_CONVERT_EXPR,
1218 8 : TREE_TYPE (type), val);
1219 : else
1220 : /* ??? Condition vectorization expects us to do
1221 : promotion of invariant/external defs. */
1222 0 : val = gimple_convert (&stmts, TREE_TYPE (type), val);
1223 16 : for (gimple_stmt_iterator gsi2 = gsi_start (stmts);
1224 16 : !gsi_end_p (gsi2); )
1225 : {
1226 8 : init_stmt = gsi_stmt (gsi2);
1227 8 : gsi_remove (&gsi2, false);
1228 8 : vect_init_vector_1 (vinfo, stmt_info, init_stmt, gsi);
1229 : }
1230 : }
1231 : }
1232 1335 : val = build_vector_from_val (type, val);
1233 : }
1234 :
1235 4926 : new_temp = vect_get_new_ssa_name (type, vect_simple_var, "cst_");
1236 4926 : init_stmt = gimple_build_assign (new_temp, val);
1237 4926 : vect_init_vector_1 (vinfo, stmt_info, init_stmt, gsi);
1238 4926 : return new_temp;
1239 : }
1240 :
1241 :
1242 : /* Get vectorized definitions for OP0 and OP1. */
1243 :
1244 : void
1245 187560 : vect_get_vec_defs (vec_info *, slp_tree slp_node,
1246 : tree op0, vec<tree> *vec_oprnds0,
1247 : tree op1, vec<tree> *vec_oprnds1,
1248 : tree op2, vec<tree> *vec_oprnds2,
1249 : tree op3, vec<tree> *vec_oprnds3)
1250 : {
1251 187560 : if (op0)
1252 185910 : vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[0], vec_oprnds0);
1253 187560 : if (op1)
1254 138090 : vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[1], vec_oprnds1);
1255 187560 : if (op2)
1256 9265 : vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[2], vec_oprnds2);
1257 187560 : if (op3)
1258 0 : vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[3], vec_oprnds3);
1259 187560 : }
1260 :
1261 : /* Helper function called by vect_finish_replace_stmt and
1262 : vect_finish_stmt_generation. Set the location of the new
1263 : statement and create and return a stmt_vec_info for it. */
1264 :
1265 : static void
1266 1435319 : vect_finish_stmt_generation_1 (vec_info *,
1267 : stmt_vec_info stmt_info, gimple *vec_stmt)
1268 : {
1269 1435319 : if (dump_enabled_p ())
1270 148123 : dump_printf_loc (MSG_NOTE, vect_location, "add new stmt: %G", vec_stmt);
1271 :
1272 1435319 : if (stmt_info)
1273 : {
1274 1404106 : gimple_set_location (vec_stmt, gimple_location (stmt_info->stmt));
1275 :
1276 : /* While EH edges will generally prevent vectorization, stmt might
1277 : e.g. be in a must-not-throw region. Ensure newly created stmts
1278 : that could throw are part of the same region. */
1279 1404106 : int lp_nr = lookup_stmt_eh_lp (stmt_info->stmt);
1280 1404106 : if (lp_nr != 0 && stmt_could_throw_p (cfun, vec_stmt))
1281 48 : add_stmt_to_eh_lp (vec_stmt, lp_nr);
1282 : }
1283 : else
1284 31213 : gcc_assert (!stmt_could_throw_p (cfun, vec_stmt));
1285 1435319 : }
1286 :
1287 : /* Replace the scalar statement STMT_INFO with a new vector statement VEC_STMT,
1288 : which sets the same scalar result as STMT_INFO did. Create and return a
1289 : stmt_vec_info for VEC_STMT. */
1290 :
1291 : void
1292 895 : vect_finish_replace_stmt (vec_info *vinfo,
1293 : stmt_vec_info stmt_info, gimple *vec_stmt)
1294 : {
1295 895 : gimple *scalar_stmt = vect_orig_stmt (stmt_info)->stmt;
1296 895 : gcc_assert (gimple_get_lhs (scalar_stmt) == gimple_get_lhs (vec_stmt));
1297 :
1298 895 : gimple_stmt_iterator gsi = gsi_for_stmt (scalar_stmt);
1299 895 : gsi_replace (&gsi, vec_stmt, true);
1300 :
1301 895 : vect_finish_stmt_generation_1 (vinfo, stmt_info, vec_stmt);
1302 895 : }
1303 :
1304 : /* Add VEC_STMT to the vectorized implementation of STMT_INFO and insert it
1305 : before *GSI. STMT_INFO may be null; if it is not, its statement must not
       be a label.  When *GSI is not at its end, VEC_STMT has memory operands
       and the statement at *GSI has an SSA virtual use, VEC_STMT is given the
       same virtual use.  Nothing is returned; the remaining bookkeeping is
       done by vect_finish_stmt_generation_1. */
1306 :
1307 : void
1308 1434424 : vect_finish_stmt_generation (vec_info *vinfo,
1309 : stmt_vec_info stmt_info, gimple *vec_stmt,
1310 : gimple_stmt_iterator *gsi)
1311 : {
1312 1434424 : gcc_assert (!stmt_info || gimple_code (stmt_info->stmt) != GIMPLE_LABEL);
1313 :
1314 1434424 : if (!gsi_end_p (*gsi)
1315 2867575 : && gimple_has_mem_ops (vec_stmt))
1316 : {
1317 1433151 : gimple *at_stmt = gsi_stmt (*gsi);
1318 1433151 : tree vuse = gimple_vuse (at_stmt);
1319 1426723 : if (vuse && TREE_CODE (vuse) == SSA_NAME)
1320 : {
1321 1285298 : tree vdef = gimple_vdef (at_stmt);
1322 1285298 : gimple_set_vuse (vec_stmt, gimple_vuse (at_stmt));
1323 1285298 : gimple_set_modified (vec_stmt, true);
1324 : /* If we have an SSA vuse and insert a store, update virtual
1325 : SSA form to avoid triggering the renamer. Do so only
1326 : if we can easily see all uses - which is what almost always
1327 : happens with the way vectorized stmts are inserted. */
       /* The condition below holds when AT_STMT has an SSA virtual
	  definition and VEC_STMT is either an assignment whose LHS is not
	  a register, or a call that is not flagged const/pure/novops or
	  that has a non-register LHS. */
1328 752688 : if ((vdef && TREE_CODE (vdef) == SSA_NAME)
1329 2037950 : && ((is_gimple_assign (vec_stmt)
1330 751781 : && !is_gimple_reg (gimple_assign_lhs (vec_stmt)))
1331 64960 : || (is_gimple_call (vec_stmt)
1332 871 : && (!(gimple_call_flags (vec_stmt)
1333 871 : & (ECF_CONST|ECF_PURE|ECF_NOVOPS))
1334 1 : || (gimple_call_lhs (vec_stmt)
1335 1 : && !is_gimple_reg (gimple_call_lhs (vec_stmt)))))))
1336 : {
       /* Give VEC_STMT a new virtual definition copied from VUSE and
	  make AT_STMT use that new definition. */
1337 688562 : tree new_vdef = copy_ssa_name (vuse, vec_stmt);
1338 688562 : gimple_set_vdef (vec_stmt, new_vdef);
1339 688562 : SET_USE (gimple_vuse_op (at_stmt), new_vdef);
1340 : }
1341 : }
1342 : }
1343 1434424 : gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
1344 1434424 : vect_finish_stmt_generation_1 (vinfo, stmt_info, vec_stmt);
1345 : }
1346 :
1347 : /* We want to vectorize a call to combined function CFN with function
1348 : decl FNDECL, using VECTYPE_OUT as the type of the output and VECTYPE_IN
1349 : as the types of all inputs. Check whether this is possible using
1350 : an internal function, returning its code if so or IFN_LAST if not. */
1351 :
1352 : static internal_fn
1353 16255 : vectorizable_internal_function (combined_fn cfn, tree fndecl,
1354 : tree vectype_out, tree vectype_in)
1355 : {
1356 16255 : internal_fn ifn;
1357 16255 : if (internal_fn_p (cfn))
1358 13813 : ifn = as_internal_fn (cfn);
1359 : else
1360 2442 : ifn = associated_internal_fn (fndecl);
1361 16255 : if (ifn != IFN_LAST && direct_internal_fn_p (ifn))
1362 : {
1363 12863 : const direct_internal_fn_info &info = direct_internal_fn (ifn);
1364 12863 : if (info.vectorizable)
1365 : {
1366 12863 : bool same_size_p = TYPE_SIZE (vectype_in) == TYPE_SIZE (vectype_out);
1367 12863 : tree type0 = (info.type0 < 0 ? vectype_out : vectype_in);
1368 12863 : tree type1 = (info.type1 < 0 ? vectype_out : vectype_in);
1369 :
1370 : /* The type size of both the vectype_in and vectype_out should be
1371 : exactly the same when vectype_out isn't participating the optab.
1372 : While there is no restriction for type size when vectype_out
1373 : is part of the optab query. */
1374 12863 : if (type0 != vectype_out && type1 != vectype_out && !same_size_p)
1375 : return IFN_LAST;
1376 :
1377 12843 : if (direct_internal_fn_supported_p (ifn, tree_pair (type0, type1),
1378 : OPTIMIZE_FOR_SPEED))
1379 : return ifn;
1380 : }
1381 : }
1382 : return IFN_LAST;
1383 : }
1384 :
1385 :
1386 : static tree permute_vec_elements (vec_info *, tree, tree, tree, stmt_vec_info,
1387 : gimple_stmt_iterator *);
1388 :
1389 : /* Check whether a load or store statement in the loop described by
1390 : LOOP_VINFO is possible in a loop using partial vectors. This is
1391 : testing whether the vectorizer pass has the appropriate support,
1392 : as well as whether the target does.
1393 :
1394 : VLS_TYPE says whether the statement is a load or store and VECTYPE
1395 : is the type of the vector being loaded or stored. SLP_NODE is the SLP
1396 : node that contains the statement. LS describes how the load or store is
1397 : going to be implemented: its memory access type and, for gather/scatter
1398 : accesses, the offset vector type and scale. GROUP_SIZE is the number of
1399 : load or store statements in the containing group. If the load or store
1400 : is conditional, MASK_NODE is the SLP node of its mask; a uniform scalar
1401 : mask is only recorded when all lanes of MASK_NODE share one definition.
1402 :
1403 : Clear LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P if a loop using partial
1404 : vectors is not supported, otherwise record the required rgroup control
1405 : types.
1406 :
1407 : If partial vectors can be used and ELSVALS is nonzero the supported
1408 : else values will be added to the vector ELSVALS points to. */
1409 :
1410 : static void
1411 288822 : check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
1412 : slp_tree slp_node,
1413 : vec_load_store_type vls_type,
1414 : int group_size,
1415 : vect_load_store_data *ls,
1416 : slp_tree mask_node,
1417 : vec<int> *elsvals = nullptr)
1418 : {
1419 288822 : vect_memory_access_type memory_access_type = ls->memory_access_type;
1420 :
1421 : /* Invariant loads need no special support. */
1422 288822 : if (memory_access_type == VMAT_INVARIANT)
1423 29099 : return;
1424 :
1425 : /* Figure whether the mask is uniform. scalar_mask is used to
1426 : populate the scalar_cond_masked_set. */
1427 287617 : tree scalar_mask = NULL_TREE;
1428 287617 : if (mask_node)
1429 4968 : for (unsigned i = 0; i < SLP_TREE_LANES (mask_node); ++i)
1430 : {
1431 2535 : tree def = vect_get_slp_scalar_def (mask_node, i);
       /* Give up on a uniform mask as soon as a lane has no scalar def
	  or differs from the lanes seen so far. */
1432 2535 : if (!def
1433 2535 : || (scalar_mask && def != scalar_mask))
1434 : {
1435 : scalar_mask = NULL;
1436 : break;
1437 : }
1438 : else
1439 2504 : scalar_mask = def;
1440 : }
1441 :
1442 287617 : unsigned int nvectors = vect_get_num_copies (loop_vinfo, slp_node);
1443 287617 : vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1444 287617 : vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
1445 287617 : machine_mode vecmode = TYPE_MODE (vectype);
1446 287617 : bool is_load = (vls_type == VLS_LOAD);
       /* Load/store-lanes accesses: record a length or a mask depending on
	  which internal function the target provides, else give up. */
1447 287617 : if (memory_access_type == VMAT_LOAD_STORE_LANES)
1448 : {
1449 0 : nvectors /= group_size;
1450 0 : internal_fn ifn
1451 0 : = (is_load ? vect_load_lanes_supported (vectype, group_size, true,
1452 : elsvals)
1453 0 : : vect_store_lanes_supported (vectype, group_size, true));
1454 0 : if (ifn == IFN_MASK_LEN_LOAD_LANES || ifn == IFN_MASK_LEN_STORE_LANES)
1455 0 : vect_record_loop_len (loop_vinfo, lens, nvectors, vectype, 1);
1456 0 : else if (ifn == IFN_MASK_LOAD_LANES || ifn == IFN_MASK_STORE_LANES)
1457 0 : vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype,
1458 : scalar_mask);
1459 : else
1460 : {
1461 0 : if (dump_enabled_p ())
1462 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1463 : "can't operate on partial vectors because"
1464 : " the target doesn't have an appropriate"
1465 : " load/store-lanes instruction.\n");
1466 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
1467 : }
1468 0 : return;
1469 : }
1470 :
       /* Gather/scatter accesses: the length-controlled variant is tried
	  first, then the mask-controlled one. */
1471 287617 : if (mat_gather_scatter_p (memory_access_type))
1472 : {
1473 1731 : internal_fn ifn = (is_load
1474 1731 : ? IFN_MASK_GATHER_LOAD
1475 : : IFN_MASK_SCATTER_STORE);
1476 419 : internal_fn len_ifn = (is_load
1477 : ? IFN_MASK_LEN_GATHER_LOAD
1478 : : IFN_MASK_LEN_SCATTER_STORE);
1479 1731 : stmt_vec_info repr = SLP_TREE_REPRESENTATIVE (slp_node);
1480 1731 : tree off_vectype = (STMT_VINFO_GATHER_SCATTER_P (repr)
1481 1731 : ? SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (slp_node)[0])
1482 1731 : : ls->strided_offset_vectype);
1483 1731 : tree memory_type = TREE_TYPE (DR_REF (STMT_VINFO_DR_INFO (repr)->dr));
1484 1731 : int scale = SLP_TREE_GS_SCALE (slp_node);
1485 :
1486 : /* The following "supported" checks just verify what we established in
1487 : get_load_store_type and don't try different offset types.
1488 : Therefore, off_vectype must be a supported offset type. In case
1489 : we chose a different one use this instead. */
1490 1731 : if (ls->supported_offset_vectype)
1491 0 : off_vectype = ls->supported_offset_vectype;
1492 : /* Same for scale. */
1493 1731 : if (ls->supported_scale)
1494 0 : scale = ls->supported_scale;
1495 :
1496 1731 : if (internal_gather_scatter_fn_supported_p (len_ifn, vectype,
1497 : memory_type,
1498 : off_vectype, scale,
1499 : elsvals))
1500 0 : vect_record_loop_len (loop_vinfo, lens, nvectors, vectype, 1);
1501 1731 : else if (internal_gather_scatter_fn_supported_p (ifn, vectype,
1502 : memory_type,
1503 : off_vectype, scale,
1504 : elsvals)
1505 1731 : || memory_access_type == VMAT_GATHER_SCATTER_LEGACY)
1506 566 : vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype,
1507 : scalar_mask);
1508 : else
1509 : {
1510 1165 : if (dump_enabled_p ())
1511 24 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1512 : "can't operate on partial vectors because"
1513 : " the target doesn't have an appropriate"
1514 : " gather load or scatter store instruction.\n");
1515 1165 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
1516 : }
1517 1731 : return;
1518 : }
1519 :
1520 285886 : if (memory_access_type != VMAT_CONTIGUOUS)
1521 : {
1522 : /* Element X of the data must come from iteration i * VF + X of the
1523 : scalar loop. We need more work to support other mappings. */
1524 26163 : if (dump_enabled_p ())
1525 730 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1526 : "can't operate on partial vectors because an"
1527 : " access isn't contiguous.\n");
1528 26163 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
1529 26163 : return;
1530 : }
1531 :
1532 259723 : if (!VECTOR_MODE_P (vecmode))
1533 : {
1534 0 : if (dump_enabled_p ())
1535 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1536 : "can't operate on partial vectors when emulating"
1537 : " vector operations.\n");
1538 0 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
1539 0 : return;
1540 : }
1541 :
1542 : /* We might load more scalars than we need for permuting SLP loads.
1543 : We checked in get_load_store_type that the extra elements
1544 : don't leak into a new vector. */
1545 349394 : auto group_memory_nvectors = [](poly_uint64 size, poly_uint64 nunits)
1546 : {
1547 89671 : unsigned int nvectors;
1548 179342 : if (can_div_away_from_zero_p (size, nunits, &nvectors))
1549 89671 : return nvectors;
1550 : gcc_unreachable ();
1551 : };
1552 :
1553 259723 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
1554 259723 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1555 259723 : machine_mode mask_mode;
1556 259723 : machine_mode vmode;
1557 259723 : bool using_partial_vectors_p = false;
       /* Contiguous accesses: prefer length-controlled loads/stores and
	  fall back to mask-controlled ones. */
1558 259723 : if (get_len_load_store_mode
1559 259723 : (vecmode, is_load, nullptr, elsvals).exists (&vmode))
1560 : {
1561 0 : nvectors = group_memory_nvectors (group_size * vf, nunits);
1562 0 : unsigned factor = (vecmode == vmode) ? 1 : GET_MODE_UNIT_SIZE (vecmode);
1563 0 : vect_record_loop_len (loop_vinfo, lens, nvectors, vectype, factor);
1564 0 : using_partial_vectors_p = true;
1565 : }
1566 349394 : else if (targetm.vectorize.get_mask_mode (vecmode).exists (&mask_mode)
1567 259723 : && can_vec_mask_load_store_p (vecmode, mask_mode, is_load, NULL,
1568 : elsvals))
1569 : {
1570 89671 : nvectors = group_memory_nvectors (group_size * vf, nunits);
1571 89671 : vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype, scalar_mask);
1572 89671 : using_partial_vectors_p = true;
1573 : }
1574 :
1575 89671 : if (!using_partial_vectors_p)
1576 : {
1577 170052 : if (dump_enabled_p ())
1578 11647 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1579 : "can't operate on partial vectors because the"
1580 : " target doesn't have the appropriate partial"
1581 : " vectorization load or store.\n");
1582 170052 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
1583 : }
1584 : }
1585 :
1586 : /* Return the mask input to a masked load or store. VEC_MASK is the vectorized
1587 : form of the scalar mask condition and LOOP_MASK, if nonnull, is the mask
1588 : that needs to be applied to all loads and stores in a vectorized loop.
1589 : Return VEC_MASK if LOOP_MASK is null or if VEC_MASK is already masked,
1590 : otherwise return VEC_MASK & LOOP_MASK.
1591 :
1592 : MASK_TYPE is the type of both masks. If new statements are needed,
1593 : insert them before GSI. */
1594 :
1595 : tree
1596 1701 : prepare_vec_mask (loop_vec_info loop_vinfo, tree mask_type, tree loop_mask,
1597 : tree vec_mask, gimple_stmt_iterator *gsi)
1598 : {
1599 1701 : gcc_assert (useless_type_conversion_p (mask_type, TREE_TYPE (vec_mask)));
1600 1701 : if (!loop_mask)
1601 : return vec_mask;
1602 :
1603 139 : gcc_assert (TREE_TYPE (loop_mask) == mask_type);
1604 :
1605 139 : if (loop_vinfo->vec_cond_masked_set.contains ({ vec_mask, loop_mask }))
1606 : return vec_mask;
1607 :
1608 139 : tree and_res = make_temp_ssa_name (mask_type, NULL, "vec_mask_and");
1609 139 : gimple *and_stmt = gimple_build_assign (and_res, BIT_AND_EXPR,
1610 : vec_mask, loop_mask);
1611 :
1612 139 : gsi_insert_before (gsi, and_stmt, GSI_SAME_STMT);
1613 139 : return and_res;
1614 : }
1615 :
1616 : /* Determine whether we can use a gather load or scatter store to vectorize
1617 : strided load or store STMT_INFO by truncating the current offset to a
1618 : smaller width. We need to be able to construct an offset vector:
1619 :
1620 : { 0, X, X*2, X*3, ... }
1621 :
1622 : without loss of precision, where X is STMT_INFO's DR_STEP.
1623 :
1624 : Return true if this is possible, describing the gather load or scatter
1625 : store in GS_INFO. MASKED_P is true if the load or store is conditional.
       Return false if the step is not an INTEGER_CST or if no candidate
       scale yields an offset type that the target supports.
1626 :
1627 : If we can use gather/scatter and ELSVALS is nonzero the supported
1628 : else values will be stored in the vector ELSVALS points to. */
1629 :
1630 : static bool
1631 63792 : vect_truncate_gather_scatter_offset (stmt_vec_info stmt_info, tree vectype,
1632 : loop_vec_info loop_vinfo, bool masked_p,
1633 : gather_scatter_info *gs_info,
1634 : vec<int> *elsvals)
1635 : {
1636 63792 : dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
1637 63792 : data_reference *dr = dr_info->dr;
1638 63792 : tree step = DR_STEP (dr);
1639 63792 : if (TREE_CODE (step) != INTEGER_CST)
1640 : {
1641 : /* ??? Perhaps we could use range information here? */
1642 28604 : if (dump_enabled_p ())
1643 229 : dump_printf_loc (MSG_NOTE, vect_location,
1644 : "cannot truncate variable step.\n");
1645 28604 : return false;
1646 : }
1647 :
1648 : /* Get the number of bits in an element. */
1649 35188 : scalar_mode element_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype));
1650 35188 : unsigned int element_bits = GET_MODE_BITSIZE (element_mode);
1651 :
1652 : /* Set COUNT to the upper limit on the number of elements - 1.
1653 : Start with the maximum vectorization factor. */
1654 35188 : unsigned HOST_WIDE_INT count = vect_max_vf (loop_vinfo) - 1;
1655 :
1656 : /* Try lowering COUNT to the number of scalar latch iterations. */
1657 35188 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1658 35188 : widest_int max_iters;
1659 35188 : if (max_loop_iterations (loop, &max_iters)
1660 69647 : && max_iters < count)
1661 2069 : count = max_iters.to_shwi ();
1662 :
1663 : /* Try scales of 1 and the element size. */
1664 35188 : unsigned int scales[] = { 1, vect_get_scalar_dr_size (dr_info) };
1665 35188 : wi::overflow_type overflow = wi::OVF_NONE;
1666 105564 : for (int i = 0; i < 2; ++i)
1667 : {
1668 70376 : unsigned int scale = scales[i];
1669 70376 : widest_int factor;
       /* Skip scales that do not divide STEP exactly; on success FACTOR
	  is STEP / SCALE. */
1670 70376 : if (!wi::multiple_of_p (wi::to_widest (step), scale, SIGNED, &factor))
1671 0 : continue;
1672 :
1673 : /* Determine the minimum precision of COUNT * STEP / SCALE; COUNT
       already holds the number of elements minus one. */
1674 70376 : widest_int range = wi::mul (count, factor, SIGNED, &overflow);
1675 70376 : if (overflow)
1676 0 : continue;
1677 70376 : signop sign = range >= 0 ? UNSIGNED : SIGNED;
1678 70376 : unsigned int min_offset_bits = wi::min_precision (range, sign);
1679 :
1680 : /* Find the narrowest viable offset type. */
1681 70376 : unsigned int offset_bits = 1U << ceil_log2 (min_offset_bits);
1682 70376 : tree offset_type = build_nonstandard_integer_type (offset_bits,
1683 : sign == UNSIGNED);
1684 :
1685 : /* See whether the target supports the operation with an offset
1686 : no narrower than OFFSET_TYPE. */
1687 70376 : tree memory_type = TREE_TYPE (DR_REF (dr));
1688 70376 : tree tmp_offset_vectype;
1689 70376 : int tmp_scale;
1690 70376 : if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
1691 : vectype, memory_type, offset_type,
1692 : scale, &tmp_scale,
1693 : &gs_info->ifn, &gs_info->offset_vectype,
1694 : &tmp_offset_vectype, elsvals)
1695 70376 : || gs_info->ifn == IFN_LAST)
1696 70376 : continue;
1697 :
       /* Success: fill in the remaining fields of GS_INFO. */
1698 0 : gs_info->decl = NULL_TREE;
1699 : /* Logically the sum of DR_BASE_ADDRESS, DR_INIT and DR_OFFSET,
1700 : but we don't need to store that here. */
1701 0 : gs_info->base = NULL_TREE;
1702 0 : gs_info->alias_ptr = build_int_cst
1703 0 : (reference_alias_ptr_type (DR_REF (dr)),
1704 0 : get_object_alignment (DR_REF (dr)));
1705 0 : gs_info->element_type = TREE_TYPE (vectype);
1706 0 : gs_info->offset = fold_convert (offset_type, step);
1707 0 : gs_info->scale = scale;
1708 0 : gs_info->memory_type = memory_type;
1709 0 : return true;
1710 140752 : }
1711 :
       /* OVERFLOW holds the result of the last multiplication done in the
	  loop above. */
1712 35188 : if (overflow && dump_enabled_p ())
1713 0 : dump_printf_loc (MSG_NOTE, vect_location,
1714 : "truncating gather/scatter offset to %d bits"
1715 : " might change its value.\n", element_bits);
1716 :
1717 : return false;
1718 35188 : }
1719 :
1720 : /* Return true if we can use gather/scatter or strided internal functions
1721 : to vectorize STMT_INFO, which is a grouped or strided load or store
1722 : with multiple lanes and will be implemented by a type-punned access
1723 : of a vector with element size that matches the number of lanes.
1724 :
1725 : MASKED_P is true if load or store is conditional.
1726 : When returning true, fill in GS_INFO with the information required to
1727 : perform the operation. Also, store the punning type in PUNNED_VECTYPE.
1728 :
1729 : If successful and ELSVALS is nonzero the supported
1730 : else values will be stored in the vector ELSVALS points to. */
1731 :
1732 : static bool
1733 4611 : vect_use_grouped_gather (dr_vec_info *dr_info, tree vectype,
1734 : loop_vec_info loop_vinfo, bool masked_p,
1735 : unsigned int nelts,
1736 : gather_scatter_info *info, vec<int> *elsvals,
1737 : tree *pun_vectype)
1738 : {
1739 4611 : data_reference *dr = dr_info->dr;
1740 :
1741 : /* TODO: We can support nelts > BITS_PER_UNIT or non-power-of-two by
1742 : multiple gathers/scatter. */
1743 8895 : if (nelts > BITS_PER_UNIT || !pow2p_hwi (nelts))
1744 : return false;
1745 :
1746 : /* Pun the vectype with one of the same size but an element spanning
1747 : NELTS elements of VECTYPE.
1748 : The punned type of a V16QI with NELTS = 4 would be V4SI.
1749 : */
1750 3973 : tree tmp;
1751 3973 : unsigned int pieces;
1752 3973 : if (!can_div_trunc_p (TYPE_VECTOR_SUBPARTS (vectype), nelts, &pieces)
1753 3973 : || !pieces)
1754 352 : return false;
1755 :
1756 3621 : *pun_vectype = vector_vector_composition_type (vectype, pieces, &tmp, true);
1757 :
1758 3621 : if (!*pun_vectype || !VECTOR_TYPE_P (*pun_vectype))
1759 : return false;
1760 :
1761 3245 : internal_fn ifn;
1762 3245 : tree offset_vectype = *pun_vectype;
1763 :
1764 2171 : internal_fn strided_ifn = DR_IS_READ (dr)
1765 3245 : ? IFN_MASK_LEN_STRIDED_LOAD : IFN_MASK_LEN_STRIDED_STORE;
1766 :
1767 : /* Check if we have a gather/scatter with the new type. We're just trying
1768 : with the type itself as offset for now. If not, check if we have a
1769 : strided load/store. These have fewer constraints (for example no offset
1770 : type must exist) so it is possible that even though a gather/scatter is
1771 : not available we still have a strided load/store. */
1772 3245 : bool ok = false;
1773 3245 : tree tmp_vectype;
1774 3245 : int tmp_scale;
1775 3245 : if (vect_gather_scatter_fn_p
1776 3245 : (loop_vinfo, DR_IS_READ (dr), masked_p, *pun_vectype,
1777 3245 : TREE_TYPE (*pun_vectype), *pun_vectype, 1, &tmp_scale, &ifn,
1778 : &offset_vectype, &tmp_vectype, elsvals))
1779 : ok = true;
1780 3245 : else if (internal_strided_fn_supported_p (strided_ifn, *pun_vectype,
1781 : elsvals))
1782 : {
1783 : /* Use gather/scatter IFNs, vect_get_strided_load_store_ops
1784 : will switch back to the strided variants. */
1785 0 : ifn = DR_IS_READ (dr) ? IFN_MASK_LEN_GATHER_LOAD :
1786 : IFN_MASK_LEN_SCATTER_STORE;
1787 0 : ok = true;
1788 : }
1789 :
1790 0 : if (ok)
1791 : {
1792 0 : info->ifn = ifn;
1793 0 : info->decl = NULL_TREE;
1794 0 : info->base = dr->ref;
1795 0 : info->alias_ptr = build_int_cst
1796 0 : (reference_alias_ptr_type (DR_REF (dr)),
1797 0 : get_object_alignment (DR_REF (dr)));
1798 0 : info->element_type = TREE_TYPE (*pun_vectype);
1799 0 : info->offset_vectype = offset_vectype;
1800 : /* No need to set the offset, vect_get_strided_load_store_ops
1801 : will do that. */
1802 0 : info->scale = 1;
1803 0 : info->memory_type = TREE_TYPE (DR_REF (dr));
1804 0 : return true;
1805 : }
1806 :
1807 : return false;
1808 : }
1809 :
1810 :
1811 : /* Return true if we can use gather/scatter internal functions to
1812 : vectorize STMT_INFO, which is a grouped or strided load or store.
1813 : MASKED_P is true if load or store is conditional. When returning
1814 : true, fill in GS_INFO with the information required to perform the
1815 : operation.
1816 :
1817 : If we can use gather/scatter and ELSVALS is nonzero the supported
1818 : else values will be stored in the vector ELSVALS points to. */
1819 :
1820 : static bool
1821 63792 : vect_use_strided_gather_scatters_p (stmt_vec_info stmt_info, tree vectype,
1822 : loop_vec_info loop_vinfo, bool masked_p,
1823 : gather_scatter_info *gs_info,
1824 : vec<int> *elsvals,
1825 : unsigned int group_size,
1826 : bool single_element_p)
1827 : {
1828 63792 : if (!vect_check_gather_scatter (stmt_info, vectype,
1829 : loop_vinfo, gs_info, elsvals)
1830 63792 : || gs_info->ifn == IFN_LAST)
1831 : {
1832 63792 : if (!vect_truncate_gather_scatter_offset (stmt_info, vectype, loop_vinfo,
1833 : masked_p, gs_info, elsvals))
1834 : return false;
1835 : }
1836 :
1837 0 : if (!single_element_p
1838 0 : && !targetm.vectorize.prefer_gather_scatter (TYPE_MODE (vectype),
1839 : gs_info->scale,
1840 : group_size))
1841 : return false;
1842 :
1843 0 : if (dump_enabled_p ())
1844 0 : dump_printf_loc (MSG_NOTE, vect_location,
1845 : "using gather/scatter for strided/grouped access,"
1846 : " scale = %d\n", gs_info->scale);
1847 :
1848 : return true;
1849 : }
1850 :
1851 : /* STMT_INFO is a non-strided load or store, meaning that it accesses
1852 : elements with a known constant step. Return -1 if that step
1853 : is negative, 0 if it is zero, and 1 if it is greater than zero. */
1854 :
1855 : int
1856 1471104 : compare_step_with_zero (vec_info *vinfo, stmt_vec_info stmt_info)
1857 : {
1858 1471104 : dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
1859 1471104 : return tree_int_cst_compare (vect_dr_behavior (vinfo, dr_info)->step,
1860 1471104 : size_zero_node);
1861 : }
1862 :
1863 : /* If the target supports a permute mask that reverses the elements in
1864 : a vector of type VECTYPE, return that mask, otherwise return null. */
1865 :
1866 : tree
1867 9164 : perm_mask_for_reverse (tree vectype)
1868 : {
1869 9164 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
1870 :
1871 : /* The encoding has a single stepped pattern. */
1872 9164 : vec_perm_builder sel (nunits, 1, 3);
1873 36656 : for (int i = 0; i < 3; ++i)
1874 27492 : sel.quick_push (nunits - 1 - i);
1875 :
1876 9164 : vec_perm_indices indices (sel, 1, nunits);
1877 9164 : if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
1878 : indices))
1879 : return NULL_TREE;
1880 8012 : return vect_gen_perm_mask_checked (vectype, indices);
1881 9164 : }
1882 :
1883 : /* A subroutine of get_load_store_type, with a subset of the same
1884 : arguments. Handle the case where STMT_INFO is a load or store that
1885 : accesses consecutive elements with a negative step. Sets *POFFSET
1886 : to the offset to be applied to the DR for the first access. */
1887 :
1888 : static vect_memory_access_type
1889 12150 : get_negative_load_store_type (vec_info *vinfo,
1890 : stmt_vec_info stmt_info, tree vectype,
1891 : vec_load_store_type vls_type,
1892 : unsigned int ncopies, poly_int64 *poffset)
1893 : {
1894 12150 : dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
1895 12150 : dr_alignment_support alignment_support_scheme;
1896 :
1897 12150 : if (ncopies > 1)
1898 : {
1899 0 : if (dump_enabled_p ())
1900 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1901 : "multiple types with negative step.\n");
1902 0 : return VMAT_ELEMENTWISE;
1903 : }
1904 :
1905 : /* For backward running DRs the first access in vectype actually is
1906 : N-1 elements before the address of the DR. */
1907 12150 : *poffset = ((-TYPE_VECTOR_SUBPARTS (vectype) + 1)
1908 12150 : * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1909 :
1910 12150 : int misalignment = dr_misalignment (dr_info, vectype, *poffset);
1911 12150 : alignment_support_scheme
1912 12150 : = vect_supportable_dr_alignment (vinfo, dr_info, vectype, misalignment);
1913 12150 : if (alignment_support_scheme != dr_aligned
1914 12150 : && alignment_support_scheme != dr_unaligned_supported)
1915 : {
1916 4374 : if (dump_enabled_p ())
1917 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1918 : "negative step but alignment required.\n");
1919 4374 : *poffset = 0;
1920 4374 : return VMAT_ELEMENTWISE;
1921 : }
1922 :
1923 7776 : if (vls_type == VLS_STORE_INVARIANT)
1924 : {
1925 1197 : if (dump_enabled_p ())
1926 21 : dump_printf_loc (MSG_NOTE, vect_location,
1927 : "negative step with invariant source;"
1928 : " no permute needed.\n");
1929 1197 : return VMAT_CONTIGUOUS_DOWN;
1930 : }
1931 :
1932 6579 : if (!perm_mask_for_reverse (vectype))
1933 : {
1934 1152 : if (dump_enabled_p ())
1935 52 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1936 : "negative step and reversing not supported.\n");
1937 1152 : *poffset = 0;
1938 1152 : return VMAT_ELEMENTWISE;
1939 : }
1940 :
1941 : return VMAT_CONTIGUOUS_REVERSE;
1942 : }
1943 :
1944 : /* STMT_INFO is either a masked or unconditional store. Return the value
1945 : being stored. */
1946 :
1947 : tree
1948 0 : vect_get_store_rhs (stmt_vec_info stmt_info)
1949 : {
1950 0 : if (gassign *assign = dyn_cast <gassign *> (stmt_info->stmt))
1951 : {
1952 0 : gcc_assert (gimple_assign_single_p (assign));
1953 0 : return gimple_assign_rhs1 (assign);
1954 : }
1955 0 : if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
1956 : {
1957 0 : internal_fn ifn = gimple_call_internal_fn (call);
1958 0 : int index = internal_fn_stored_value_index (ifn);
1959 0 : gcc_assert (index >= 0);
1960 0 : return gimple_call_arg (call, index);
1961 : }
1962 0 : gcc_unreachable ();
1963 : }
1964 :
1965 : /* Function VECTOR_VECTOR_COMPOSITION_TYPE
1966 :
1967 : This function returns a vector type which can be composed with NELTS pieces,
1968 : whose type is recorded in PTYPE. VTYPE should be a vector type, and has the
1969 : same vector size as the return vector. It checks target whether supports
1970 : pieces-size vector mode for construction firstly, if target fails to, check
1971 : pieces-size scalar mode for construction further. It returns NULL_TREE if
1972 : fails to find the available composition. If the caller only wants scalar
1973 : pieces where PTYPE e.g. is a possible gather/scatter element type
1974 : SCALAR_PTYPE_ONLY must be true.
1975 :
1976 : For example, for (vtype=V16QI, nelts=4), we can probably get:
1977 : - V16QI with PTYPE V4QI.
1978 : - V4SI with PTYPE SI.
1979 : - NULL_TREE. */
1980 :
1981 : static tree
1982 14203 : vector_vector_composition_type (tree vtype, poly_uint64 nelts, tree *ptype,
1983 : bool scalar_ptype_only)
1984 : {
1985 14203 : gcc_assert (VECTOR_TYPE_P (vtype));
1986 14203 : gcc_assert (known_gt (nelts, 0U));
1987 :
1988 14203 : machine_mode vmode = TYPE_MODE (vtype);
1989 14203 : if (!VECTOR_MODE_P (vmode))
1990 : return NULL_TREE;
1991 :
1992 : /* When we are asked to compose the vector from its components let
1993 : that happen directly. */
1994 14203 : if (known_eq (TYPE_VECTOR_SUBPARTS (vtype), nelts))
1995 : {
1996 5990 : *ptype = TREE_TYPE (vtype);
1997 5990 : return vtype;
1998 : }
1999 :
2000 16426 : poly_uint64 vbsize = GET_MODE_BITSIZE (vmode);
2001 8213 : unsigned int pbsize;
2002 8213 : if (constant_multiple_p (vbsize, nelts, &pbsize))
2003 : {
2004 : /* First check if vec_init optab supports construction from
2005 : vector pieces directly. */
2006 8213 : scalar_mode elmode = SCALAR_TYPE_MODE (TREE_TYPE (vtype));
2007 16426 : poly_uint64 inelts = pbsize / GET_MODE_BITSIZE (elmode);
2008 8213 : machine_mode rmode;
2009 8213 : if (!scalar_ptype_only
2010 4592 : && related_vector_mode (vmode, elmode, inelts).exists (&rmode)
2011 12343 : && (convert_optab_handler (vec_init_optab, vmode, rmode)
2012 : != CODE_FOR_nothing))
2013 : {
2014 3490 : *ptype = build_vector_type (TREE_TYPE (vtype), inelts);
2015 3490 : return vtype;
2016 : }
2017 :
2018 : /* Otherwise check if exists an integer type of the same piece size and
2019 : if vec_init optab supports construction from it directly. */
2020 4723 : if (int_mode_for_size (pbsize, 0).exists (&elmode)
2021 4723 : && related_vector_mode (vmode, elmode, nelts).exists (&rmode))
2022 : {
2023 4309 : if (scalar_ptype_only
2024 4309 : || convert_optab_handler (vec_init_optab, rmode, elmode)
2025 : != CODE_FOR_nothing)
2026 : {
2027 4309 : *ptype = build_nonstandard_integer_type (pbsize, 1);
2028 4309 : return build_vector_type (*ptype, nelts);
2029 : }
2030 : }
2031 : }
2032 :
2033 : return NULL_TREE;
2034 : }
2035 :
2036 : /* Check if the load permutation of NODE only refers to a consecutive
2037 : subset of the group indices where GROUP_SIZE is the size of the
2038 : dataref's group. We also assert that the length of the permutation
2039 : divides the group size and is a power of two.
2040 : Such load permutations can be elided in strided access schemes as
2041 : we can "jump over" the gap they leave. */
2042 :
2043 : bool
2044 45044 : has_consecutive_load_permutation (slp_tree node, unsigned group_size)
2045 : {
2046 45044 : load_permutation_t perm = SLP_TREE_LOAD_PERMUTATION (node);
2047 45044 : if (!perm.exists ()
2048 2164 : || perm.length () <= 1
2049 496 : || !pow2p_hwi (perm.length ())
2050 45524 : || group_size % perm.length ())
2051 : return false;
2052 :
2053 433 : return vect_load_perm_consecutive_p (node);
2054 : }
2055 :
2056 :
2057 : /* Analyze load or store SLP_NODE of type VLS_TYPE. Return true
2058 : if there is a memory access type that the vectorized form can use,
2059 : storing it in *MEMORY_ACCESS_TYPE if so. If we decide to use gathers
2060 : or scatters, fill in GS_INFO accordingly. In addition
2061 : *ALIGNMENT_SUPPORT_SCHEME is filled out and false is returned if
2062 : the target does not support the alignment scheme. *MISALIGNMENT
2063 : is set according to the alignment of the access (including
2064 : DR_MISALIGNMENT_UNKNOWN when it is unknown).
2065 :
2066 : MASKED_P is true if the statement is conditional on a vectorized mask.
2067 : VECTYPE is the vector type that the vectorized statements will use.
2068 :
2069 : If ELSVALS is nonzero the supported else values will be stored in the
2070 : vector ELSVALS points to. */
2071 :
2072 : static bool
2073 1356520 : get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
2074 : tree vectype, slp_tree slp_node,
2075 : bool masked_p, vec_load_store_type vls_type,
2076 : vect_load_store_data *ls)
2077 : {
2078 1356520 : vect_memory_access_type *memory_access_type = &ls->memory_access_type;
2079 1356520 : poly_int64 *poffset = &ls->poffset;
2080 1356520 : dr_alignment_support *alignment_support_scheme
2081 : = &ls->alignment_support_scheme;
2082 1356520 : int *misalignment = &ls->misalignment;
2083 1356520 : internal_fn *lanes_ifn = &ls->lanes_ifn;
2084 1356520 : vec<int> *elsvals = &ls->elsvals;
2085 1356520 : tree *ls_type = &ls->ls_type;
2086 1356520 : bool *slp_perm = &ls->slp_perm;
2087 1356520 : unsigned *n_perms = &ls->n_perms;
2088 1356520 : unsigned *n_loads = &ls->n_loads;
2089 1356520 : tree *supported_offset_vectype = &ls->supported_offset_vectype;
2090 1356520 : int *supported_scale = &ls->supported_scale;
2091 1356520 : loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
2092 1356520 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2093 1356520 : class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
2094 1356520 : stmt_vec_info first_stmt_info;
2095 1356520 : unsigned int group_size;
2096 1356520 : unsigned HOST_WIDE_INT gap;
2097 1356520 : bool single_element_p;
2098 1356520 : poly_int64 neg_ldst_offset = 0;
2099 :
2100 1356520 : *misalignment = DR_MISALIGNMENT_UNKNOWN;
2101 1356520 : *poffset = 0;
2102 1356520 : *ls_type = NULL_TREE;
2103 1356520 : *slp_perm = false;
2104 1356520 : *n_perms = -1U;
2105 1356520 : *n_loads = -1U;
2106 1356520 : ls->subchain_p = false;
2107 :
2108 1356520 : bool perm_ok = true;
2109 1356520 : poly_int64 vf = loop_vinfo ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
2110 :
2111 1356520 : if (SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
2112 71522 : perm_ok = vect_transform_slp_perm_load (vinfo, slp_node, vNULL, NULL,
2113 71522 : vf, true, n_perms, n_loads);
2114 :
2115 1356520 : if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2116 : {
2117 870031 : first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
2118 870031 : group_size = DR_GROUP_SIZE (first_stmt_info);
2119 870031 : gap = DR_GROUP_GAP (first_stmt_info);
2120 870031 : single_element_p = (stmt_info == first_stmt_info
2121 870031 : && !DR_GROUP_NEXT_ELEMENT (stmt_info));
2122 : }
2123 : else
2124 : {
2125 : first_stmt_info = stmt_info;
2126 : group_size = 1;
2127 : gap = 0;
2128 : single_element_p = true;
2129 : }
2130 1356520 : dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2131 :
2132 : /* True if the vectorized statements would access beyond the last
2133 : statement in the group. */
2134 1356520 : bool overrun_p = false;
2135 :
2136 : /* True if we can cope with such overrun by peeling for gaps, so that
2137 : there is at least one final scalar iteration after the vector loop. */
2138 2713040 : bool can_overrun_p = (!masked_p
2139 1356520 : && vls_type == VLS_LOAD
2140 538709 : && loop_vinfo
2141 1767877 : && !loop->inner);
2142 :
2143 : /* There can only be a gap at the end of the group if the stride is
2144 : known at compile time. */
2145 1356520 : gcc_assert (!STMT_VINFO_STRIDED_P (first_stmt_info) || gap == 0);
2146 :
2147 : /* For SLP vectorization we directly vectorize a subchain
2148 : without permutation. */
2149 1356520 : if (! SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
2150 1284998 : first_dr_info = STMT_VINFO_DR_INFO (SLP_TREE_SCALAR_STMTS (slp_node)[0]);
2151 :
2152 1356520 : if (STMT_VINFO_STRIDED_P (first_stmt_info))
2153 : {
2154 : /* Try to use consecutive accesses of as many elements as possible,
2155 : separated by the stride, until we have a complete vector.
2156 : Fall back to scalar accesses if that isn't possible. */
2157 45044 : *memory_access_type = VMAT_STRIDED_SLP;
2158 :
2159 : /* If the load permutation is consecutive we can reduce the group to
2160 : the elements the permutation accesses. Then we release the
2161 : permutation. */
2162 45044 : if (has_consecutive_load_permutation (slp_node, group_size))
2163 : {
2164 32 : ls->subchain_p = true;
2165 32 : group_size = SLP_TREE_LANES (slp_node);
2166 32 : SLP_TREE_LOAD_PERMUTATION (slp_node).release ();
2167 : }
2168 : }
2169 1311476 : else if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2170 : {
2171 10828 : slp_tree offset_node = SLP_TREE_CHILDREN (slp_node)[0];
2172 10828 : tree offset_vectype = SLP_TREE_VECTYPE (offset_node);
2173 10828 : int scale = SLP_TREE_GS_SCALE (slp_node);
2174 10828 : tree memory_type = TREE_TYPE (DR_REF (first_dr_info->dr));
2175 10828 : tree tem;
2176 10828 : if (vect_gather_scatter_fn_p (loop_vinfo, vls_type == VLS_LOAD,
2177 : masked_p, vectype, memory_type,
2178 : offset_vectype, scale, supported_scale,
2179 : &ls->gs.ifn, &tem,
2180 : supported_offset_vectype, elsvals))
2181 : {
2182 0 : if (dump_enabled_p ())
2183 : {
2184 0 : dump_printf_loc (MSG_NOTE, vect_location,
2185 : "gather/scatter with required "
2186 : "offset type "
2187 : "%T and offset scale %d.\n",
2188 : offset_vectype, scale);
2189 0 : if (*supported_offset_vectype)
2190 0 : dump_printf_loc (MSG_NOTE, vect_location,
2191 : " target supports offset type %T.\n",
2192 : *supported_offset_vectype);
2193 0 : if (*supported_scale)
2194 0 : dump_printf_loc (MSG_NOTE, vect_location,
2195 : " target supports offset scale %d.\n",
2196 : *supported_scale);
2197 : }
2198 0 : *memory_access_type = VMAT_GATHER_SCATTER_IFN;
2199 : }
2200 10828 : else if (vls_type == VLS_LOAD
2201 10828 : ? (targetm.vectorize.builtin_gather
2202 9235 : && (ls->gs.decl
2203 9235 : = targetm.vectorize.builtin_gather (vectype,
2204 9235 : TREE_TYPE
2205 : (offset_vectype),
2206 : scale)))
2207 1593 : : (targetm.vectorize.builtin_scatter
2208 1593 : && (ls->gs.decl
2209 1593 : = targetm.vectorize.builtin_scatter (vectype,
2210 1593 : TREE_TYPE
2211 : (offset_vectype),
2212 : scale))))
2213 574 : *memory_access_type = VMAT_GATHER_SCATTER_LEGACY;
2214 : else
2215 : {
2216 : /* GATHER_SCATTER_EMULATED_P. */
2217 10254 : if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ()
2218 10254 : || !TYPE_VECTOR_SUBPARTS (offset_vectype).is_constant ()
2219 10254 : || VECTOR_BOOLEAN_TYPE_P (offset_vectype)
2220 10254 : || !constant_multiple_p (TYPE_VECTOR_SUBPARTS (offset_vectype),
2221 10254 : TYPE_VECTOR_SUBPARTS (vectype)))
2222 : {
2223 2732 : if (dump_enabled_p ())
2224 466 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2225 : "unsupported vector types for emulated "
2226 : "gather.\n");
2227 2732 : return false;
2228 : }
2229 7522 : *memory_access_type = VMAT_GATHER_SCATTER_EMULATED;
2230 : }
2231 : }
2232 : else
2233 : {
2234 1300648 : int cmp = compare_step_with_zero (vinfo, stmt_info);
2235 1300648 : if (cmp < 0)
2236 : {
2237 12328 : if (single_element_p)
2238 : /* ??? The VMAT_CONTIGUOUS_REVERSE code generation is
2239 : only correct for single element "interleaving" SLP. */
2240 12150 : *memory_access_type = get_negative_load_store_type
2241 12150 : (vinfo, stmt_info, vectype, vls_type, 1,
2242 : &neg_ldst_offset);
2243 : else
2244 : /* We can fall back to VMAT_STRIDED_SLP since that does
2245 : not care whether the stride between the group instances
2246 : is positive or negative. */
2247 178 : *memory_access_type = VMAT_STRIDED_SLP;
2248 : }
2249 1288320 : else if (cmp == 0 && loop_vinfo)
2250 : {
2251 3351 : gcc_assert (vls_type == VLS_LOAD);
2252 3351 : *memory_access_type = VMAT_INVARIANT;
2253 : }
2254 : /* Try using LOAD/STORE_LANES. */
2255 1284969 : else if (slp_node->ldst_lanes
2256 1284969 : && (*lanes_ifn
2257 0 : = (vls_type == VLS_LOAD
2258 0 : ? vect_load_lanes_supported (vectype, group_size,
2259 : masked_p, elsvals)
2260 0 : : vect_store_lanes_supported (vectype, group_size,
2261 : masked_p))) != IFN_LAST)
2262 0 : *memory_access_type = VMAT_LOAD_STORE_LANES;
2263 1284969 : else if (!loop_vinfo && slp_node->avoid_stlf_fail)
2264 : {
2265 70 : *memory_access_type = VMAT_ELEMENTWISE;
2266 70 : if (dump_enabled_p ())
2267 2 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2268 : "using element-wise load to avoid disrupting "
2269 : "cross iteration store-to-load forwarding\n");
2270 : }
2271 : else
2272 1284899 : *memory_access_type = VMAT_CONTIGUOUS;
2273 :
2274 : /* If this is single-element interleaving with an element
2275 : distance that leaves unused vector loads around fall back
2276 : to elementwise access if possible - we otherwise least
2277 : create very sub-optimal code in that case (and
2278 : blow up memory, see PR65518). */
2279 1300648 : if (loop_vinfo
2280 1300648 : && single_element_p
2281 467076 : && (*memory_access_type == VMAT_CONTIGUOUS
2282 15501 : || *memory_access_type == VMAT_CONTIGUOUS_REVERSE)
2283 1767724 : && maybe_gt (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
2284 : {
2285 17826 : *memory_access_type = VMAT_ELEMENTWISE;
2286 17826 : if (dump_enabled_p ())
2287 198 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2288 : "single-element interleaving not supported "
2289 : "for not adjacent vector loads, using "
2290 : "elementwise access\n");
2291 : }
2292 :
2293 : /* Also fall back to elementwise access in case we did not lower a
2294 : permutation and cannot code generate it. */
2295 1300648 : if (loop_vinfo
2296 521510 : && *memory_access_type != VMAT_ELEMENTWISE
2297 498158 : && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
2298 1329066 : && !perm_ok)
2299 : {
2300 2055 : *memory_access_type = VMAT_ELEMENTWISE;
2301 2055 : if (dump_enabled_p ())
2302 246 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2303 : "permutation not supported, using elementwise "
2304 : "access\n");
2305 : }
2306 :
2307 521510 : overrun_p = (loop_vinfo && gap != 0
2308 1343507 : && *memory_access_type != VMAT_ELEMENTWISE);
2309 1300648 : if (overrun_p && vls_type != VLS_LOAD)
2310 : {
2311 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2312 : "Grouped store with gaps requires"
2313 : " non-consecutive accesses\n");
2314 9 : return false;
2315 : }
2316 :
2317 1300648 : unsigned HOST_WIDE_INT dr_size = vect_get_scalar_dr_size (first_dr_info);
2318 1300648 : poly_int64 off = 0;
2319 1300648 : if (*memory_access_type == VMAT_CONTIGUOUS_REVERSE)
2320 5268 : off = (TYPE_VECTOR_SUBPARTS (vectype) - 1) * -dr_size;
2321 :
2322 : /* An overrun is fine if the trailing elements are smaller
2323 : than the alignment boundary B. Every vector access will
2324 : be a multiple of B and so we are guaranteed to access a
2325 : non-gap element in the same B-sized block. */
2326 1300648 : if (overrun_p
2327 1300648 : && gap < (vect_known_alignment_in_bytes (first_dr_info,
2328 22893 : vectype, off) / dr_size))
2329 : overrun_p = false;
2330 :
2331 : /* When we have a contiguous access across loop iterations
2332 : but the access in the loop doesn't cover the full vector
2333 : we can end up with no gap recorded but still excess
2334 : elements accessed, see PR103116. Make sure we peel for
2335 : gaps if necessary and sufficient and give up if not.
2336 :
2337 : If there is a combination of the access not covering the full
2338 : vector and a gap recorded then we may need to peel twice. */
2339 1300648 : bool large_vector_overrun_p = false;
2340 1300648 : if (loop_vinfo
2341 521510 : && (*memory_access_type == VMAT_CONTIGUOUS
2342 35396 : || *memory_access_type == VMAT_CONTIGUOUS_REVERSE)
2343 491382 : && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
2344 1326634 : && !multiple_p (group_size * LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2345 : nunits))
2346 : large_vector_overrun_p = overrun_p = true;
2347 :
2348 : /* If the gap splits the vector in half and the target
2349 : can do half-vector operations avoid the epilogue peeling
2350 : by simply loading half of the vector only. Usually
2351 : the construction with an upper zero half will be elided. */
2352 1300648 : dr_alignment_support alss;
2353 1300648 : int misalign = dr_misalignment (first_dr_info, vectype, off);
2354 1300648 : tree half_vtype;
2355 1300648 : poly_uint64 remain;
2356 1300648 : unsigned HOST_WIDE_INT tem, num;
2357 1300648 : if (overrun_p
2358 1300648 : && !masked_p
2359 17422 : && *memory_access_type != VMAT_LOAD_STORE_LANES
2360 17422 : && (((alss = vect_supportable_dr_alignment (vinfo, first_dr_info,
2361 : vectype, misalign)))
2362 : == dr_aligned
2363 14946 : || alss == dr_unaligned_supported)
2364 9838 : && can_div_trunc_p (group_size
2365 9838 : * LOOP_VINFO_VECT_FACTOR (loop_vinfo) - gap,
2366 : nunits, &tem, &remain)
2367 1310486 : && (known_eq (remain, 0u)
2368 7383 : || (known_ne (remain, 0u)
2369 5726 : && constant_multiple_p (nunits, remain, &num)
2370 1298193 : && (vector_vector_composition_type (vectype, num, &half_vtype)
2371 : != NULL_TREE))))
2372 8181 : overrun_p = false;
2373 :
2374 1300648 : if (overrun_p && !can_overrun_p)
2375 : {
2376 6 : if (dump_enabled_p ())
2377 6 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2378 : "Peeling for outer loop is not supported\n");
2379 6 : return false;
2380 : }
2381 :
2382 : /* Peeling for gaps assumes that a single scalar iteration
2383 : is enough to make sure the last vector iteration doesn't
2384 : access excess elements. */
2385 1300642 : if (overrun_p
2386 1300642 : && (!can_div_trunc_p (group_size
2387 9235 : * LOOP_VINFO_VECT_FACTOR (loop_vinfo) - gap,
2388 : nunits, &tem, &remain)
2389 9235 : || maybe_lt (remain + group_size, nunits)))
2390 : {
2391 : /* But peeling a single scalar iteration is enough if
2392 : we can use the next power-of-two sized partial
2393 : access and that is sufficiently small to be covered
2394 : by the single scalar iteration. */
2395 16 : unsigned HOST_WIDE_INT cnunits, cvf, cremain, cpart_size;
2396 16 : if (masked_p
2397 16 : || !nunits.is_constant (&cnunits)
2398 16 : || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&cvf)
2399 16 : || (((cremain = (group_size * cvf - gap) % cnunits), true)
2400 16 : && ((cpart_size = (1 << ceil_log2 (cremain))), true)
2401 16 : && (cremain + group_size < cpart_size
2402 13 : || (vector_vector_composition_type (vectype,
2403 13 : cnunits / cpart_size,
2404 : &half_vtype)
2405 : == NULL_TREE))))
2406 : {
2407 : /* If all fails we can still resort to niter masking unless
2408 : the vectors used are too big, so enforce the use of
2409 : partial vectors. */
2410 3 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2411 3 : && !large_vector_overrun_p)
2412 : {
2413 0 : if (dump_enabled_p ())
2414 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2415 : "peeling for gaps insufficient for "
2416 : "access unless using partial "
2417 : "vectors\n");
2418 0 : LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo) = true;
2419 : }
2420 : else
2421 : {
2422 3 : if (dump_enabled_p ())
2423 3 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2424 : "peeling for gaps insufficient for "
2425 : "access\n");
2426 3 : return false;
2427 : }
2428 : }
2429 13 : else if (large_vector_overrun_p)
2430 : {
2431 13 : if (dump_enabled_p ())
2432 12 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2433 : "can't operate on partial vectors because "
2434 : "only unmasked loads handle access "
2435 : "shortening required because of gaps at "
2436 : "the end of the access\n");
2437 13 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2438 : }
2439 : }
2440 : }
2441 :
2442 : /* As a last resort, trying using a gather load or scatter store.
2443 :
2444 : ??? Although the code can handle all group sizes correctly,
2445 : it probably isn't a win to use separate strided accesses based
2446 : on nearby locations. Or, even if it's a win over scalar code,
2447 : it might not be a win over vectorizing at a lower VF, if that
2448 : allows us to use contiguous accesses. */
2449 1353779 : vect_memory_access_type grouped_gather_fallback = VMAT_UNINITIALIZED;
2450 1353779 : if (loop_vinfo
2451 574641 : && (*memory_access_type == VMAT_ELEMENTWISE
2452 574641 : || *memory_access_type == VMAT_STRIDED_SLP))
2453 : {
2454 70624 : gather_scatter_info gs_info;
2455 70624 : tree tem;
2456 70624 : if (SLP_TREE_LANES (slp_node) == 1
2457 65820 : && (!SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
2458 21600 : || single_element_p)
2459 134416 : && vect_use_strided_gather_scatters_p (stmt_info, vectype, loop_vinfo,
2460 : masked_p, &gs_info, elsvals,
2461 : group_size, single_element_p))
2462 : {
2463 : /* vect_use_strided_gather_scatters_p does not save the actually
2464 : supported scale and offset type so do that here.
2465 : We need it later in check_load_store_for_partial_vectors
2466 : where we only check if the given internal function is supported
2467 : (to choose whether to use the IFN, LEGACY, or EMULATED flavor
2468 : of gather/scatter) and don't re-do the full analysis. */
2469 0 : tree tmp;
2470 0 : gcc_assert (vect_gather_scatter_fn_p
2471 : (loop_vinfo, vls_type == VLS_LOAD, masked_p, vectype,
2472 : gs_info.memory_type, TREE_TYPE (gs_info.offset),
2473 : gs_info.scale, supported_scale, &gs_info.ifn,
2474 : &tmp, supported_offset_vectype, elsvals));
2475 :
2476 0 : SLP_TREE_GS_SCALE (slp_node) = gs_info.scale;
2477 0 : SLP_TREE_GS_BASE (slp_node) = error_mark_node;
2478 0 : ls->gs.ifn = gs_info.ifn;
2479 0 : ls->strided_offset_vectype = gs_info.offset_vectype;
2480 0 : *memory_access_type = VMAT_GATHER_SCATTER_IFN;
2481 : }
2482 70624 : else if (SLP_TREE_LANES (slp_node) > 1
2483 : && !masked_p
2484 4804 : && !single_element_p
2485 75235 : && vect_use_grouped_gather (STMT_VINFO_DR_INFO (stmt_info),
2486 : vectype, loop_vinfo,
2487 : masked_p, group_size,
2488 : &gs_info, elsvals, &tem))
2489 : {
2490 0 : SLP_TREE_GS_SCALE (slp_node) = gs_info.scale;
2491 0 : SLP_TREE_GS_BASE (slp_node) = error_mark_node;
2492 0 : grouped_gather_fallback = *memory_access_type;
2493 0 : *memory_access_type = VMAT_GATHER_SCATTER_IFN;
2494 0 : ls->gs.ifn = gs_info.ifn;
2495 0 : vectype = *ls_type = tem;
2496 0 : ls->strided_offset_vectype = gs_info.offset_vectype;
2497 : }
2498 : }
2499 :
2500 1353779 : if (*memory_access_type == VMAT_CONTIGUOUS_DOWN
2501 1353779 : || *memory_access_type == VMAT_CONTIGUOUS_REVERSE)
2502 6461 : *poffset = neg_ldst_offset;
2503 :
2504 1353779 : if (*memory_access_type == VMAT_ELEMENTWISE
2505 1328302 : || *memory_access_type == VMAT_GATHER_SCATTER_LEGACY
2506 1327728 : || *memory_access_type == VMAT_STRIDED_SLP
2507 1282511 : || *memory_access_type == VMAT_INVARIANT)
2508 : {
2509 74619 : *alignment_support_scheme = dr_unaligned_supported;
2510 74619 : *misalignment = DR_MISALIGNMENT_UNKNOWN;
2511 : }
2512 : else
2513 : {
2514 1279160 : if (mat_gather_scatter_p (*memory_access_type)
2515 : && !first_dr_info)
2516 : *misalignment = DR_MISALIGNMENT_UNKNOWN;
2517 : else
2518 1279160 : *misalignment = dr_misalignment (first_dr_info, vectype, *poffset);
2519 1279160 : *alignment_support_scheme
2520 1279160 : = vect_supportable_dr_alignment
2521 1279160 : (vinfo, first_dr_info, vectype, *misalignment,
2522 1279160 : mat_gather_scatter_p (*memory_access_type));
2523 1279160 : if (grouped_gather_fallback != VMAT_UNINITIALIZED
2524 0 : && *alignment_support_scheme != dr_aligned
2525 0 : && *alignment_support_scheme != dr_unaligned_supported)
2526 : {
2527 : /* No supportable alignment for a grouped gather, fall back to the
2528 : original memory access type. Even though VMAT_STRIDED_SLP might
2529 : also try aligned vector loads it can still choose vector
2530 : construction from scalars. */
2531 0 : *memory_access_type = grouped_gather_fallback;
2532 0 : *alignment_support_scheme = dr_unaligned_supported;
2533 0 : *misalignment = DR_MISALIGNMENT_UNKNOWN;
2534 : }
2535 : }
2536 :
2537 1353779 : if (overrun_p)
2538 : {
2539 9232 : gcc_assert (can_overrun_p);
2540 9232 : if (dump_enabled_p ())
2541 511 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2542 : "Data access with gaps requires scalar "
2543 : "epilogue loop\n");
2544 9232 : LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2545 : }
2546 :
2547 1353779 : if ((*memory_access_type == VMAT_ELEMENTWISE
2548 1353779 : || *memory_access_type == VMAT_STRIDED_SLP)
2549 : && !nunits.is_constant ())
2550 : {
2551 : if (dump_enabled_p ())
2552 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2553 : "Not using elementwise accesses due to variable "
2554 : "vectorization factor.\n");
2555 : return false;
2556 : }
2557 :
2558 : /* Checks if all scalar iterations are known to be inbounds. */
2559 1353779 : bool inbounds = DR_SCALAR_KNOWN_BOUNDS (STMT_VINFO_DR_INFO (stmt_info));
2560 :
2561 : /* Check if we support the operation if early breaks are needed. Here we
2562 : must ensure that we don't access any more than the scalar code would
2563 : have. A masked operation would ensure this, so for these load types
2564 : force masking. */
2565 1353779 : if (loop_vinfo
2566 574641 : && dr_safe_speculative_read_required (stmt_info)
2567 1532874 : && LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
2568 : {
2569 179095 : if (mat_gather_scatter_p (*memory_access_type)
2570 179095 : || *memory_access_type == VMAT_STRIDED_SLP)
2571 : {
2572 9324 : if (dump_enabled_p ())
2573 8 : dump_printf_loc (MSG_NOTE, vect_location,
2574 : "early break not supported: cannot peel for "
2575 : "alignment. With non-contiguous memory vectorization"
2576 : " could read out of bounds at %G ",
2577 : STMT_VINFO_STMT (stmt_info));
2578 9324 : if (inbounds)
2579 0 : LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo) = true;
2580 : else
2581 : return false;
2582 : }
2583 : /* Block-level alignment: Even though individual accesses of
2584 : VMAT_ELEMENTWISE type do not cause alignment problems, loading the
2585 : whole vector's worth of values in a speculative early-break context
2586 : might cross a page boundary. Set the alignment scheme to `dr_aligned'
2587 : here in order to force checking of whether such accesses meet
2588 : alignment criteria. */
2589 169771 : else if (*memory_access_type == VMAT_ELEMENTWISE && !inbounds)
2590 14873 : *alignment_support_scheme = dr_aligned;
2591 : }
2592 :
2593 : /* If this DR needs alignment for correctness, we must ensure the target
2594 : alignment is a constant power-of-two multiple of the amount read per
2595 : vector iteration or force masking. */
2596 1344455 : if (dr_safe_speculative_read_required (stmt_info)
2597 1344455 : && (*alignment_support_scheme == dr_aligned
2598 102678 : && !mat_gather_scatter_p (*memory_access_type)))
2599 : {
2600 : /* We can only peel for loops, of course. */
2601 102678 : gcc_checking_assert (loop_vinfo);
2602 :
2603 102678 : poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2604 102678 : poly_uint64 read_amount
2605 102678 : = vf * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
2606 102678 : if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2607 102678 : read_amount *= group_size;
2608 :
2609 102678 : auto target_alignment
2610 102678 : = DR_TARGET_ALIGNMENT (STMT_VINFO_DR_INFO (stmt_info));
2611 102678 : if (!multiple_p (target_alignment, read_amount))
2612 : {
2613 12708 : if (dump_enabled_p ())
2614 : {
2615 28 : dump_printf_loc (MSG_NOTE, vect_location,
2616 : "desired alignment not met, target was ");
2617 28 : dump_dec (MSG_NOTE, target_alignment);
2618 28 : dump_printf (MSG_NOTE, " previously, but read amount is ");
2619 28 : dump_dec (MSG_NOTE, read_amount);
2620 28 : dump_printf (MSG_NOTE, " at %G.\n", STMT_VINFO_STMT (stmt_info));
2621 : }
2622 14897 : return false;
2623 : }
2624 :
2625 : /* When using a group access the first element may be aligned but the
2626 : subsequent loads may not be. For LOAD_LANES since the loads are based
2627 : on the first DR then all loads in the group are aligned. For
2628 : non-LOAD_LANES this is not the case. In particular a load + blend when
2629 : there are gaps can have the non first loads issued unaligned, even
2630 : partially overlapping the memory of the first load in order to simplify
2631 : the blend. This is what the x86_64 backend does for instance. As
2632 : such only the first load in the group is aligned, the rest are not.
2633 : Because of this the permutes may break the alignment requirements that
2634 : have been set, and as such we should for now, reject them. */
2635 89970 : if (SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
2636 : {
2637 2189 : if (dump_enabled_p ())
2638 75 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2639 : "loads with load permutations not supported for "
2640 : "speculative early break loads for %G",
2641 : STMT_VINFO_STMT (stmt_info));
2642 2189 : return false;
2643 : }
2644 :
2645 : /* Reject vectorization if we know the read amount per vector iteration
2646 : exceeds the min page size. */
2647 87781 : if (known_gt (read_amount, (unsigned) param_min_pagesize))
2648 : {
2649 0 : if (dump_enabled_p ())
2650 : {
2651 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2652 : "alignment required for correctness (");
2653 0 : dump_dec (MSG_MISSED_OPTIMIZATION, read_amount);
2654 0 : dump_printf (MSG_NOTE, ") may exceed page size.\n");
2655 : }
2656 0 : return false;
2657 : }
2658 :
2659 87781 : if (!vf.is_constant ())
2660 : {
2661 : /* For VLA modes, we need a runtime check to ensure any speculative
2662 : read amount does not exceed the page size. Here we record the max
2663 : possible read amount for the check. */
2664 : if (maybe_gt (read_amount,
2665 : LOOP_VINFO_MAX_SPEC_READ_AMOUNT (loop_vinfo)))
2666 : LOOP_VINFO_MAX_SPEC_READ_AMOUNT (loop_vinfo) = read_amount;
2667 :
2668 : /* For VLA modes, we must use partial vectors. */
2669 : LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo) = true;
2670 : }
2671 : }
2672 :
2673 1329558 : if (*alignment_support_scheme == dr_unaligned_unsupported)
2674 : {
2675 63763 : if (dump_enabled_p ())
2676 256 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2677 : "unsupported unaligned access\n");
2678 63763 : return false;
2679 : }
2680 :
2681 : /* FIXME: At the moment the cost model seems to underestimate the
2682 : cost of using elementwise accesses. This check preserves the
2683 : traditional behavior until that can be fixed. */
2684 1265795 : if (*memory_access_type == VMAT_ELEMENTWISE
2685 14744 : && !STMT_VINFO_STRIDED_P (first_stmt_info)
2686 1280539 : && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
2687 9655 : && single_element_p
2688 9012 : && !pow2p_hwi (group_size)))
2689 : {
2690 9068 : if (dump_enabled_p ())
2691 362 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2692 : "not falling back to elementwise accesses\n");
2693 9068 : return false;
2694 : }
2695 :
2696 : /* For BB vectorization build up the vector from existing scalar defs. */
2697 1256727 : if (!loop_vinfo && *memory_access_type == VMAT_ELEMENTWISE)
2698 : return false;
2699 :
2700 : /* Some loads need to explicitly permute the loaded data if there
2701 : is a load permutation. Among those are:
2702 : - VMAT_ELEMENTWISE.
2703 : - VMAT_STRIDED_SLP.
2704 : - VMAT_GATHER_SCATTER:
2705 : - Strided gather (fallback for VMAT_STRIDED_SLP if #lanes == 1).
2706 : - Grouped strided gather (ditto but for #lanes > 1).
2707 :
2708 : For VMAT_ELEMENTWISE we can fold the load permutation into the
2709 : individual indices we access directly, eliding the permutation.
2710 : Strided gather only allows load permutations for the
2711 : single-element case. */
2712 :
2713 1256727 : if (SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
2714 1256727 : && !(*memory_access_type == VMAT_ELEMENTWISE
2715 44604 : || (mat_gather_scatter_p (*memory_access_type)
2716 0 : && SLP_TREE_LANES (slp_node) == 1
2717 0 : && single_element_p)))
2718 : {
2719 44604 : if (!loop_vinfo)
2720 : {
2721 : /* In BB vectorization we may not actually use a loaded vector
2722 : accessing elements in excess of DR_GROUP_SIZE. */
2723 22792 : stmt_vec_info group_info = SLP_TREE_SCALAR_STMTS (slp_node)[0];
2724 22792 : group_info = DR_GROUP_FIRST_ELEMENT (group_info);
2725 22792 : unsigned HOST_WIDE_INT nunits;
2726 22792 : unsigned j, k, maxk = 0;
2727 81736 : FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (slp_node), j, k)
2728 58944 : if (k > maxk)
2729 : maxk = k;
2730 22792 : tree vectype = SLP_TREE_VECTYPE (slp_node);
2731 41308 : if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
2732 22792 : || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1)))
2733 : {
2734 4276 : if (dump_enabled_p ())
2735 29 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2736 : "BB vectorization with gaps at the end of "
2737 : "a load is not supported\n");
2738 4276 : return false;
2739 : }
2740 : }
2741 :
2742 40328 : if (!perm_ok)
2743 : {
2744 1988 : if (dump_enabled_p ())
2745 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION,
2746 : vect_location,
2747 : "unsupported load permutation\n");
2748 1988 : return false;
2749 : }
2750 :
2751 38340 : *slp_perm = true;
2752 : }
2753 :
2754 : return true;
2755 : }
2756 :
2757 : /* Return true if the operand at index MASK_INDEX of SLP_NODE (as resolved by vect_is_simple_use)
2758 : is suitable as the mask for vectorizing the conditional operation SLP_NODE. When returning true, store
2759 : the mask's SLP node in *MASK_NODE, the type of its definition in *MASK_DT_OUT and the type of
2760 : the vectorized mask in *MASK_VECTYPE_OUT; the outputs are left untouched when returning false. */
2761 :
2762 : static bool
2763 12689 : vect_check_scalar_mask (vec_info *vinfo,
2764 : slp_tree slp_node, unsigned mask_index,
2765 : slp_tree *mask_node,
2766 : vect_def_type *mask_dt_out, tree *mask_vectype_out)
2767 : {
2768 12689 : enum vect_def_type mask_dt;
2769 12689 : tree mask_vectype;
2770 12689 : slp_tree mask_node_1;
2771 12689 : tree mask_;
2772 12689 : if (!vect_is_simple_use (vinfo, slp_node, mask_index,
2773 : &mask_, &mask_node_1, &mask_dt, &mask_vectype))
2774 : {
2775 0 : if (dump_enabled_p ())
2776 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2777 : "mask use not simple.\n");
2778 0 : return false;
2779 : }
2780 :
2781 12689 : if ((mask_dt == vect_constant_def || mask_dt == vect_external_def)
2782 12689 : && !VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (mask_))) /* Constant/external masks must be scalar booleans. */
2783 : {
2784 0 : if (dump_enabled_p ())
2785 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2786 : "mask argument is not a boolean.\n");
2787 0 : return false;
2788 : }
2789 :
2790 12689 : tree vectype = SLP_TREE_VECTYPE (slp_node);
2791 12689 : if (!mask_vectype) /* No mask vector type was supplied: derive one from the data element type. */
2792 19 : mask_vectype = get_mask_type_for_scalar_type (vinfo, TREE_TYPE (vectype),
2793 : mask_node_1);
2794 :
2795 12689 : if (!mask_vectype || !VECTOR_BOOLEAN_TYPE_P (mask_vectype))
2796 : {
2797 0 : if (dump_enabled_p ())
2798 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2799 : "could not find an appropriate vector mask type.\n");
2800 0 : return false;
2801 : }
2802 :
2803 12689 : if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_vectype), /* Mask and data vectors must have the same element count. */
2804 25378 : TYPE_VECTOR_SUBPARTS (vectype)))
2805 : {
2806 0 : if (dump_enabled_p ())
2807 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2808 : "vector mask type %T"
2809 : " does not match vector data type %T.\n",
2810 : mask_vectype, vectype);
2811 :
2812 0 : return false;
2813 : }
2814 :
2815 12689 : *mask_dt_out = mask_dt;
2816 12689 : *mask_vectype_out = mask_vectype;
2817 12689 : *mask_node = mask_node_1;
2818 12689 : return true;
2819 : }
2820 :
2821 :
2822 : /* Return true if stored value is suitable for vectorizing store
2823 : statement STMT_INFO using SLP node SLP_NODE. When returning true, store the SLP node of the stored
2824 : value in *RHS_NODE, the type of the definition in *RHS_DT_OUT,
2825 : the type of the vectorized store value in *RHS_VECTYPE_OUT and the type of the store in *VLS_TYPE_OUT
2826 : (VLS_STORE_INVARIANT for constant or external definitions, VLS_STORE otherwise). *RHS_NODE is handed straight to vect_is_simple_use, so it may be written even when returning false. */
2827 :
2828 : static bool
2829 1361097 : vect_check_store_rhs (vec_info *vinfo, stmt_vec_info stmt_info,
2830 : slp_tree slp_node, slp_tree *rhs_node,
2831 : vect_def_type *rhs_dt_out, tree *rhs_vectype_out,
2832 : vec_load_store_type *vls_type_out)
2833 : {
2834 1361097 : int op_no = 0; /* Operand index of the stored value; stays 0 unless the statement is an internal store function call. */
2835 1361097 : if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
2836 : {
2837 1899 : if (gimple_call_internal_p (call)
2838 1899 : && internal_store_fn_p (gimple_call_internal_fn (call)))
2839 1899 : op_no = internal_fn_stored_value_index (gimple_call_internal_fn (call));
2840 : }
2841 1361097 : op_no = vect_slp_child_index_for_operand (stmt_info, op_no);
2842 :
2843 1361097 : enum vect_def_type rhs_dt;
2844 1361097 : tree rhs_vectype;
2845 1361097 : tree rhs;
2846 1361097 : if (!vect_is_simple_use (vinfo, slp_node, op_no,
2847 : &rhs, rhs_node, &rhs_dt, &rhs_vectype))
2848 : {
2849 0 : if (dump_enabled_p ())
2850 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2851 : "use not simple.\n");
2852 0 : return false;
2853 : }
2854 :
2855 : /* In the case this is a store from a constant make sure
2856 : native_encode_expr can handle it. */
2857 1361097 : if (rhs_dt == vect_constant_def
2858 1361097 : && CONSTANT_CLASS_P (rhs) && native_encode_expr (rhs, NULL, 64) == 0)
2859 : {
2860 0 : if (dump_enabled_p ())
2861 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2862 : "cannot encode constant as a byte sequence.\n");
2863 0 : return false;
2864 : }
2865 :
2866 1361097 : tree vectype = SLP_TREE_VECTYPE (slp_node);
2867 1361097 : if (rhs_vectype && !useless_type_conversion_p (vectype, rhs_vectype)) /* A known rhs vector type must be compatible with the store's vector type. */
2868 : {
2869 24 : if (dump_enabled_p ())
2870 24 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2871 : "incompatible vector types.\n");
2872 24 : return false;
2873 : }
2874 :
2875 1361073 : *rhs_dt_out = rhs_dt;
2876 1361073 : *rhs_vectype_out = rhs_vectype;
2877 1361073 : if (rhs_dt == vect_constant_def || rhs_dt == vect_external_def)
2878 1002757 : *vls_type_out = VLS_STORE_INVARIANT;
2879 : else
2880 358316 : *vls_type_out = VLS_STORE;
2881 : return true;
2882 : }
2883 :
2884 : /* Build an all-ones mask of type MASKTYPE while vectorizing STMT_INFO. A scalar integer MASKTYPE yields the constant -1;
2885 : a vector MASKTYPE yields a splat passed through vect_init_vector. Note that we support masks with floating-point
2886 : element type, in which case the floats are interpreted as a bitmask. Any other MASKTYPE is unreachable. */
2887 :
2888 : static tree
2889 170 : vect_build_all_ones_mask (vec_info *vinfo,
2890 : stmt_vec_info stmt_info, tree masktype)
2891 : {
2892 170 : if (TREE_CODE (masktype) == INTEGER_TYPE)
2893 98 : return build_int_cst (masktype, -1);
2894 72 : else if (VECTOR_BOOLEAN_TYPE_P (masktype)
2895 144 : || TREE_CODE (TREE_TYPE (masktype)) == INTEGER_TYPE)
2896 : {
2897 19 : tree mask = build_int_cst (TREE_TYPE (masktype), -1);
2898 19 : mask = build_vector_from_val (masktype, mask);
2899 19 : return vect_init_vector (vinfo, stmt_info, mask, masktype, NULL);
2900 : }
2901 53 : else if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (masktype)))
2902 : {
2903 : REAL_VALUE_TYPE r;
2904 : long tmp[6];
2905 371 : for (int j = 0; j < 6; ++j)
2906 318 : tmp[j] = -1; /* All-ones words handed to real_from_target below. */
2907 53 : real_from_target (&r, tmp, TYPE_MODE (TREE_TYPE (masktype)));
2908 53 : tree mask = build_real (TREE_TYPE (masktype), r);
2909 53 : mask = build_vector_from_val (masktype, mask);
2910 53 : return vect_init_vector (vinfo, stmt_info, mask, masktype, NULL);
2911 : }
2912 0 : gcc_unreachable ();
2913 : }
2914 :
2915 : /* Build an all-zero merge value of type VECTYPE while vectorizing STMT_INFO as a gather load. Only integer and
2916 : scalar-float element types are handled (anything else is unreachable); the zero splat is passed to vect_init_vector and its result returned. */
2917 :
2918 : static tree
2919 158 : vect_build_zero_merge_argument (vec_info *vinfo,
2920 : stmt_vec_info stmt_info, tree vectype)
2921 : {
2922 158 : tree merge;
2923 158 : if (TREE_CODE (TREE_TYPE (vectype)) == INTEGER_TYPE)
2924 49 : merge = build_int_cst (TREE_TYPE (vectype), 0);
2925 109 : else if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (vectype)))
2926 : {
2927 : REAL_VALUE_TYPE r;
2928 : long tmp[6];
2929 763 : for (int j = 0; j < 6; ++j)
2930 654 : tmp[j] = 0; /* All-zero words handed to real_from_target below. */
2931 109 : real_from_target (&r, tmp, TYPE_MODE (TREE_TYPE (vectype)));
2932 109 : merge = build_real (TREE_TYPE (vectype), r);
2933 : }
2934 : else
2935 0 : gcc_unreachable ();
2936 158 : merge = build_vector_from_val (vectype, merge);
2937 158 : return vect_init_vector (vinfo, stmt_info, merge, vectype, NULL);
2938 : }
2939 :
2940 : /* Return the else value of type TYPE that corresponds to the else value constant ELSVAL: the default definition of a fresh
2941 : temporary for MASK_LOAD_ELSE_UNDEFINED, minus one for MASK_LOAD_ELSE_M1 and zero for MASK_LOAD_ELSE_ZERO. Any other ELSVAL is unreachable. */
2942 :
2943 : tree
2944 1944 : vect_get_mask_load_else (int elsval, tree type)
2945 : {
2946 1944 : tree els;
2947 1944 : if (elsval == MASK_LOAD_ELSE_UNDEFINED)
2948 : {
2949 0 : tree tmp = create_tmp_var (type);
2950 : /* No need to warn about anything. */
2951 0 : TREE_NO_WARNING (tmp) = 1;
2952 0 : els = get_or_create_ssa_default_def (cfun, tmp);
2953 : }
2954 1944 : else if (elsval == MASK_LOAD_ELSE_M1)
2955 0 : els = build_minus_one_cst (type);
2956 1944 : else if (elsval == MASK_LOAD_ELSE_ZERO)
2957 1944 : els = build_zero_cst (type);
2958 : else
2959 0 : gcc_unreachable ();
2960 :
2961 1944 : return els;
2962 : }
2963 :
2964 : /* Build a gather load call to the function DECL while vectorizing STMT_INFO. Insert any conversion
2965 : statements that are needed before GSI. VECTYPE is the vector type the result is converted to and
2966 : SLP_NODE supplies the scale (SLP_TREE_GS_SCALE). If the load is conditional, MASK is the
2967 : vectorized condition, otherwise MASK is null. PTR is the base
2968 : pointer and OFFSET is the vectorized offset. The returned statement has not been inserted and has no lhs yet. */
2969 :
2970 : static gimple *
2971 346 : vect_build_one_gather_load_call (vec_info *vinfo, stmt_vec_info stmt_info,
2972 : slp_tree slp_node, tree vectype,
2973 : gimple_stmt_iterator *gsi, tree decl,
2974 : tree ptr, tree offset, tree mask)
2975 : {
2976 346 : tree arglist = TYPE_ARG_TYPES (TREE_TYPE (decl)); /* DECL takes (src, ptr, index, mask, scale). */
2977 346 : tree rettype = TREE_TYPE (TREE_TYPE (decl));
2978 346 : tree srctype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
2979 346 : /* ptrtype */ arglist = TREE_CHAIN (arglist);
2980 346 : tree idxtype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
2981 346 : tree masktype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
2982 346 : tree scaletype = TREE_VALUE (arglist);
2983 346 : tree var;
2984 346 : gcc_checking_assert (types_compatible_p (srctype, rettype)
2985 : && (!mask
2986 : || TREE_CODE (masktype) == INTEGER_TYPE
2987 : || types_compatible_p (srctype, masktype)));
2988 :
2989 346 : tree op = offset; /* Offset argument, view-converted to IDXTYPE below if the types differ. */
2990 346 : if (!useless_type_conversion_p (idxtype, TREE_TYPE (op)))
2991 : {
2992 100 : gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (TREE_TYPE (op)),
2993 : TYPE_VECTOR_SUBPARTS (idxtype)));
2994 100 : var = vect_get_new_ssa_name (idxtype, vect_simple_var);
2995 100 : op = build1 (VIEW_CONVERT_EXPR, idxtype, op);
2996 100 : gassign *new_stmt = gimple_build_assign (var, VIEW_CONVERT_EXPR, op);
2997 100 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
2998 100 : op = var;
2999 : }
3000 :
3001 346 : tree src_op = NULL_TREE;
3002 346 : tree mask_op = NULL_TREE;
3003 346 : if (mask)
3004 : {
3005 188 : if (!useless_type_conversion_p (masktype, TREE_TYPE (mask)))
3006 : {
3007 188 : tree utype, optype = TREE_TYPE (mask);
3008 188 : if (VECTOR_TYPE_P (masktype)
3009 188 : || TYPE_MODE (masktype) == TYPE_MODE (optype))
3010 : utype = masktype;
3011 : else
3012 6 : utype = lang_hooks.types.type_for_mode (TYPE_MODE (optype), 1);
3013 188 : var = vect_get_new_ssa_name (utype, vect_scalar_var);
3014 188 : tree mask_arg = build1 (VIEW_CONVERT_EXPR, utype, mask);
3015 188 : gassign *new_stmt
3016 188 : = gimple_build_assign (var, VIEW_CONVERT_EXPR, mask_arg);
3017 188 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
3018 188 : mask_arg = var;
3019 188 : if (!useless_type_conversion_p (masktype, utype)) /* UTYPE is narrower than MASKTYPE: convert with a NOP_EXPR. */
3020 : {
3021 6 : gcc_assert (TYPE_PRECISION (utype)
3022 : <= TYPE_PRECISION (masktype));
3023 6 : var = vect_get_new_ssa_name (masktype, vect_scalar_var);
3024 6 : new_stmt = gimple_build_assign (var, NOP_EXPR, mask_arg);
3025 6 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
3026 6 : mask_arg = var;
3027 : }
3028 188 : src_op = build_zero_cst (srctype);
3029 188 : mask_op = mask_arg;
3030 : }
3031 : else
3032 : {
3033 : src_op = mask; /* MASK already has the mask argument's type; it serves as both source and mask operand. */
3034 : mask_op = mask;
3035 : }
3036 : }
3037 : else
3038 : {
3039 158 : src_op = vect_build_zero_merge_argument (vinfo, stmt_info, rettype);
3040 158 : mask_op = vect_build_all_ones_mask (vinfo, stmt_info, masktype);
3041 : }
3042 :
3043 346 : tree scale = build_int_cst (scaletype, SLP_TREE_GS_SCALE (slp_node));
3044 346 : gimple *new_stmt = gimple_build_call (decl, 5, src_op, ptr, op,
3045 : mask_op, scale);
3046 :
3047 346 : if (!useless_type_conversion_p (vectype, rettype)) /* Insert the call here and return a view-convert of its result to VECTYPE instead. */
3048 : {
3049 49 : gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype),
3050 : TYPE_VECTOR_SUBPARTS (rettype)));
3051 49 : op = vect_get_new_ssa_name (rettype, vect_simple_var);
3052 49 : gimple_call_set_lhs (new_stmt, op);
3053 49 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
3054 49 : op = build1 (VIEW_CONVERT_EXPR, vectype, op);
3055 49 : new_stmt = gimple_build_assign (NULL_TREE, VIEW_CONVERT_EXPR, op);
3056 : }
3057 :
3058 346 : return new_stmt;
3059 : }
3060 :
3061 : /* Build a scatter store call to the function DECL while vectorizing STMT_INFO. Insert new
3062 : instructions before GSI. SLP_NODE supplies the scale (SLP_TREE_GS_SCALE).
3063 : PTR is the base pointer, OFFSET the vectorized offsets and OPRND the
3064 : vectorized data to store.
3065 : If the store is conditional, MASK is the vectorized condition, otherwise
3066 : MASK is null and an all-ones mask is used. The returned call has not been inserted. */
3067 :
3068 : static gimple *
3069 161 : vect_build_one_scatter_store_call (vec_info *vinfo, stmt_vec_info stmt_info,
3070 : slp_tree slp_node,
3071 : gimple_stmt_iterator *gsi,
3072 : tree decl,
3073 : tree ptr, tree offset, tree oprnd, tree mask)
3074 : {
3075 161 : tree rettype = TREE_TYPE (TREE_TYPE (decl));
3076 161 : tree arglist = TYPE_ARG_TYPES (TREE_TYPE (decl)); /* DECL takes (ptr, mask, index, src, scale). */
3077 161 : /* tree ptrtype = TREE_VALUE (arglist); */ arglist = TREE_CHAIN (arglist);
3078 161 : tree masktype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
3079 161 : tree idxtype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
3080 161 : tree srctype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
3081 161 : tree scaletype = TREE_VALUE (arglist);
3082 161 : gcc_checking_assert (TREE_CODE (masktype) == INTEGER_TYPE
3083 : && TREE_CODE (rettype) == VOID_TYPE);
3084 :
3085 161 : tree mask_arg = NULL_TREE;
3086 161 : if (mask)
3087 : {
3088 110 : mask_arg = mask;
3089 110 : tree optype = TREE_TYPE (mask_arg);
3090 110 : tree utype;
3091 110 : if (TYPE_MODE (masktype) == TYPE_MODE (optype))
3092 : utype = masktype;
3093 : else
3094 8 : utype = lang_hooks.types.type_for_mode (TYPE_MODE (optype), 1);
3095 110 : tree var = vect_get_new_ssa_name (utype, vect_scalar_var);
3096 110 : mask_arg = build1 (VIEW_CONVERT_EXPR, utype, mask_arg);
3097 110 : gassign *new_stmt
3098 110 : = gimple_build_assign (var, VIEW_CONVERT_EXPR, mask_arg);
3099 110 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
3100 110 : mask_arg = var;
3101 110 : if (!useless_type_conversion_p (masktype, utype)) /* UTYPE is narrower than MASKTYPE: convert with a NOP_EXPR. */
3102 : {
3103 8 : gcc_assert (TYPE_PRECISION (utype) <= TYPE_PRECISION (masktype));
3104 8 : tree var = vect_get_new_ssa_name (masktype, vect_scalar_var);
3105 8 : new_stmt = gimple_build_assign (var, NOP_EXPR, mask_arg);
3106 8 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
3107 8 : mask_arg = var;
3108 : }
3109 : }
3110 : else
3111 : {
3112 51 : mask_arg = build_int_cst (masktype, -1); /* Unconditional store: use an all-ones integer mask. */
3113 51 : mask_arg = vect_init_vector (vinfo, stmt_info, mask_arg, masktype, NULL);
3114 : }
3115 :
3116 161 : tree src = oprnd; /* Data argument, view-converted to SRCTYPE below if the types differ. */
3117 161 : if (!useless_type_conversion_p (srctype, TREE_TYPE (src)))
3118 : {
3119 0 : gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (TREE_TYPE (src)),
3120 : TYPE_VECTOR_SUBPARTS (srctype)));
3121 0 : tree var = vect_get_new_ssa_name (srctype, vect_simple_var);
3122 0 : src = build1 (VIEW_CONVERT_EXPR, srctype, src);
3123 0 : gassign *new_stmt = gimple_build_assign (var, VIEW_CONVERT_EXPR, src);
3124 0 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
3125 0 : src = var;
3126 : }
3127 :
3128 161 : tree op = offset; /* Offset argument, view-converted to IDXTYPE below if the types differ. */
3129 161 : if (!useless_type_conversion_p (idxtype, TREE_TYPE (op)))
3130 : {
3131 16 : gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (TREE_TYPE (op)),
3132 : TYPE_VECTOR_SUBPARTS (idxtype)));
3133 16 : tree var = vect_get_new_ssa_name (idxtype, vect_simple_var);
3134 16 : op = build1 (VIEW_CONVERT_EXPR, idxtype, op);
3135 16 : gassign *new_stmt = gimple_build_assign (var, VIEW_CONVERT_EXPR, op);
3136 16 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
3137 16 : op = var;
3138 : }
3139 :
3140 161 : tree scale = build_int_cst (scaletype, SLP_TREE_GS_SCALE (slp_node));
3141 161 : gcall *new_stmt
3142 161 : = gimple_build_call (decl, 5, ptr, mask_arg, op, src, scale);
3143 161 : return new_stmt;
3144 : }
3145 :
3146 : /* Prepare the base and offset of the gather/scatter access SLP_NODE for vectorization.
3147 : Set *DATAREF_PTR to the gimplified SLP_TREE_GS_BASE (SLP_NODE) and fill *VEC_OFFSET
3148 : with the vectorized defs of SLP_NODE's first child.
3149 : Any statements needed to compute the base are inserted on the preheader edge of LOOP,
3150 : which is asserted not to create a new basic block. */
3151 :
3152 : static void
3153 1236 : vect_get_gather_scatter_ops (class loop *loop, slp_tree slp_node,
3154 : tree *dataref_ptr, vec<tree> *vec_offset)
3155 : {
3156 1236 : gimple_seq stmts = NULL;
3157 1236 : *dataref_ptr = force_gimple_operand (SLP_TREE_GS_BASE (slp_node),
3158 : &stmts, true, NULL_TREE);
3159 1236 : if (stmts != NULL)
3160 : {
3161 1003 : basic_block new_bb;
3162 1003 : edge pe = loop_preheader_edge (loop);
3163 1003 : new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3164 1003 : gcc_assert (!new_bb);
3165 : }
3166 1236 : vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[0], vec_offset);
3167 1236 : }
3168 :
3169 : /* Prepare to implement a grouped or strided load or store using
3170 : a gather load, scatter store or strided load/store internal function.
3171 : STMT_INFO is the load or store statement, NODE supplies the scale (SLP_TREE_GS_SCALE), VECTYPE is the data vector type and OFFSET_VECTYPE the offset vector type.
3172 :
3173 : Set *DATAREF_BUMP to the amount that should be added to the base
3174 : address after each copy of the vectorized statement: DR_STEP times the loop length (computed before GSI from LOOP_LENS) when LOOP_VINFO uses .SELECT_VL, otherwise DR_STEP times the number of elements of VECTYPE. Set *VEC_OFFSET
3175 : to the gimplified DR_STEP if IFN_MASK_LEN_STRIDED_LOAD/STORE is supported for VECTYPE, otherwise to an invariant offset vector in which element I has the value
3176 : I * DR_STEP / SCALE. */
3177 :
3178 : static void
3179 0 : vect_get_strided_load_store_ops (stmt_vec_info stmt_info, slp_tree node,
3180 : tree vectype, tree offset_vectype,
3181 : loop_vec_info loop_vinfo,
3182 : gimple_stmt_iterator *gsi,
3183 : tree *dataref_bump, tree *vec_offset,
3184 : vec_loop_lens *loop_lens)
3185 : {
3186 0 : struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3187 :
3188 0 : if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
3189 : {
3190 : /* _31 = .SELECT_VL (ivtmp_29, POLY_INT_CST [4, 4]);
3191 : ivtmp_8 = _31 * 16 (step in bytes);
3192 : .MASK_LEN_SCATTER_STORE (vectp_a.9_7, ... );
3193 : vectp_a.9_26 = vectp_a.9_7 + ivtmp_8; */
3194 0 : tree loop_len
3195 0 : = vect_get_loop_len (loop_vinfo, gsi, loop_lens, 1, vectype, 0, 0, true);
3196 0 : tree tmp
3197 0 : = fold_build2 (MULT_EXPR, sizetype,
3198 : fold_convert (sizetype, unshare_expr (DR_STEP (dr))),
3199 : loop_len);
3200 0 : *dataref_bump = force_gimple_operand_gsi (gsi, tmp, true, NULL_TREE, true,
3201 : GSI_SAME_STMT);
3202 : }
3203 : else
3204 : {
3205 0 : tree bump
3206 0 : = size_binop (MULT_EXPR,
3207 : fold_convert (sizetype, unshare_expr (DR_STEP (dr))),
3208 : size_int (TYPE_VECTOR_SUBPARTS (vectype)));
3209 0 : *dataref_bump = cse_and_gimplify_to_preheader (loop_vinfo, bump);
3210 : }
3211 :
3212 0 : internal_fn ifn
3213 0 : = DR_IS_READ (dr) ? IFN_MASK_LEN_STRIDED_LOAD : IFN_MASK_LEN_STRIDED_STORE;
3214 0 : if (direct_internal_fn_supported_p (ifn, vectype, OPTIMIZE_FOR_SPEED)) /* Strided IFN available: the offset is just the scalar step. */
3215 : {
3216 0 : *vec_offset = cse_and_gimplify_to_preheader (loop_vinfo,
3217 : unshare_expr (DR_STEP (dr)));
3218 0 : return;
3219 : }
3220 :
3221 : /* The scalar offset can have pointer type, so use the element
3222 : type of the offset vector instead. */
3223 0 : tree offset_type = TREE_TYPE (offset_vectype);
3224 :
3225 : /* Calculate X = DR_STEP / SCALE and convert it to the appropriate type. */
3226 0 : tree step = size_binop (EXACT_DIV_EXPR, unshare_expr (DR_STEP (dr)),
3227 : ssize_int (SLP_TREE_GS_SCALE (node)));
3228 0 : step = fold_convert (offset_type, step);
3229 :
3230 : /* Create {0, X, X*2, X*3, ...}. */
3231 0 : tree offset = fold_build2 (VEC_SERIES_EXPR, offset_vectype,
3232 : build_zero_cst (offset_type), step);
3233 0 : *vec_offset = cse_and_gimplify_to_preheader (loop_vinfo, offset);
3234 : }
3235 :
3236 : /* Compute the increment for a pointer IV that needs to be updated by a variable amount.
3237 : The variable amount is derived from the outcome of .SELECT_VL, which lets
3238 : each iteration process a flexible number of elements as long as
3239 : that number is <= VF elements.
3240 :
3241 : Return an SSA name holding the loop length (obtained via vect_get_loop_len for AGGR_TYPE and LOOP_LENS) multiplied by the step of DR_INFO.
3242 : The statement computing it is inserted before GSI. MEMORY_ACCESS_TYPE must not be a gather/scatter type. */
3243 :
3244 : static tree
3245 0 : vect_get_loop_variant_data_ptr_increment (
3246 : vec_info *vinfo, tree aggr_type, gimple_stmt_iterator *gsi,
3247 : vec_loop_lens *loop_lens, dr_vec_info *dr_info,
3248 : vect_memory_access_type memory_access_type)
3249 : {
3250 0 : loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo);
3251 0 : tree step = vect_dr_behavior (vinfo, dr_info)->step;
3252 :
3253 : /* gather/scatter never reach here. */
3254 0 : gcc_assert (!mat_gather_scatter_p (memory_access_type));
3255 :
3256 : /* When we support the SELECT_VL pattern, we dynamically adjust
3257 : the memory address by the .SELECT_VL result.
3258 :
3259 : The result of .SELECT_VL is the number of elements to
3260 : be processed in each iteration. So the memory address
3261 : adjustment operation should be:
3262 :
3263 : addr = addr + .SELECT_VL (ARG..) * step;
3264 : */
3265 0 : tree loop_len
3266 0 : = vect_get_loop_len (loop_vinfo, gsi, loop_lens, 1, aggr_type, 0, 0, true);
3267 0 : tree len_type = TREE_TYPE (loop_len);
3268 : /* Since the outcome of .SELECT_VL is an element count, we should convert
3269 : it into a byte size so that it can be used to adjust the address pointer
3270 : IVs by a variable amount. */
3271 0 : tree tmp = fold_build2 (MULT_EXPR, len_type, loop_len,
3272 : wide_int_to_tree (len_type, wi::to_widest (step)));
3273 0 : tree bump = make_temp_ssa_name (len_type, NULL, "ivtmp");
3274 0 : gassign *assign = gimple_build_assign (bump, tmp);
3275 0 : gsi_insert_before (gsi, assign, GSI_SAME_STMT);
3276 0 : return bump;
3277 : }
3278 :
3279 : /* Return the amount that should be added to a vector pointer to move
3280 : to the next or previous copy of AGGR_TYPE. DR_INFO is the data reference
3281 : being vectorized and MEMORY_ACCESS_TYPE describes the type of
3282 : vectorization. */
3283 :
3284 : static tree
3285 703494 : vect_get_data_ptr_increment (vec_info *vinfo, gimple_stmt_iterator *gsi,
3286 : dr_vec_info *dr_info, tree aggr_type,
3287 : vect_memory_access_type memory_access_type,
3288 : vec_loop_lens *loop_lens)
3289 : {
3290 703494 : if (memory_access_type == VMAT_INVARIANT)
3291 0 : return size_zero_node;
3292 :
3293 703494 : loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo);
3294 134869 : if (loop_vinfo && LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
3295 0 : return vect_get_loop_variant_data_ptr_increment (vinfo, aggr_type, gsi,
3296 : loop_lens, dr_info,
3297 0 : memory_access_type);
3298 :
3299 703494 : tree iv_step = TYPE_SIZE_UNIT (aggr_type);
3300 703494 : tree step = vect_dr_behavior (vinfo, dr_info)->step;
3301 703494 : if (tree_int_cst_sgn (step) == -1)
3302 2842 : iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
3303 : return iv_step;
3304 : }
3305 :
3306 : /* Check and perform vectorization of BUILT_IN_BSWAP{16,32,64,128}. */
3307 :
3308 : static bool
3309 126 : vectorizable_bswap (vec_info *vinfo,
3310 : stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
3311 : slp_tree slp_node,
3312 : slp_tree *slp_op,
3313 : tree vectype_in, stmt_vector_for_cost *cost_vec)
3314 : {
3315 126 : tree op, vectype;
3316 126 : gcall *stmt = as_a <gcall *> (stmt_info->stmt);
3317 :
3318 126 : op = gimple_call_arg (stmt, 0);
3319 126 : vectype = SLP_TREE_VECTYPE (slp_node);
3320 126 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3321 :
3322 126 : if (TYPE_SIZE (vectype_in) != TYPE_SIZE (vectype))
3323 : {
3324 0 : if (dump_enabled_p ())
3325 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3326 : "mismatched vector sizes %T and %T\n",
3327 : vectype_in, vectype);
3328 0 : return false;
3329 : }
3330 :
3331 126 : tree char_vectype = get_same_sized_vectype (char_type_node, vectype_in);
3332 126 : if (! char_vectype)
3333 : return false;
3334 :
3335 126 : poly_uint64 num_bytes = TYPE_VECTOR_SUBPARTS (char_vectype);
3336 126 : unsigned word_bytes;
3337 126 : if (!constant_multiple_p (num_bytes, nunits, &word_bytes))
3338 : return false;
3339 :
3340 : /* The encoding uses one stepped pattern for each byte in the word. */
3341 126 : vec_perm_builder elts (num_bytes, word_bytes, 3);
3342 504 : for (unsigned i = 0; i < 3; ++i)
3343 2274 : for (unsigned j = 0; j < word_bytes; ++j)
3344 1896 : elts.quick_push ((i + 1) * word_bytes - j - 1);
3345 :
3346 126 : vec_perm_indices indices (elts, 1, num_bytes);
3347 126 : machine_mode vmode = TYPE_MODE (char_vectype);
3348 126 : if (!can_vec_perm_const_p (vmode, vmode, indices))
3349 : return false;
3350 :
3351 59 : if (cost_vec)
3352 : {
3353 47 : if (!vect_maybe_update_slp_op_vectype (slp_op[0], vectype_in))
3354 : {
3355 0 : if (dump_enabled_p ())
3356 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3357 : "incompatible vector types for invariants\n");
3358 0 : return false;
3359 : }
3360 :
3361 47 : SLP_TREE_TYPE (slp_node) = call_vec_info_type;
3362 47 : DUMP_VECT_SCOPE ("vectorizable_bswap");
3363 47 : record_stmt_cost (cost_vec,
3364 : 1, vector_stmt, slp_node, 0, vect_prologue);
3365 47 : record_stmt_cost (cost_vec,
3366 47 : vect_get_num_copies (vinfo, slp_node),
3367 : vec_perm, slp_node, 0, vect_body);
3368 47 : return true;
3369 : }
3370 :
3371 12 : tree bswap_vconst = vec_perm_indices_to_tree (char_vectype, indices);
3372 :
3373 : /* Transform. */
3374 12 : vec<tree> vec_oprnds = vNULL;
3375 12 : vect_get_vec_defs (vinfo, slp_node, op, &vec_oprnds);
3376 : /* Arguments are ready. create the new vector stmt. */
3377 12 : unsigned i;
3378 12 : tree vop;
3379 24 : FOR_EACH_VEC_ELT (vec_oprnds, i, vop)
3380 : {
3381 12 : gimple *new_stmt;
3382 12 : tree tem = make_ssa_name (char_vectype);
3383 12 : new_stmt = gimple_build_assign (tem, build1 (VIEW_CONVERT_EXPR,
3384 : char_vectype, vop));
3385 12 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
3386 12 : tree tem2 = make_ssa_name (char_vectype);
3387 12 : new_stmt = gimple_build_assign (tem2, VEC_PERM_EXPR,
3388 : tem, tem, bswap_vconst);
3389 12 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
3390 12 : tem = make_ssa_name (vectype);
3391 12 : new_stmt = gimple_build_assign (tem, build1 (VIEW_CONVERT_EXPR,
3392 : vectype, tem2));
3393 12 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
3394 12 : slp_node->push_vec_def (new_stmt);
3395 : }
3396 :
3397 12 : vec_oprnds.release ();
3398 12 : return true;
3399 126 : }
3400 :
3401 : /* Return true if vector types VECTYPE_IN and VECTYPE_OUT have
3402 : integer elements and if we can narrow VECTYPE_IN to VECTYPE_OUT
3403 : in a single step. On success, store the binary pack code in
3404 : *CONVERT_CODE. */
3405 :
3406 : static bool
3407 184 : simple_integer_narrowing (tree vectype_out, tree vectype_in,
3408 : code_helper *convert_code)
3409 : {
3410 368 : if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype_out))
3411 368 : || !INTEGRAL_TYPE_P (TREE_TYPE (vectype_in)))
3412 : return false;
3413 :
3414 74 : code_helper code;
3415 74 : int multi_step_cvt = 0;
3416 74 : auto_vec <tree, 8> interm_types;
3417 107 : if (!supportable_narrowing_operation (NOP_EXPR, vectype_out, vectype_in,
3418 : &code, &multi_step_cvt, &interm_types)
3419 74 : || multi_step_cvt)
3420 33 : return false;
3421 :
3422 41 : *convert_code = code;
3423 41 : return true;
3424 74 : }
3425 :
3426 : /* Function vectorizable_call.
3427 :
3428 : Check if STMT_INFO performs a function call that can be vectorized, using either an internal function or a target-specific built-in; SLP_NODE supplies the output vector type and the operand definitions.
3429 : If COST_VEC is passed, calculate costs but don't change anything in the IL (this analysis may still record a loop mask or length, or clear LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P),
3430 : otherwise, vectorize STMT_INFO: create the vectorized stmts, insert them
3431 : at GSI and push the resulting vector defs onto SLP_NODE.
3432 : Return true if STMT_INFO is vectorizable in this way. */
3433 :
3434 : static bool
3435 2667715 : vectorizable_call (vec_info *vinfo,
3436 : stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
3437 : slp_tree slp_node,
3438 : stmt_vector_for_cost *cost_vec)
3439 : {
3440 2667715 : gcall *stmt;
3441 2667715 : tree vec_dest;
3442 2667715 : tree scalar_dest;
3443 2667715 : tree op;
3444 2667715 : tree vec_oprnd0 = NULL_TREE;
3445 2667715 : tree vectype_out, vectype_in;
3446 2667715 : poly_uint64 nunits_in;
3447 2667715 : poly_uint64 nunits_out;
3448 2667715 : loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
3449 2667715 : bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
3450 2667715 : tree fndecl, new_temp, rhs_type;
3451 2667715 : enum vect_def_type dt[5]
3452 : = { vect_unknown_def_type, vect_unknown_def_type, vect_unknown_def_type,
3453 : vect_unknown_def_type, vect_unknown_def_type };
3454 2667715 : tree vectypes[ARRAY_SIZE (dt)] = {};
3455 2667715 : slp_tree slp_op[ARRAY_SIZE (dt)] = {};
3456 2667715 : auto_vec<tree, 8> vargs;
3457 2667715 : enum { NARROW, NONE, WIDEN } modifier; /* How the lane counts of VECTYPE_IN and VECTYPE_OUT relate; set below. */
3458 2667715 : size_t i, nargs;
3459 2667715 : tree clz_ctz_arg1 = NULL_TREE; /* Second argument of IFN_CLZ/IFN_CTZ, if any; appended unchanged to each vector call. */
3460 :
3461 2667715 : if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
3462 : return false;
3463 :
3464 2667715 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
3465 234683 : && cost_vec)
3466 : return false;
3467 :
3468 : /* Is STMT_INFO a vectorizable call? */
3469 2681451 : stmt = dyn_cast <gcall *> (stmt_info->stmt);
3470 24880 : if (!stmt)
3471 : return false;
3472 :
3473 24880 : if (gimple_call_internal_p (stmt)
3474 24880 : && (internal_load_fn_p (gimple_call_internal_fn (stmt))
3475 16468 : || internal_store_fn_p (gimple_call_internal_fn (stmt))))
3476 : /* Handled by vectorizable_load and vectorizable_store. */
3477 3815 : return false;
3478 :
3479 21065 : if (gimple_call_lhs (stmt) == NULL_TREE
3480 21065 : || TREE_CODE (gimple_call_lhs (stmt)) != SSA_NAME)
3481 : return false;
3482 :
3483 21059 : gcc_checking_assert (!stmt_can_throw_internal (cfun, stmt));
3484 :
3485 21059 : vectype_out = SLP_TREE_VECTYPE (slp_node);
3486 :
3487 : /* Process function arguments. */
3488 21059 : rhs_type = NULL_TREE;
3489 21059 : vectype_in = NULL_TREE;
3490 21059 : nargs = gimple_call_num_args (stmt);
3491 :
3492 : /* Bail out if the function has more than five arguments (the size of DT,
3493 : VECTYPES and SLP_OP above), we do not have interesting builtin functions to vectorize with more than two arguments
3494 : except for fma (cond_fma has more). No arguments is also not good. */
3495 21059 : if (nargs == 0 || nargs > 5)
3496 : return false;
3497 :
3498 : /* Ignore the arguments of IFN_GOMP_SIMD_LANE, they are magic. */
3499 20979 : combined_fn cfn = gimple_call_combined_fn (stmt);
3500 20979 : if (cfn == CFN_GOMP_SIMD_LANE)
3501 : {
3502 3207 : nargs = 0;
3503 3207 : rhs_type = unsigned_type_node;
3504 : }
3505 : /* Similarly pretend IFN_CLZ and IFN_CTZ only has one argument, the second
3506 : argument just says whether it is well-defined at zero or not and what
3507 : value should be returned for it. */
3508 20979 : if ((cfn == CFN_CLZ || cfn == CFN_CTZ) && nargs == 2)
3509 : {
3510 168 : nargs = 1;
3511 168 : clz_ctz_arg1 = gimple_call_arg (stmt, 1);
3512 : }
3513 :
3514 20979 : int mask_opno = -1;
3515 20979 : if (internal_fn_p (cfn))
3516 : {
3517 : /* We can only handle direct internal masked calls here,
3518 : vectorizable_simd_clone_call is for the rest. */
3519 17985 : if (cfn == CFN_MASK_CALL)
3520 : return false;
3521 17831 : mask_opno = internal_fn_mask_index (as_internal_fn (cfn));
3522 : }
3523 :
3524 65773 : for (i = 0; i < nargs; i++)
3525 : {
3526 46234 : if ((int) i == mask_opno)
3527 : {
3528 7694 : if (!vect_check_scalar_mask (vinfo, slp_node, mask_opno,
3529 : &slp_op[i], &dt[i], &vectypes[i]))
3530 : return false;
3531 7694 : continue;
3532 : }
3533 :
3534 38540 : if (!vect_is_simple_use (vinfo, slp_node,
3535 : i, &op, &slp_op[i], &dt[i], &vectypes[i]))
3536 : {
3537 0 : if (dump_enabled_p ())
3538 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3539 : "use not simple.\n");
3540 0 : return false;
3541 : }
3542 :
3543 : /* We can only handle calls with arguments of the same type. */
3544 38540 : if (rhs_type
3545 38540 : && !types_compatible_p (rhs_type, TREE_TYPE (op)))
3546 : {
3547 1286 : if (dump_enabled_p ())
3548 200 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3549 : "argument types differ.\n");
3550 1286 : return false;
3551 : }
3552 37254 : if (!rhs_type)
3553 17618 : rhs_type = TREE_TYPE (op);
3554 :
3555 37254 : if (!vectype_in)
3556 18158 : vectype_in = vectypes[i];
3557 19096 : else if (vectypes[i]
3558 19096 : && !types_compatible_p (vectypes[i], vectype_in))
3559 : {
3560 0 : if (dump_enabled_p ())
3561 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3562 : "argument vector types differ.\n");
3563 0 : return false;
3564 : }
3565 : }
3566 : /* If all arguments are external or constant defs, infer the vector type
3567 : from the scalar type. */
3568 19539 : if (!vectype_in)
3569 5504 : vectype_in = get_vectype_for_scalar_type (vinfo, rhs_type, slp_node);
3570 19539 : if (!cost_vec)
3571 4193 : gcc_assert (vectype_in);
3572 15346 : if (!vectype_in)
3573 : {
3574 1029 : if (dump_enabled_p ())
3575 4 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3576 : "no vectype for scalar type %T\n", rhs_type);
3577 :
3578 1029 : return false;
3579 : }
3580 :
3581 37020 : if (VECTOR_BOOLEAN_TYPE_P (vectype_out)
3582 18510 : != VECTOR_BOOLEAN_TYPE_P (vectype_in))
3583 : {
3584 12 : if (dump_enabled_p ())
3585 12 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3586 : "mixed mask and nonmask vector types\n");
3587 12 : return false;
3588 : }
3589 :
3590 18498 : if (vect_emulated_vector_p (vectype_in)
3591 18498 : || vect_emulated_vector_p (vectype_out))
3592 : {
3593 0 : if (dump_enabled_p ())
3594 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3595 : "use emulated vector type for call\n");
3596 0 : return false;
3597 : }
3598 :
3599 : /* FORNOW: the input and output lane counts must be equal or differ by exactly a factor of two. */
3600 18498 : nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
3601 18498 : nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
3602 18498 : if (known_eq (nunits_in * 2, nunits_out))
3603 : modifier = NARROW;
3604 17927 : else if (known_eq (nunits_out, nunits_in))
3605 : modifier = NONE;
3606 45 : else if (known_eq (nunits_out * 2, nunits_in))
3607 : modifier = WIDEN;
3608 : else
3609 : return false;
3610 :
3611 : /* We only handle functions that do not read or clobber memory. */
3612 36996 : if (gimple_vuse (stmt))
3613 : {
3614 1218 : if (dump_enabled_p ())
3615 14 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3616 : "function reads from or writes to memory.\n");
3617 1218 : return false;
3618 : }
3619 :
3620 : /* For now, we only vectorize functions if an internal function or a target specific builtin
3621 : is available. TODO -- in some cases, it might be profitable to
3622 : insert the calls for pieces of the vector, in order to be able
3623 : to vectorize other operations in the loop. */
3624 17280 : fndecl = NULL_TREE;
3625 17280 : internal_fn ifn = IFN_LAST;
3626 17280 : tree callee = gimple_call_fndecl (stmt);
3627 :
3628 : /* First try using an internal function. */
3629 17280 : code_helper convert_code = MAX_TREE_CODES;
3630 17280 : if (cfn != CFN_LAST
3631 17280 : && (modifier == NONE
3632 196 : || (modifier == NARROW
3633 184 : && simple_integer_narrowing (vectype_out, vectype_in,
3634 : &convert_code))))
3635 16255 : ifn = vectorizable_internal_function (cfn, callee, vectype_out,
3636 : vectype_in);
3637 :
3638 : /* Check if the operation traps. */
3639 17280 : bool could_trap = gimple_could_trap_p (STMT_VINFO_STMT (stmt_info));
3640 17280 : if (could_trap && cost_vec && loop_vinfo)
3641 : {
3642 : /* If the operation can trap it must be conditional, otherwise fail. */
3643 474 : internal_fn cond_fn = (internal_fn_mask_index (ifn) != -1
3644 474 : ? ifn : get_conditional_internal_fn (ifn));
3645 474 : internal_fn cond_len_fn = get_len_internal_fn (cond_fn);
3646 474 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
3647 : {
3648 : /* We assume that BB SLP fills all lanes, so no inactive lanes can
3649 : cause issues. */
3650 84 : if ((cond_fn == IFN_LAST
3651 56 : || !direct_internal_fn_supported_p (cond_fn, vectype_out,
3652 : OPTIMIZE_FOR_SPEED))
3653 140 : && (cond_len_fn == IFN_LAST
3654 56 : || !direct_internal_fn_supported_p (cond_len_fn, vectype_out,
3655 : OPTIMIZE_FOR_SPEED)))
3656 : {
3657 84 : if (dump_enabled_p ())
3658 10 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3659 : "can't use a fully-masked loop because no"
3660 : " conditional operation is available.\n");
3661 84 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3662 : }
3663 : }
3664 : }
3665 :
3666 : /* If that fails, try asking for a target-specific built-in function. */
3667 17280 : if (ifn == IFN_LAST)
3668 : {
3669 9849 : if (cfn != CFN_LAST)
3670 8979 : fndecl = targetm.vectorize.builtin_vectorized_function
3671 8979 : (cfn, vectype_out, vectype_in);
3672 870 : else if (callee && fndecl_built_in_p (callee, BUILT_IN_MD))
3673 24 : fndecl = targetm.vectorize.builtin_md_vectorized_function
3674 24 : (callee, vectype_out, vectype_in);
3675 : }
3676 :
3677 17280 : if (ifn == IFN_LAST && !fndecl)
3678 : {
3679 9469 : if (cfn == CFN_GOMP_SIMD_LANE
3680 3207 : && SLP_TREE_LANES (slp_node) == 1
3681 3207 : && loop_vinfo
3682 3207 : && LOOP_VINFO_LOOP (loop_vinfo)->simduid
3683 3207 : && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
3684 15883 : && LOOP_VINFO_LOOP (loop_vinfo)->simduid
3685 3207 : == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))
3686 : {
3687 : /* We can handle IFN_GOMP_SIMD_LANE by returning a
3688 : { 0, 1, 2, ... vf - 1 } vector. */
3689 3207 : gcc_assert (nargs == 0);
3690 : }
3691 6262 : else if (modifier == NONE
3692 6262 : && (gimple_call_builtin_p (stmt, BUILT_IN_BSWAP16)
3693 5924 : || gimple_call_builtin_p (stmt, BUILT_IN_BSWAP32)
3694 5869 : || gimple_call_builtin_p (stmt, BUILT_IN_BSWAP64)
3695 5824 : || gimple_call_builtin_p (stmt, BUILT_IN_BSWAP128)))
3696 126 : return vectorizable_bswap (vinfo, stmt_info, gsi, slp_node,
3697 126 : slp_op, vectype_in, cost_vec);
3698 : else
3699 : {
3700 6136 : if (dump_enabled_p ())
3701 274 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3702 : "function is not vectorizable.\n");
3703 6136 : return false;
3704 : }
3705 : }
3706 :
3707 11018 : int reduc_idx = SLP_TREE_REDUC_IDX (slp_node);
3708 11018 : internal_fn cond_fn = (internal_fn_mask_index (ifn) != -1
3709 11018 : ? ifn : get_conditional_internal_fn (ifn));
3710 11018 : internal_fn cond_len_fn = get_len_internal_fn (cond_fn);
3711 11018 : vec_loop_masks *masks = (loop_vinfo ? &LOOP_VINFO_MASKS (loop_vinfo) : NULL);
3712 9158 : vec_loop_lens *lens = (loop_vinfo ? &LOOP_VINFO_LENS (loop_vinfo) : NULL);
3713 11018 : unsigned int nvectors = vect_get_num_copies (vinfo, slp_node);
3714 11018 : if (cost_vec) /* transformation not required. */
3715 : {
3716 21685 : for (i = 0; i < nargs; ++i)
3717 14848 : if (!vect_maybe_update_slp_op_vectype (slp_op[i],
3718 14848 : vectypes[i]
3719 : ? vectypes[i] : vectype_in))
3720 : {
3721 0 : if (dump_enabled_p ())
3722 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3723 : "incompatible vector types for invariants\n");
3724 0 : return false;
3725 : }
3726 6837 : SLP_TREE_TYPE (slp_node) = call_vec_info_type;
3727 6837 : DUMP_VECT_SCOPE ("vectorizable_call");
3728 6837 : vect_model_simple_cost (vinfo, 1, slp_node, cost_vec);
3729 :
3730 6837 : if (loop_vinfo
3731 5903 : && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3732 4056 : && (reduc_idx >= 0 || could_trap || mask_opno >= 0))
3733 : {
3734 2558 : if (reduc_idx >= 0
3735 1631 : && (cond_fn == IFN_LAST
3736 1631 : || !direct_internal_fn_supported_p (cond_fn, vectype_out,
3737 : OPTIMIZE_FOR_SPEED))
3738 2570 : && (cond_len_fn == IFN_LAST
3739 12 : || !direct_internal_fn_supported_p (cond_len_fn, vectype_out,
3740 : OPTIMIZE_FOR_SPEED)))
3741 : {
3742 12 : if (dump_enabled_p ())
3743 6 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3744 : "can't use a fully-masked loop because no"
3745 : " conditional operation is available.\n");
3746 12 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3747 : }
3748 : else
3749 : {
3750 2546 : tree scalar_mask = NULL_TREE;
3751 2546 : if (mask_opno >= 0)
3752 2546 : scalar_mask = gimple_call_arg (stmt_info->stmt, mask_opno);
3753 2546 : if (cond_len_fn != IFN_LAST
3754 2546 : && direct_internal_fn_supported_p (cond_len_fn, vectype_out,
3755 : OPTIMIZE_FOR_SPEED))
3756 0 : vect_record_loop_len (loop_vinfo, lens, nvectors, vectype_out,
3757 : 1);
3758 : else
3759 2546 : vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype_out,
3760 : scalar_mask);
3761 : }
3762 : }
3763 6837 : return true;
3764 : }
3765 :
3766 : /* Transform. */
3767 :
3768 4181 : if (dump_enabled_p ())
3769 416 : dump_printf_loc (MSG_NOTE, vect_location, "transform call.\n");
3770 :
3771 : /* Handle def. */
3772 4181 : scalar_dest = gimple_call_lhs (stmt);
3773 4181 : vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
3774 :
3775 4181 : bool masked_loop_p = loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
3776 3255 : bool len_loop_p = loop_vinfo && LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo);
3777 4181 : unsigned int vect_nargs = nargs;
3778 4181 : if (len_loop_p && (reduc_idx >= 0 || could_trap || mask_opno >= 0))
3779 : {
3780 0 : ifn = cond_len_fn;
3781 : /* COND_* -> COND_LEN_* takes 2 extra arguments: LEN, BIAS. */
3782 0 : vect_nargs += 2;
3783 : /* But unless there's a mask argument already we need that
3784 : as well, and an else value. */
3785 0 : if (mask_opno == -1)
3786 0 : vect_nargs += 2;
3787 : }
3788 4181 : else if (masked_loop_p && mask_opno == -1 && (reduc_idx >= 0 || could_trap))
3789 : {
3790 0 : ifn = cond_fn;
3791 0 : vect_nargs += 2;
3792 : }
3793 4181 : int len_opno = internal_fn_len_index (ifn);
3794 4181 : if (clz_ctz_arg1)
3795 59 : ++vect_nargs;
3796 :
3797 4181 : if (modifier == NONE || ifn != IFN_LAST)
3798 : {
3799 4149 : tree prev_res = NULL_TREE;
3800 4149 : vargs.safe_grow (vect_nargs, true);
3801 4149 : auto_vec<vec<tree> > vec_defs (nargs);
3802 :
3803 : /* Build argument list for the vectorized call. */
3804 4149 : if (cfn == CFN_GOMP_SIMD_LANE)
3805 : {
3806 3308 : for (i = 0; i < nvectors; ++i)
3807 : {
3808 : /* ??? For multi-lane SLP we'd need to build
3809 : { 0, 0, .., 1, 1, ... }. */
3810 1708 : tree cst = build_index_vector (vectype_out,
3811 : i * nunits_out, 1);
3812 1708 : tree new_var
3813 1708 : = vect_get_new_ssa_name (vectype_out, vect_simple_var, "cst_");
3814 1708 : gimple *init_stmt = gimple_build_assign (new_var, cst);
3815 1708 : vect_init_vector_1 (vinfo, stmt_info, init_stmt, NULL);
3816 1708 : new_temp = make_ssa_name (vec_dest);
3817 1708 : gimple *new_stmt = gimple_build_assign (new_temp, new_var);
3818 1708 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
3819 1708 : slp_node->push_vec_def (new_stmt);
3820 : }
3821 : }
3822 : else
3823 : {
3824 2549 : vec<tree> vec_oprnds0;
3825 2549 : vect_get_slp_defs (vinfo, slp_node, &vec_defs);
3826 2549 : vec_oprnds0 = vec_defs[0];
3827 :
3828 : /* Arguments are ready. Create the new vector stmt. */
3829 5251 : FOR_EACH_VEC_ELT (vec_oprnds0, i, vec_oprnd0)
3830 : {
3831 2702 : int varg = 0;
3832 : /* Add the mask if necessary. */
3833 38 : if ((masked_loop_p || len_loop_p) && mask_opno == -1
3834 2704 : && internal_fn_mask_index (ifn) != -1)
3835 : {
3836 0 : gcc_assert (internal_fn_mask_index (ifn) == varg);
3837 0 : if (masked_loop_p)
3838 : {
3839 0 : unsigned int vec_num = vec_oprnds0.length ();
3840 0 : vargs[varg++] = vect_get_loop_mask (loop_vinfo, gsi,
3841 : masks, vec_num,
3842 : vectype_out, i);
3843 : }
3844 : else
3845 : {
3846 0 : tree mask_vectype = truth_type_for (vectype_out);
3847 0 : vargs[varg++] = vect_build_all_ones_mask (loop_vinfo,
3848 : stmt_info,
3849 : mask_vectype);
3850 : }
3851 : }
3852 : size_t k;
3853 9911 : for (k = 0; k < nargs; k++)
3854 : {
3855 7209 : vec<tree> vec_oprndsk = vec_defs[k];
3856 7209 : vargs[varg++] = vec_oprndsk[i];
3857 : }
3858 : /* Add the else value if necessary. */
3859 38 : if ((masked_loop_p || len_loop_p) && mask_opno == -1
3860 2704 : && internal_fn_else_index (ifn) != -1)
3861 : {
3862 0 : gcc_assert (internal_fn_else_index (ifn) == varg);
3863 0 : if (reduc_idx >= 0)
3864 0 : vargs[varg++] = vargs[reduc_idx + 1];
3865 : else
3866 : {
3867 0 : auto else_value = targetm.preferred_else_value
3868 0 : (ifn, vectype_out, varg - 1, &vargs[1]);
3869 0 : vargs[varg++] = else_value;
3870 : }
3871 : }
3872 2702 : if (clz_ctz_arg1)
3873 59 : vargs[varg++] = clz_ctz_arg1;
3874 :
3875 2702 : gimple *new_stmt;
3876 2702 : if (modifier == NARROW)
3877 : {
3878 : /* We don't define any narrowing conditional functions
3879 : at present. */
3880 0 : gcc_assert (mask_opno < 0);
3881 0 : tree half_res = make_ssa_name (vectype_in);
3882 0 : gcall *call = gimple_build_call_internal_vec (ifn, vargs);
3883 0 : gimple_call_set_lhs (call, half_res);
3884 0 : gimple_call_set_nothrow (call, true);
3885 0 : vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
3886 0 : if ((i & 1) == 0) /* Even copy: remember this half and wait for its partner before packing. */
3887 : {
3888 0 : prev_res = half_res;
3889 0 : continue;
3890 : }
3891 0 : new_temp = make_ssa_name (vec_dest);
3892 0 : new_stmt = vect_gimple_build (new_temp, convert_code,
3893 : prev_res, half_res);
3894 0 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
3895 : }
3896 : else
3897 : {
3898 2702 : if (len_opno >= 0 && len_loop_p)
3899 : {
3900 0 : unsigned int vec_num = vec_oprnds0.length ();
3901 0 : tree len = vect_get_loop_len (loop_vinfo, gsi, lens,
3902 : vec_num, vectype_out, i, 1, true);
3903 0 : signed char biasval
3904 0 : = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
3905 0 : tree bias = build_int_cst (intQI_type_node, biasval);
3906 0 : vargs[len_opno] = len;
3907 0 : vargs[len_opno + 1] = bias;
3908 : }
3909 2702 : else if (mask_opno >= 0 && masked_loop_p)
3910 : {
3911 36 : unsigned int vec_num = vec_oprnds0.length ();
3912 36 : tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
3913 : vec_num, vectype_out, i);
3914 36 : vargs[mask_opno]
3915 72 : = prepare_vec_mask (loop_vinfo, TREE_TYPE (mask), mask,
3916 36 : vargs[mask_opno], gsi);
3917 : }
3918 :
3919 2702 : gcall *call;
3920 2702 : if (ifn != IFN_LAST)
3921 2621 : call = gimple_build_call_internal_vec (ifn, vargs);
3922 : else
3923 81 : call = gimple_build_call_vec (fndecl, vargs);
3924 2702 : new_temp = make_ssa_name (vec_dest, call);
3925 2702 : gimple_call_set_lhs (call, new_temp);
3926 2702 : gimple_call_set_nothrow (call, true);
3927 2702 : vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
3928 2702 : new_stmt = call;
3929 : }
3930 2702 : slp_node->push_vec_def (new_stmt);
3931 : }
3932 : }
3933 :
3934 10996 : for (i = 0; i < nargs; i++)
3935 : {
3936 6847 : vec<tree> vec_oprndsi = vec_defs[i];
3937 6847 : vec_oprndsi.release ();
3938 : }
3939 4149 : }
3940 32 : else if (modifier == NARROW)
3941 : {
3942 32 : auto_vec<vec<tree> > vec_defs (nargs);
3943 : /* We don't define any narrowing conditional functions at present. */
3944 32 : gcc_assert (mask_opno < 0);
3945 :
3946 : /* Build argument list for the vectorized call. */
3947 32 : vargs.create (nargs * 2);
3948 :
3949 32 : vect_get_slp_defs (vinfo, slp_node, &vec_defs);
3950 32 : vec<tree> vec_oprnds0 = vec_defs[0];
3951 :
3952 : /* Arguments are ready. Create the new vector stmt. */
3953 64 : for (i = 0; vec_oprnds0.iterate (i, &vec_oprnd0); i += 2) /* Each call consumes two input vectors per argument. */
3954 : {
3955 32 : size_t k;
3956 32 : vargs.truncate (0);
3957 64 : for (k = 0; k < nargs; k++)
3958 : {
3959 32 : vec<tree> vec_oprndsk = vec_defs[k];
3960 32 : vargs.quick_push (vec_oprndsk[i]);
3961 32 : vargs.quick_push (vec_oprndsk[i + 1]);
3962 : }
3963 32 : gcall *call;
3964 32 : if (ifn != IFN_LAST)
3965 : call = gimple_build_call_internal_vec (ifn, vargs);
3966 : else
3967 32 : call = gimple_build_call_vec (fndecl, vargs);
3968 32 : new_temp = make_ssa_name (vec_dest, call);
3969 32 : gimple_call_set_lhs (call, new_temp);
3970 32 : gimple_call_set_nothrow (call, true);
3971 32 : vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
3972 32 : slp_node->push_vec_def (call);
3973 : }
3974 :
3975 64 : for (i = 0; i < nargs; i++)
3976 : {
3977 32 : vec<tree> vec_oprndsi = vec_defs[i];
3978 32 : vec_oprndsi.release ();
3979 : }
3980 32 : }
3981 : else
3982 : /* No current target implements this case. */
3983 : return false;
3984 :
3985 4181 : vargs.release ();
3986 :
3987 4181 : return true;
3988 2667715 : }
3989 :
3990 :
3991 : struct simd_call_arg_info /* Per-argument record used by vectorizable_simd_clone_call (one entry per call argument). */
3992 : {
3993 : tree vectype; /* Presumably the vector type of the argument; set outside this view -- TODO confirm. */
3994 : tree op; /* The operand; vect_simd_lane_linear replaces it with the invariant base of a simd-lane-linear pointer. */
3995 : HOST_WIDE_INT linear_step; /* Step of a linear argument; vect_simd_lane_linear stores the per-lane step here. */
3996 : enum vect_def_type dt; /* Definition kind of the argument; set outside this view. */
3997 : unsigned int align; /* Presumably a known alignment of the argument; set outside this view -- TODO confirm. */
3998 : bool simd_lane_linear; /* Set by vect_simd_lane_linear when OP is linear within a simd lane only. */
3999 : };
4000 :
4001 : /* Helper function of vectorizable_simd_clone_call. If OP, an SSA_NAME,
4002 : is linear within simd lane (but not within whole loop), note it in
4003 : *ARGINFO. */
4004 :
4005 : static void
4006 15 : vect_simd_lane_linear (tree op, class loop *loop,
4007 : struct simd_call_arg_info *arginfo)
4008 : {
4009 15 : gimple *def_stmt = SSA_NAME_DEF_STMT (op);
4010 :
4011 15 : if (!is_gimple_assign (def_stmt)
4012 15 : || gimple_assign_rhs_code (def_stmt) != POINTER_PLUS_EXPR
4013 27 : || !is_gimple_min_invariant (gimple_assign_rhs1 (def_stmt)))
4014 3 : return;
4015 :
4016 12 : tree base = gimple_assign_rhs1 (def_stmt);
4017 12 : HOST_WIDE_INT linear_step = 0;
4018 12 : tree v = gimple_assign_rhs2 (def_stmt);
4019 48 : while (TREE_CODE (v) == SSA_NAME)
4020 : {
4021 36 : tree t;
4022 36 : def_stmt = SSA_NAME_DEF_STMT (v);
4023 36 : if (is_gimple_assign (def_stmt))
4024 24 : switch (gimple_assign_rhs_code (def_stmt))
4025 : {
4026 0 : case PLUS_EXPR:
4027 0 : t = gimple_assign_rhs2 (def_stmt);
4028 0 : if (linear_step || TREE_CODE (t) != INTEGER_CST)
4029 : return;
4030 0 : base = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (base), base, t);
4031 0 : v = gimple_assign_rhs1 (def_stmt);
4032 0 : continue;
4033 12 : case MULT_EXPR:
4034 12 : t = gimple_assign_rhs2 (def_stmt);
4035 12 : if (linear_step || !tree_fits_shwi_p (t) || integer_zerop (t))
4036 0 : return;
4037 12 : linear_step = tree_to_shwi (t);
4038 12 : v = gimple_assign_rhs1 (def_stmt);
4039 12 : continue;
4040 12 : CASE_CONVERT:
4041 12 : t = gimple_assign_rhs1 (def_stmt);
4042 12 : if (TREE_CODE (TREE_TYPE (t)) != INTEGER_TYPE
4043 12 : || (TYPE_PRECISION (TREE_TYPE (v))
4044 12 : < TYPE_PRECISION (TREE_TYPE (t))))
4045 : return;
4046 12 : if (!linear_step)
4047 0 : linear_step = 1;
4048 12 : v = t;
4049 12 : continue;
4050 : default:
4051 : return;
4052 : }
4053 12 : else if (gimple_call_internal_p (def_stmt, IFN_GOMP_SIMD_LANE)
4054 12 : && loop->simduid
4055 12 : && TREE_CODE (gimple_call_arg (def_stmt, 0)) == SSA_NAME
4056 24 : && (SSA_NAME_VAR (gimple_call_arg (def_stmt, 0))
4057 : == loop->simduid))
4058 : {
4059 12 : if (!linear_step)
4060 0 : linear_step = 1;
4061 12 : arginfo->linear_step = linear_step;
4062 12 : arginfo->op = base;
4063 12 : arginfo->simd_lane_linear = true;
4064 12 : return;
4065 : }
4066 : }
4067 : }
4068 :
4069 : /* Function vectorizable_simd_clone_call.
4070 :
4071 : Check if STMT_INFO performs a function call that can be vectorized
4072 : by calling a simd clone of the function.
4073 : If COST_VEC is passed, calculate costs but don't change anything,
4074 : otherwise, vectorize STMT_INFO: create a vectorized stmt to replace
4075 : it, and insert it at GSI.
4076 : Return true if STMT_INFO is vectorizable in this way. */
4077 :
4078 : static bool
4079 2657000 : vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
4080 : gimple_stmt_iterator *gsi,
4081 : slp_tree slp_node,
4082 : stmt_vector_for_cost *cost_vec)
4083 : {
4084 2657000 : tree vec_dest;
4085 2657000 : tree scalar_dest;
4086 2657000 : tree vec_oprnd0 = NULL_TREE;
4087 2657000 : tree vectype;
4088 2657000 : poly_uint64 nunits;
4089 2657000 : loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4090 2657000 : bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
4091 2657000 : class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
4092 2657000 : tree fndecl, new_temp;
4093 2657000 : int j;
4094 2657000 : auto_vec<simd_call_arg_info> arginfo;
4095 2657000 : vec<tree> vargs = vNULL;
4096 2657000 : size_t i, nargs;
4097 2657000 : tree rtype, ratype;
4098 2657000 : vec<constructor_elt, va_gc> *ret_ctor_elts = NULL;
4099 2657000 : int masked_call_offset = 0;
4100 :
4101 : /* Is STMT a vectorizable call? */
4102 2657000 : gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
4103 15354 : if (!stmt)
4104 : return false;
4105 :
4106 15354 : fndecl = gimple_call_fndecl (stmt);
4107 15354 : if (fndecl == NULL_TREE
4108 15354 : && gimple_call_internal_p (stmt, IFN_MASK_CALL))
4109 : {
4110 220 : fndecl = gimple_call_arg (stmt, 0);
4111 220 : gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
4112 220 : fndecl = TREE_OPERAND (fndecl, 0);
4113 220 : gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
4114 : masked_call_offset = 1;
4115 : }
4116 15134 : if (fndecl == NULL_TREE)
4117 : return false;
4118 :
4119 4921 : struct cgraph_node *node = cgraph_node::get (fndecl);
4120 4921 : if (node == NULL || node->simd_clones == NULL)
4121 : return false;
4122 :
4123 1476 : if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
4124 : return false;
4125 :
4126 1476 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
4127 0 : && cost_vec)
4128 : return false;
4129 :
4130 1476 : if (gimple_call_lhs (stmt)
4131 1476 : && TREE_CODE (gimple_call_lhs (stmt)) != SSA_NAME)
4132 : return false;
4133 :
4134 1476 : gcc_checking_assert (!stmt_can_throw_internal (cfun, stmt));
4135 :
4136 1476 : vectype = SLP_TREE_VECTYPE (slp_node);
4137 :
4138 2657064 : if (loop_vinfo && nested_in_vect_loop_p (loop, stmt_info))
4139 : return false;
4140 :
4141 : /* Process function arguments. */
4142 1476 : nargs = gimple_call_num_args (stmt) - masked_call_offset;
4143 :
4144 : /* Bail out if the function has zero arguments. */
4145 1476 : if (nargs == 0)
4146 : return false;
4147 :
4148 1412 : vect_simd_clone_data _data;
4149 1412 : vect_simd_clone_data &data = slp_node->get_data (_data);
4150 1412 : vec<tree>& simd_clone_info = data.simd_clone_info;
4151 1412 : arginfo.reserve (nargs, true);
4152 1412 : auto_vec<slp_tree> slp_op;
4153 1412 : slp_op.safe_grow_cleared (nargs);
4154 :
4155 4053 : for (i = 0; i < nargs; i++)
4156 : {
4157 2641 : simd_call_arg_info thisarginfo;
4158 2641 : affine_iv iv;
4159 2641 : tree op;
4160 :
4161 2641 : thisarginfo.linear_step = 0;
4162 2641 : thisarginfo.align = 0;
4163 2641 : thisarginfo.op = NULL_TREE;
4164 2641 : thisarginfo.simd_lane_linear = false;
4165 :
4166 5282 : int op_no = vect_slp_child_index_for_operand (stmt_info,
4167 2641 : i + masked_call_offset);
4168 5282 : if (!vect_is_simple_use (vinfo, slp_node,
4169 2641 : op_no, &op, &slp_op[i],
4170 : &thisarginfo.dt, &thisarginfo.vectype)
4171 2641 : || thisarginfo.dt == vect_uninitialized_def)
4172 : {
4173 0 : if (dump_enabled_p ())
4174 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4175 : "use not simple.\n");
4176 0 : return false;
4177 : }
4178 :
4179 2641 : if (thisarginfo.dt == vect_constant_def
4180 2641 : || thisarginfo.dt == vect_external_def)
4181 : {
4182 : /* With SLP we determine the vector type of constants/externals
4183 : at analysis time, handling conflicts via
4184 : vect_maybe_update_slp_op_vectype. At transform time
4185 : we have a vector type recorded for SLP. */
4186 680 : gcc_assert (cost_vec
4187 : || thisarginfo.vectype != NULL_TREE);
4188 : if (cost_vec)
4189 549 : thisarginfo.vectype = get_vectype_for_scalar_type (vinfo,
4190 549 : TREE_TYPE (op),
4191 : slp_node);
4192 : }
4193 : else
4194 1961 : gcc_assert (thisarginfo.vectype != NULL_TREE);
4195 :
4196 : /* For linear arguments, the analyze phase should have saved
4197 : the base and step. */
4198 2510 : if (!cost_vec
4199 1594 : && i * 3 + 4 <= simd_clone_info.length ()
4200 2720 : && simd_clone_info[i * 3 + 2])
4201 : {
4202 118 : thisarginfo.linear_step = tree_to_shwi (simd_clone_info[i * 3 + 2]);
4203 118 : thisarginfo.op = simd_clone_info[i * 3 + 1];
4204 118 : thisarginfo.simd_lane_linear
4205 118 : = (simd_clone_info[i * 3 + 3] == boolean_true_node);
4206 : /* If loop has been peeled for alignment, we need to adjust it. */
4207 118 : tree n1 = LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo);
4208 118 : tree n2 = LOOP_VINFO_NITERS (loop_vinfo);
4209 118 : if (n1 != n2 && !thisarginfo.simd_lane_linear)
4210 : {
4211 0 : tree bias = fold_build2 (MINUS_EXPR, TREE_TYPE (n1), n1, n2);
4212 0 : tree step = simd_clone_info[i * 3 + 2];
4213 0 : tree opt = TREE_TYPE (thisarginfo.op);
4214 0 : bias = fold_convert (TREE_TYPE (step), bias);
4215 0 : bias = fold_build2 (MULT_EXPR, TREE_TYPE (step), bias, step);
4216 0 : thisarginfo.op
4217 0 : = fold_build2 (POINTER_TYPE_P (opt)
4218 : ? POINTER_PLUS_EXPR : PLUS_EXPR, opt,
4219 : thisarginfo.op, bias);
4220 : }
4221 : }
4222 2523 : else if (cost_vec
4223 1844 : && thisarginfo.dt != vect_constant_def
4224 1717 : && thisarginfo.dt != vect_external_def
4225 1295 : && loop_vinfo
4226 1290 : && SLP_TREE_LANES (slp_node) == 1
4227 1266 : && TREE_CODE (op) == SSA_NAME
4228 2532 : && simple_iv (loop, loop_containing_stmt (stmt), op,
4229 : &iv, false)
4230 2735 : && tree_fits_shwi_p (iv.step))
4231 : {
4232 212 : thisarginfo.linear_step = tree_to_shwi (iv.step);
4233 212 : thisarginfo.op = iv.base;
4234 : }
4235 2311 : else if ((thisarginfo.dt == vect_constant_def
4236 2311 : || thisarginfo.dt == vect_external_def)
4237 680 : && SLP_TREE_LANES (slp_node) == 1
4238 2617 : && POINTER_TYPE_P (TREE_TYPE (op)))
4239 86 : thisarginfo.align = get_pointer_alignment (op) / BITS_PER_UNIT;
4240 : /* Addresses of array elements indexed by GOMP_SIMD_LANE are
4241 : linear too. */
4242 2641 : if (SLP_TREE_LANES (slp_node) == 1
4243 2221 : && POINTER_TYPE_P (TREE_TYPE (op))
4244 196 : && !thisarginfo.linear_step
4245 112 : && cost_vec
4246 58 : && thisarginfo.dt != vect_constant_def
4247 58 : && thisarginfo.dt != vect_external_def
4248 15 : && loop_vinfo
4249 2656 : && TREE_CODE (op) == SSA_NAME)
4250 15 : vect_simd_lane_linear (op, loop, &thisarginfo);
4251 :
4252 2641 : if (!vectype)
4253 12 : vectype = thisarginfo.vectype;
4254 2641 : arginfo.quick_push (thisarginfo);
4255 : }
4256 :
4257 1412 : poly_uint64 vf = loop_vinfo ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
4258 1412 : unsigned group_size = SLP_TREE_LANES (slp_node);
4259 1412 : unsigned int badness = 0;
4260 1412 : unsigned int badness_inbranch = 0;
4261 1412 : struct cgraph_node *bestn = NULL;
4262 1412 : struct cgraph_node *bestn_inbranch = NULL;
4263 1412 : if (!cost_vec)
4264 362 : bestn = ((loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
4265 362 : ? data.clone_inbranch : data.clone);
4266 : else
4267 6076 : for (struct cgraph_node *n = node->simd_clones; n != NULL;
4268 5026 : n = n->simdclone->next_clone)
4269 : {
4270 5026 : unsigned int this_badness = 0;
4271 5026 : unsigned int num_calls;
4272 : /* The number of arguments in the call and the number of parameters in
4273 : the simdclone should match. However, when the simdclone is
4274 : 'inbranch', it could have one more parameter than nargs when using
4275 : an inbranch simdclone to call a non-inbranch call, either in a
4276 : non-masked loop using an all-true constant mask, or inside a masked
4277 : loop using its mask. */
4278 5026 : size_t simd_nargs = n->simdclone->nargs;
4279 5026 : if (!masked_call_offset && n->simdclone->inbranch)
4280 2271 : simd_nargs--;
4281 5026 : if (!constant_multiple_p (vf * group_size, n->simdclone->simdlen,
4282 : &num_calls)
4283 1974 : || (!n->simdclone->inbranch && (masked_call_offset > 0))
4284 1790 : || (nargs != simd_nargs))
4285 3236 : continue;
4286 1790 : if (num_calls != 1)
4287 1142 : this_badness += floor_log2 (num_calls) * 4096;
4288 1790 : if (n->simdclone->inbranch)
4289 771 : this_badness += 8192;
4290 :
4291 : /* If SLP_TREE_VECTYPE has not been set yet pass the general vector
4292 : mode, which for targets that use it will determine what ISA we can
4293 : vectorize this code with. */
4294 1790 : machine_mode vector_mode = vinfo->vector_mode;
4295 1790 : if (vectype)
4296 1790 : vector_mode = TYPE_MODE (vectype);
4297 1790 : int target_badness = targetm.simd_clone.usable (n, vector_mode);
4298 1790 : if (target_badness < 0)
4299 368 : continue;
4300 1422 : this_badness += target_badness * 512;
4301 4192 : for (i = 0; i < nargs; i++)
4302 : {
4303 3018 : switch (n->simdclone->args[i].arg_type)
4304 : {
4305 2088 : case SIMD_CLONE_ARG_TYPE_VECTOR:
4306 2088 : if (VECTOR_BOOLEAN_TYPE_P (n->simdclone->args[i].vector_type))
4307 : /* Vector mask arguments are not supported. */
4308 : i = -1;
4309 2080 : else if (!useless_type_conversion_p
4310 2080 : (n->simdclone->args[i].orig_type,
4311 2080 : TREE_TYPE (gimple_call_arg (stmt,
4312 : i + masked_call_offset))))
4313 : i = -1;
4314 2080 : else if (arginfo[i].dt == vect_constant_def
4315 1973 : || arginfo[i].dt == vect_external_def
4316 3989 : || arginfo[i].linear_step)
4317 399 : this_badness += 64;
4318 : break;
4319 310 : case SIMD_CLONE_ARG_TYPE_UNIFORM:
4320 310 : if ((arginfo[i].dt != vect_constant_def
4321 145 : && arginfo[i].dt != vect_external_def)
4322 410 : || SLP_TREE_LANES (slp_node) != 1)
4323 : i = -1;
4324 : break;
4325 324 : case SIMD_CLONE_ARG_TYPE_LINEAR_CONSTANT_STEP:
4326 324 : case SIMD_CLONE_ARG_TYPE_LINEAR_REF_CONSTANT_STEP:
4327 324 : if (arginfo[i].dt == vect_constant_def
4328 324 : || arginfo[i].dt == vect_external_def
4329 324 : || (arginfo[i].linear_step
4330 324 : != n->simdclone->args[i].linear_step))
4331 : i = -1;
4332 : break;
4333 : case SIMD_CLONE_ARG_TYPE_LINEAR_VARIABLE_STEP:
4334 : case SIMD_CLONE_ARG_TYPE_LINEAR_VAL_CONSTANT_STEP:
4335 : case SIMD_CLONE_ARG_TYPE_LINEAR_UVAL_CONSTANT_STEP:
4336 : case SIMD_CLONE_ARG_TYPE_LINEAR_REF_VARIABLE_STEP:
4337 : case SIMD_CLONE_ARG_TYPE_LINEAR_VAL_VARIABLE_STEP:
4338 : case SIMD_CLONE_ARG_TYPE_LINEAR_UVAL_VARIABLE_STEP:
4339 : /* FORNOW */
4340 : i = -1;
4341 : break;
4342 296 : case SIMD_CLONE_ARG_TYPE_MASK:
4343 296 : if (!SCALAR_INT_MODE_P (n->simdclone->mask_mode)
4344 264 : && n->simdclone->mask_mode != VOIDmode)
4345 : i = -1;
4346 : /* While we can create a traditional data vector from
4347 : an incoming integer mode mask we have no good way to
4348 : force generate an integer mode mask from a traditional
4349 : boolean vector input. */
4350 296 : else if (SCALAR_INT_MODE_P (n->simdclone->mask_mode)
4351 296 : && !SCALAR_INT_MODE_P (TYPE_MODE (arginfo[i].vectype)))
4352 : i = -1;
4353 290 : else if (n->simdclone->mask_mode == VOIDmode
4354 : /* FORNOW we only have partial support for vector-type
4355 : masks that can't hold all of simdlen. */
4356 554 : && (maybe_ne (TYPE_VECTOR_SUBPARTS (n->simdclone->args[i].vector_type),
4357 264 : TYPE_VECTOR_SUBPARTS (arginfo[i].vectype))
4358 : /* Verify we can compute the mask argument. */
4359 111 : || !expand_vec_cond_expr_p (n->simdclone->args[i].vector_type,
4360 111 : arginfo[i].vectype)))
4361 : i = -1;
4362 125 : else if (SCALAR_INT_MODE_P (n->simdclone->mask_mode)
4363 : /* FORNOW we only have partial support for
4364 : integer-type masks that represent the same number
4365 : of lanes as the vectorized mask inputs. */
4366 151 : && maybe_ne (exact_div (n->simdclone->simdlen,
4367 : n->simdclone->args[i].linear_step),
4368 26 : TYPE_VECTOR_SUBPARTS (arginfo[i].vectype)))
4369 : i = -1;
4370 107 : else if (!SCALAR_INT_MODE_P (n->simdclone->mask_mode)
4371 107 : && SCALAR_INT_MODE_P (TYPE_MODE (arginfo[i].vectype)))
4372 8 : this_badness += 2048;
4373 : break;
4374 : }
4375 183 : if (i == (size_t) -1)
4376 : break;
4377 2770 : if (n->simdclone->args[i].alignment > arginfo[i].align)
4378 : {
4379 : i = -1;
4380 : break;
4381 : }
4382 2770 : if (arginfo[i].align)
4383 110 : this_badness += (exact_log2 (arginfo[i].align)
4384 160 : - exact_log2 (n->simdclone->args[i].alignment));
4385 : }
4386 1422 : if (i == (size_t) -1)
4387 248 : continue;
4388 1174 : if (masked_call_offset == 0
4389 1067 : && n->simdclone->inbranch
4390 347 : && n->simdclone->nargs > nargs)
4391 : {
4392 347 : gcc_assert (n->simdclone->args[n->simdclone->nargs - 1].arg_type ==
4393 : SIMD_CLONE_ARG_TYPE_MASK);
4394 : /* Penalize using a masked SIMD clone in a non-masked loop, that is
4395 : not in a branch, as we'd have to construct an all-true mask. */
4396 347 : this_badness += 64;
4397 : }
4398 1174 : if (bestn == NULL || this_badness < badness)
4399 : {
4400 817 : bestn = n;
4401 817 : badness = this_badness;
4402 : }
4403 1174 : if (n->simdclone->inbranch
4404 454 : && (bestn_inbranch == NULL || this_badness < badness_inbranch))
4405 : {
4406 5026 : bestn_inbranch = n;
4407 5026 : badness_inbranch = this_badness;
4408 : }
4409 : }
4410 :
4411 1412 : if (bestn == NULL)
4412 : return false;
4413 :
4414 829 : fndecl = bestn->decl;
4415 829 : nunits = bestn->simdclone->simdlen;
4416 829 : int ncopies = vector_unroll_factor (vf * group_size, nunits);
4417 :
4418 : /* If the function isn't const, only allow it in simd loops where user
4419 : has asserted that at least nunits consecutive iterations can be
4420 : performed using SIMD instructions. */
4421 824 : if ((loop == NULL || maybe_lt ((unsigned) loop->safelen, nunits))
4422 1006 : && gimple_vuse (stmt))
4423 : return false;
4424 :
4425 : /* ncopies is the number of SIMD clone calls we create, since simdlen
4426 : is not necessarily matching nunits of the vector types used, track
4427 : that in ncopies_in. */
4428 829 : int ncopies_in = vect_get_num_vectors (vf * group_size, vectype);
4429 :
4430 : /* Sanity check: make sure that at least one copy of the vectorized stmt
4431 : needs to be generated. */
4432 829 : gcc_assert (ncopies >= 1);
4433 :
4434 829 : if (cost_vec) /* transformation not required. */
4435 : {
4436 1514 : for (unsigned i = 0; i < nargs; ++i)
4437 1047 : if (!vect_maybe_update_slp_op_vectype (slp_op[i], arginfo[i].vectype))
4438 : {
4439 0 : if (dump_enabled_p ())
4440 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4441 : "incompatible vector types for invariants\n");
4442 0 : return false;
4443 : }
4444 :
4445 467 : if (!bestn_inbranch && loop_vinfo)
4446 : {
4447 248 : if (dump_enabled_p ()
4448 248 : && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
4449 171 : dump_printf_loc (MSG_NOTE, vect_location,
4450 : "can't use a fully-masked loop because no"
4451 : " masked simd clone was available.\n");
4452 248 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
4453 : }
4454 :
4455 : /* When the original call is pure or const but the SIMD ABI dictates
4456 : an aggregate return we will have to use a virtual definition and
4457 : in a loop eventually even need to add a virtual PHI. That's
4458 : not straight-forward so allow to fix this up via renaming. */
4459 467 : if (gimple_call_lhs (stmt)
4460 461 : && !gimple_vdef (stmt)
4461 832 : && TREE_CODE (TREE_TYPE (TREE_TYPE (bestn->decl))) == ARRAY_TYPE)
4462 27 : vinfo->any_known_not_updated_vssa = true;
4463 : /* ??? For SLP code-gen we end up inserting after the last
4464 : vector argument def rather than at the original call position
4465 : so automagic virtual operand updating doesn't work. */
4466 934 : if (gimple_vuse (stmt))
4467 139 : vinfo->any_known_not_updated_vssa = true;
4468 :
4469 467 : data.clone = bestn;
4470 467 : data.clone_inbranch = bestn_inbranch;
4471 :
4472 467 : simd_clone_info.safe_push (NULL_TREE);
4473 1663 : for (i = 0;
4474 2502 : i < (bestn_inbranch ? bestn_inbranch : bestn)->simdclone->nargs; i++)
4475 : {
4476 1196 : if (loop_vinfo
4477 1190 : && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
4478 482 : && (bestn_inbranch->simdclone->args[i].arg_type
4479 : == SIMD_CLONE_ARG_TYPE_MASK))
4480 : {
4481 174 : if (masked_call_offset)
4482 : /* When there is an explicit mask we require the
4483 : number of elements to match up. */
4484 49 : vect_record_loop_mask (loop_vinfo,
4485 : &LOOP_VINFO_MASKS (loop_vinfo),
4486 : ncopies_in, vectype, NULL_TREE);
4487 : else
4488 : {
4489 : /* When there is no explicit mask on the call we have
4490 : more relaxed requirements. */
4491 125 : tree masktype;
4492 125 : poly_uint64 callee_nelements;
4493 125 : if (SCALAR_INT_MODE_P (bestn_inbranch->simdclone->mask_mode))
4494 : {
4495 12 : callee_nelements
4496 12 : = exact_div (bestn_inbranch->simdclone->simdlen,
4497 : bestn_inbranch->simdclone->args[i].linear_step);
4498 12 : masktype = get_related_vectype_for_scalar_type
4499 12 : (vinfo->vector_mode, TREE_TYPE (vectype),
4500 : callee_nelements);
4501 : }
4502 : else
4503 : {
4504 113 : masktype = bestn_inbranch->simdclone->args[i].vector_type;
4505 : /* The aarch64 port will add custom attributes to types
4506 : for SVE simdclones which make the types different. We
4507 : should use canonical types for masks within the
4508 : vectorizer, hence we construct the related vectype
4509 : here. */
4510 113 : masktype
4511 : = build_truth_vector_type_for_mode
4512 113 : (TYPE_VECTOR_SUBPARTS (masktype),
4513 113 : TYPE_MODE (masktype));
4514 113 : callee_nelements = TYPE_VECTOR_SUBPARTS (masktype);
4515 : }
4516 125 : auto o = vector_unroll_factor (nunits, callee_nelements);
4517 125 : vect_record_loop_mask (loop_vinfo,
4518 : &LOOP_VINFO_MASKS (loop_vinfo),
4519 : ncopies * o, masktype, NULL_TREE);
4520 : }
4521 : }
4522 1022 : else if ((bestn->simdclone->args[i].arg_type
4523 : == SIMD_CLONE_ARG_TYPE_LINEAR_CONSTANT_STEP)
4524 915 : || (bestn->simdclone->args[i].arg_type
4525 : == SIMD_CLONE_ARG_TYPE_LINEAR_REF_CONSTANT_STEP)
4526 904 : || (bestn_inbranch
4527 364 : && ((bestn_inbranch->simdclone->args[i].arg_type
4528 : == SIMD_CLONE_ARG_TYPE_LINEAR_CONSTANT_STEP)
4529 364 : || (bestn_inbranch->simdclone->args[i].arg_type
4530 : == SIMD_CLONE_ARG_TYPE_LINEAR_REF_CONSTANT_STEP))))
4531 : {
4532 118 : simd_clone_info.safe_grow_cleared (i * 3 + 1, true);
4533 118 : simd_clone_info.safe_push (arginfo[i].op);
4534 202 : tree lst = (POINTER_TYPE_P (TREE_TYPE (arginfo[i].op))
4535 202 : ? size_type_node : TREE_TYPE (arginfo[i].op));
4536 118 : tree ls = build_int_cst (lst, arginfo[i].linear_step);
4537 118 : simd_clone_info.safe_push (ls);
4538 118 : tree sll = (arginfo[i].simd_lane_linear
4539 118 : ? boolean_true_node : boolean_false_node);
4540 118 : simd_clone_info.safe_push (sll);
4541 : }
4542 : }
4543 :
4544 467 : SLP_TREE_TYPE (slp_node) = call_simd_clone_vec_info_type;
4545 467 : slp_node->data = new vect_simd_clone_data (std::move (_data));
4546 467 : DUMP_VECT_SCOPE ("vectorizable_simd_clone_call");
4547 : /* ??? We're confused by calls w/o LHS. */
4548 467 : if (SLP_TREE_VECTYPE (slp_node))
4549 461 : vect_model_simple_cost (vinfo, ncopies, slp_node, cost_vec);
4550 467 : return true;
4551 : }
4552 :
4553 : /* Transform. */
4554 :
4555 362 : if (dump_enabled_p ())
4556 246 : dump_printf_loc (MSG_NOTE, vect_location, "transform call.\n");
4557 :
4558 : /* Handle def. */
4559 362 : scalar_dest = gimple_call_lhs (stmt);
4560 362 : vec_dest = NULL_TREE;
4561 362 : rtype = NULL_TREE;
4562 362 : ratype = NULL_TREE;
4563 362 : if (scalar_dest)
4564 : {
4565 356 : vec_dest = vect_create_destination_var (scalar_dest, vectype);
4566 356 : rtype = TREE_TYPE (TREE_TYPE (fndecl));
4567 356 : if (TREE_CODE (rtype) == ARRAY_TYPE)
4568 : {
4569 9 : ratype = rtype;
4570 9 : rtype = TREE_TYPE (ratype);
4571 : }
4572 : }
4573 :
4574 724 : auto_vec<vec<tree> > vec_oprnds;
4575 362 : auto_vec<unsigned> vec_oprnds_i;
4576 362 : vec_oprnds_i.safe_grow_cleared (nargs, true);
4577 362 : vec_oprnds.reserve_exact (nargs);
4578 362 : vect_get_slp_defs (vinfo, slp_node, &vec_oprnds);
4579 833 : for (j = 0; j < ncopies; ++j)
4580 : {
4581 471 : poly_uint64 callee_nelements;
4582 471 : poly_uint64 caller_nelements;
4583 : /* Build argument list for the vectorized call. */
4584 471 : if (j == 0)
4585 362 : vargs.create (nargs);
4586 : else
4587 109 : vargs.truncate (0);
4588 :
4589 1580 : for (i = 0; i < nargs; i++)
4590 : {
4591 1109 : unsigned int k, l, m, o;
4592 1109 : tree atype;
4593 1109 : tree op = gimple_call_arg (stmt, i + masked_call_offset);
4594 1109 : switch (bestn->simdclone->args[i].arg_type)
4595 : {
4596 820 : case SIMD_CLONE_ARG_TYPE_VECTOR:
4597 820 : atype = bestn->simdclone->args[i].vector_type;
4598 820 : caller_nelements = TYPE_VECTOR_SUBPARTS (arginfo[i].vectype);
4599 820 : callee_nelements = TYPE_VECTOR_SUBPARTS (atype);
4600 820 : o = vector_unroll_factor (nunits, callee_nelements);
4601 1870 : for (m = j * o; m < (j + 1) * o; m++)
4602 : {
4603 1050 : if (known_lt (callee_nelements, caller_nelements))
4604 : {
4605 516 : poly_uint64 prec = GET_MODE_BITSIZE (TYPE_MODE (atype));
4606 258 : if (!constant_multiple_p (caller_nelements,
4607 : callee_nelements, &k))
4608 0 : gcc_unreachable ();
4609 :
4610 258 : gcc_assert ((k & (k - 1)) == 0);
4611 258 : if (m == 0)
4612 : {
4613 57 : vec_oprnds_i[i] = 0;
4614 57 : vec_oprnd0 = vec_oprnds[i][vec_oprnds_i[i]++];
4615 : }
4616 : else
4617 : {
4618 201 : vec_oprnd0 = arginfo[i].op;
4619 201 : if ((m & (k - 1)) == 0)
4620 72 : vec_oprnd0 = vec_oprnds[i][vec_oprnds_i[i]++];
4621 : }
4622 258 : arginfo[i].op = vec_oprnd0;
4623 258 : vec_oprnd0
4624 258 : = build3 (BIT_FIELD_REF, atype, vec_oprnd0,
4625 258 : bitsize_int (prec),
4626 258 : bitsize_int ((m & (k - 1)) * prec));
4627 258 : gassign *new_stmt
4628 258 : = gimple_build_assign (make_ssa_name (atype),
4629 : vec_oprnd0);
4630 258 : vect_finish_stmt_generation (vinfo, stmt_info,
4631 : new_stmt, gsi);
4632 258 : vargs.safe_push (gimple_assign_lhs (new_stmt));
4633 : }
4634 : else
4635 : {
4636 792 : if (!constant_multiple_p (callee_nelements,
4637 : caller_nelements, &k))
4638 0 : gcc_unreachable ();
4639 792 : gcc_assert ((k & (k - 1)) == 0);
4640 792 : vec<constructor_elt, va_gc> *ctor_elts;
4641 792 : if (k != 1)
4642 14 : vec_alloc (ctor_elts, k);
4643 : else
4644 778 : ctor_elts = NULL;
4645 820 : for (l = 0; l < k; l++)
4646 : {
4647 806 : if (m == 0 && l == 0)
4648 : {
4649 454 : vec_oprnds_i[i] = 0;
4650 454 : vec_oprnd0 = vec_oprnds[i][vec_oprnds_i[i]++];
4651 : }
4652 : else
4653 352 : vec_oprnd0 = vec_oprnds[i][vec_oprnds_i[i]++];
4654 806 : arginfo[i].op = vec_oprnd0;
4655 806 : if (k == 1)
4656 : break;
4657 28 : CONSTRUCTOR_APPEND_ELT (ctor_elts, NULL_TREE,
4658 : vec_oprnd0);
4659 : }
4660 792 : if (k == 1)
4661 778 : if (!useless_type_conversion_p (TREE_TYPE (vec_oprnd0),
4662 : atype))
4663 : {
4664 0 : vec_oprnd0 = build1 (VIEW_CONVERT_EXPR, atype,
4665 : vec_oprnd0);
4666 0 : gassign *new_stmt
4667 0 : = gimple_build_assign (make_ssa_name (atype),
4668 : vec_oprnd0);
4669 0 : vect_finish_stmt_generation (vinfo, stmt_info,
4670 : new_stmt, gsi);
4671 0 : vargs.safe_push (gimple_get_lhs (new_stmt));
4672 : }
4673 : else
4674 778 : vargs.safe_push (vec_oprnd0);
4675 : else
4676 : {
4677 14 : vec_oprnd0 = build_constructor (atype, ctor_elts);
4678 14 : gassign *new_stmt
4679 14 : = gimple_build_assign (make_ssa_name (atype),
4680 : vec_oprnd0);
4681 14 : vect_finish_stmt_generation (vinfo, stmt_info,
4682 : new_stmt, gsi);
4683 14 : vargs.safe_push (gimple_assign_lhs (new_stmt));
4684 : }
4685 : }
4686 : }
4687 : break;
4688 66 : case SIMD_CLONE_ARG_TYPE_MASK:
4689 66 : if (bestn->simdclone->mask_mode == VOIDmode)
4690 : {
4691 60 : atype = bestn->simdclone->args[i].vector_type;
4692 60 : tree elt_type = TREE_TYPE (atype);
4693 60 : tree one = fold_convert (elt_type, integer_one_node);
4694 60 : tree zero = fold_convert (elt_type, integer_zero_node);
4695 60 : callee_nelements = TYPE_VECTOR_SUBPARTS (atype);
4696 60 : caller_nelements = TYPE_VECTOR_SUBPARTS (arginfo[i].vectype);
4697 60 : o = vector_unroll_factor (nunits, callee_nelements);
4698 120 : for (m = j * o; m < (j + 1) * o; m++)
4699 : {
4700 60 : if (maybe_lt (callee_nelements, caller_nelements))
4701 : {
4702 : /* The mask type has fewer elements than simdlen. */
4703 :
4704 : /* FORNOW */
4705 0 : gcc_unreachable ();
4706 : }
4707 60 : else if (known_eq (callee_nelements, caller_nelements))
4708 : {
4709 : /* The SIMD clone function has the same number of
4710 : elements as the current function. */
4711 60 : if (m == 0)
4712 60 : vec_oprnds_i[i] = 0;
4713 60 : vec_oprnd0 = vec_oprnds[i][vec_oprnds_i[i]++];
4714 60 : if (loop_vinfo
4715 60 : && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
4716 : {
4717 0 : vec_loop_masks *loop_masks
4718 : = &LOOP_VINFO_MASKS (loop_vinfo);
4719 0 : tree loop_mask
4720 0 : = vect_get_loop_mask (loop_vinfo, gsi,
4721 : loop_masks, ncopies_in,
4722 0 : vectype, j);
4723 0 : vec_oprnd0
4724 0 : = prepare_vec_mask (loop_vinfo,
4725 0 : TREE_TYPE (loop_mask),
4726 : loop_mask, vec_oprnd0,
4727 : gsi);
4728 0 : loop_vinfo->vec_cond_masked_set.add ({ vec_oprnd0,
4729 : loop_mask });
4730 :
4731 : }
4732 60 : vec_oprnd0
4733 60 : = build3 (VEC_COND_EXPR, atype, vec_oprnd0,
4734 : build_vector_from_val (atype, one),
4735 : build_vector_from_val (atype, zero));
4736 60 : gassign *new_stmt
4737 60 : = gimple_build_assign (make_ssa_name (atype),
4738 : vec_oprnd0);
4739 60 : vect_finish_stmt_generation (vinfo, stmt_info,
4740 : new_stmt, gsi);
4741 60 : vargs.safe_push (gimple_assign_lhs (new_stmt));
4742 : }
4743 : else
4744 : {
4745 : /* The mask type has more elements than simdlen. */
4746 :
4747 : /* FORNOW */
4748 0 : gcc_unreachable ();
4749 : }
4750 : }
4751 : }
4752 6 : else if (SCALAR_INT_MODE_P (bestn->simdclone->mask_mode))
4753 : {
4754 6 : atype = bestn->simdclone->args[i].vector_type;
4755 6 : poly_uint64 atype_subparts
4756 6 : = exact_div (bestn->simdclone->simdlen,
4757 : bestn->simdclone->args[i].linear_step);
4758 6 : o = bestn->simdclone->args[i].linear_step;
4759 12 : for (m = j * o; m < (j + 1) * o; m++)
4760 : {
4761 6 : if (m == 0)
4762 6 : vec_oprnds_i[i] = 0;
4763 6 : if (maybe_lt (atype_subparts,
4764 6 : TYPE_VECTOR_SUBPARTS (arginfo[i].vectype)))
4765 : {
4766 : /* The mask argument has fewer elements than the
4767 : input vector. */
4768 : /* FORNOW */
4769 0 : gcc_unreachable ();
4770 : }
4771 6 : else if (known_eq (atype_subparts,
4772 : TYPE_VECTOR_SUBPARTS (arginfo[i].vectype)))
4773 : {
4774 6 : vec_oprnd0 = vec_oprnds[i][vec_oprnds_i[i]++];
4775 6 : if (loop_vinfo
4776 6 : && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
4777 : {
4778 1 : vec_loop_masks *loop_masks
4779 : = &LOOP_VINFO_MASKS (loop_vinfo);
4780 1 : tree loop_mask
4781 1 : = vect_get_loop_mask (loop_vinfo, gsi,
4782 : loop_masks, ncopies_in,
4783 : vectype, j);
4784 1 : vec_oprnd0
4785 1 : = prepare_vec_mask (loop_vinfo,
4786 1 : TREE_TYPE (loop_mask),
4787 : loop_mask, vec_oprnd0,
4788 : gsi);
4789 : }
4790 : /* The vector mask argument matches the input
4791 : in the number of lanes, but not necessarily
4792 : in the mode. */
4793 6 : tree st = lang_hooks.types.type_for_mode
4794 6 : (TYPE_MODE (TREE_TYPE (vec_oprnd0)), 1);
4795 6 : vec_oprnd0 = build1 (VIEW_CONVERT_EXPR, st,
4796 : vec_oprnd0);
4797 6 : gassign *new_stmt
4798 6 : = gimple_build_assign (make_ssa_name (st),
4799 : vec_oprnd0);
4800 6 : vect_finish_stmt_generation (vinfo, stmt_info,
4801 : new_stmt, gsi);
4802 6 : if (!types_compatible_p (atype, st))
4803 : {
4804 6 : new_stmt
4805 6 : = gimple_build_assign (make_ssa_name (atype),
4806 : NOP_EXPR,
4807 : gimple_assign_lhs
4808 : (new_stmt));
4809 6 : vect_finish_stmt_generation (vinfo, stmt_info,
4810 : new_stmt, gsi);
4811 : }
4812 6 : vargs.safe_push (gimple_assign_lhs (new_stmt));
4813 : }
4814 : else
4815 : {
4816 : /* The mask argument has more elements than the
4817 : input vector. */
4818 : /* FORNOW */
4819 0 : gcc_unreachable ();
4820 : }
4821 : }
4822 : }
4823 : else
4824 0 : gcc_unreachable ();
4825 : break;
4826 102 : case SIMD_CLONE_ARG_TYPE_UNIFORM:
4827 102 : vargs.safe_push (op);
4828 102 : break;
4829 121 : case SIMD_CLONE_ARG_TYPE_LINEAR_CONSTANT_STEP:
4830 121 : case SIMD_CLONE_ARG_TYPE_LINEAR_REF_CONSTANT_STEP:
4831 121 : if (j == 0)
4832 : {
4833 118 : gimple_seq stmts;
4834 118 : arginfo[i].op
4835 118 : = force_gimple_operand (unshare_expr (arginfo[i].op),
4836 : &stmts, true, NULL_TREE);
4837 118 : if (stmts != NULL)
4838 : {
4839 0 : basic_block new_bb;
4840 0 : edge pe = loop_preheader_edge (loop);
4841 0 : new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4842 0 : gcc_assert (!new_bb);
4843 : }
4844 118 : if (arginfo[i].simd_lane_linear)
4845 : {
4846 6 : vargs.safe_push (arginfo[i].op);
4847 6 : break;
4848 : }
4849 112 : tree phi_res = copy_ssa_name (op);
4850 112 : gphi *new_phi = create_phi_node (phi_res, loop->header);
4851 112 : add_phi_arg (new_phi, arginfo[i].op,
4852 : loop_preheader_edge (loop), UNKNOWN_LOCATION);
4853 112 : enum tree_code code
4854 196 : = POINTER_TYPE_P (TREE_TYPE (op))
4855 112 : ? POINTER_PLUS_EXPR : PLUS_EXPR;
4856 196 : tree type = POINTER_TYPE_P (TREE_TYPE (op))
4857 196 : ? sizetype : TREE_TYPE (op);
4858 112 : poly_widest_int cst
4859 112 : = wi::mul (bestn->simdclone->args[i].linear_step,
4860 112 : ncopies * nunits);
4861 112 : tree tcst = wide_int_to_tree (type, cst);
4862 112 : tree phi_arg = copy_ssa_name (op);
4863 112 : gassign *new_stmt
4864 112 : = gimple_build_assign (phi_arg, code, phi_res, tcst);
4865 112 : gimple_stmt_iterator si = gsi_after_labels (loop->header);
4866 112 : gsi_insert_after (&si, new_stmt, GSI_NEW_STMT);
4867 112 : add_phi_arg (new_phi, phi_arg, loop_latch_edge (loop),
4868 : UNKNOWN_LOCATION);
4869 112 : arginfo[i].op = phi_res;
4870 112 : vargs.safe_push (phi_res);
4871 112 : }
4872 : else
4873 : {
4874 3 : enum tree_code code
4875 6 : = POINTER_TYPE_P (TREE_TYPE (op))
4876 3 : ? POINTER_PLUS_EXPR : PLUS_EXPR;
4877 6 : tree type = POINTER_TYPE_P (TREE_TYPE (op))
4878 6 : ? sizetype : TREE_TYPE (op);
4879 3 : poly_widest_int cst
4880 3 : = wi::mul (bestn->simdclone->args[i].linear_step,
4881 3 : j * nunits);
4882 3 : tree tcst = wide_int_to_tree (type, cst);
4883 3 : new_temp = make_ssa_name (TREE_TYPE (op));
4884 3 : gassign *new_stmt
4885 6 : = gimple_build_assign (new_temp, code,
4886 3 : arginfo[i].op, tcst);
4887 3 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
4888 3 : vargs.safe_push (new_temp);
4889 3 : }
4890 : break;
4891 0 : case SIMD_CLONE_ARG_TYPE_LINEAR_VAL_CONSTANT_STEP:
4892 0 : case SIMD_CLONE_ARG_TYPE_LINEAR_UVAL_CONSTANT_STEP:
4893 0 : case SIMD_CLONE_ARG_TYPE_LINEAR_VARIABLE_STEP:
4894 0 : case SIMD_CLONE_ARG_TYPE_LINEAR_REF_VARIABLE_STEP:
4895 0 : case SIMD_CLONE_ARG_TYPE_LINEAR_VAL_VARIABLE_STEP:
4896 0 : case SIMD_CLONE_ARG_TYPE_LINEAR_UVAL_VARIABLE_STEP:
4897 0 : default:
4898 0 : gcc_unreachable ();
4899 : }
4900 : }
4901 :
4902 471 : if (masked_call_offset == 0
4903 405 : && bestn->simdclone->inbranch
4904 13 : && bestn->simdclone->nargs > nargs)
4905 : {
4906 13 : unsigned long m, o;
4907 13 : size_t mask_i = bestn->simdclone->nargs - 1;
4908 13 : tree mask;
4909 13 : gcc_assert (bestn->simdclone->args[mask_i].arg_type ==
4910 : SIMD_CLONE_ARG_TYPE_MASK);
4911 :
4912 13 : tree mask_argtype = bestn->simdclone->args[mask_i].vector_type;
4913 13 : tree mask_vectype;
4914 13 : if (SCALAR_INT_MODE_P (bestn->simdclone->mask_mode))
4915 : {
4916 2 : callee_nelements = exact_div (bestn->simdclone->simdlen,
4917 : bestn->simdclone->args[i].linear_step);
4918 2 : mask_vectype = get_related_vectype_for_scalar_type
4919 2 : (vinfo->vector_mode, TREE_TYPE (vectype), callee_nelements);
4920 : }
4921 : else
4922 : {
4923 11 : mask_vectype = mask_argtype;
4924 11 : callee_nelements = TYPE_VECTOR_SUBPARTS (mask_vectype);
4925 : }
4926 13 : o = vector_unroll_factor (nunits, callee_nelements);
4927 26 : for (m = j * o; m < (j + 1) * o; m++)
4928 : {
4929 13 : if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
4930 : {
4931 1 : vec_loop_masks *loop_masks = &LOOP_VINFO_MASKS (loop_vinfo);
4932 1 : mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
4933 : ncopies * o, mask_vectype, m);
4934 : }
4935 : else
4936 12 : mask = vect_build_all_ones_mask (vinfo, stmt_info,
4937 : mask_argtype);
4938 :
4939 13 : gassign *new_stmt;
4940 13 : if (SCALAR_INT_MODE_P (bestn->simdclone->mask_mode))
4941 : {
4942 : /* This means we are dealing with integer mask modes.
4943 : First convert to an integer type with the same size as
4944 : the current vector type. */
4945 2 : unsigned HOST_WIDE_INT intermediate_size
4946 2 : = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (mask)));
4947 2 : tree mid_int_type =
4948 2 : build_nonstandard_integer_type (intermediate_size, 1);
4949 2 : mask = build1 (VIEW_CONVERT_EXPR, mid_int_type, mask);
4950 2 : new_stmt
4951 2 : = gimple_build_assign (make_ssa_name (mid_int_type),
4952 : mask);
4953 2 : gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4954 : /* Then zero-extend to the mask mode. */
4955 2 : mask = fold_build1 (NOP_EXPR, mask_argtype,
4956 : gimple_get_lhs (new_stmt));
4957 : }
4958 11 : else if (bestn->simdclone->mask_mode == VOIDmode)
4959 11 : mask = build3 (VEC_COND_EXPR, mask_argtype, mask,
4960 : build_one_cst (mask_argtype),
4961 : build_zero_cst (mask_argtype));
4962 : else
4963 0 : gcc_unreachable ();
4964 :
4965 13 : new_stmt = gimple_build_assign (make_ssa_name (mask_argtype),
4966 : mask);
4967 13 : vect_finish_stmt_generation (vinfo, stmt_info,
4968 : new_stmt, gsi);
4969 13 : mask = gimple_assign_lhs (new_stmt);
4970 13 : vargs.safe_push (mask);
4971 : }
4972 : }
4973 :
4974 471 : gcall *new_call = gimple_build_call_vec (fndecl, vargs);
4975 471 : if (vec_dest)
4976 : {
4977 465 : gcc_assert (ratype
4978 : || known_eq (TYPE_VECTOR_SUBPARTS (rtype), nunits));
4979 465 : if (ratype)
4980 15 : new_temp = create_tmp_var (ratype);
4981 450 : else if (useless_type_conversion_p (vectype, rtype))
4982 428 : new_temp = make_ssa_name (vec_dest, new_call);
4983 : else
4984 22 : new_temp = make_ssa_name (rtype, new_call);
4985 465 : gimple_call_set_lhs (new_call, new_temp);
4986 : }
4987 471 : vect_finish_stmt_generation (vinfo, stmt_info, new_call, gsi);
4988 471 : gimple *new_stmt = new_call;
4989 :
4990 471 : if (vec_dest)
4991 : {
4992 465 : if (!multiple_p (TYPE_VECTOR_SUBPARTS (vectype), nunits))
4993 : {
4994 21 : unsigned int k, l;
4995 42 : poly_uint64 prec = GET_MODE_BITSIZE (TYPE_MODE (vectype));
4996 42 : poly_uint64 bytes = GET_MODE_SIZE (TYPE_MODE (vectype));
4997 21 : k = vector_unroll_factor (nunits,
4998 : TYPE_VECTOR_SUBPARTS (vectype));
4999 21 : gcc_assert ((k & (k - 1)) == 0);
5000 75 : for (l = 0; l < k; l++)
5001 : {
5002 54 : tree t;
5003 54 : if (ratype)
5004 : {
5005 42 : t = build_fold_addr_expr (new_temp);
5006 42 : t = build2 (MEM_REF, vectype, t,
5007 42 : build_int_cst (TREE_TYPE (t), l * bytes));
5008 : }
5009 : else
5010 12 : t = build3 (BIT_FIELD_REF, vectype, new_temp,
5011 12 : bitsize_int (prec), bitsize_int (l * prec));
5012 54 : new_stmt = gimple_build_assign (make_ssa_name (vectype), t);
5013 54 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
5014 :
5015 54 : SLP_TREE_VEC_DEFS (slp_node)
5016 54 : .quick_push (gimple_assign_lhs (new_stmt));
5017 : }
5018 :
5019 21 : if (ratype)
5020 15 : vect_clobber_variable (vinfo, stmt_info, gsi, new_temp);
5021 21 : continue;
5022 21 : }
5023 444 : else if (!multiple_p (nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5024 : {
5025 16 : unsigned int k;
5026 16 : if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype),
5027 16 : TYPE_VECTOR_SUBPARTS (rtype), &k))
5028 0 : gcc_unreachable ();
5029 16 : gcc_assert ((k & (k - 1)) == 0);
5030 16 : if ((j & (k - 1)) == 0)
5031 8 : vec_alloc (ret_ctor_elts, k);
5032 16 : if (ratype)
5033 : {
5034 0 : unsigned int m, o;
5035 0 : o = vector_unroll_factor (nunits,
5036 : TYPE_VECTOR_SUBPARTS (rtype));
5037 0 : for (m = 0; m < o; m++)
5038 : {
5039 0 : tree tem = build4 (ARRAY_REF, rtype, new_temp,
5040 0 : size_int (m), NULL_TREE, NULL_TREE);
5041 0 : new_stmt = gimple_build_assign (make_ssa_name (rtype),
5042 : tem);
5043 0 : vect_finish_stmt_generation (vinfo, stmt_info,
5044 : new_stmt, gsi);
5045 0 : CONSTRUCTOR_APPEND_ELT (ret_ctor_elts, NULL_TREE,
5046 : gimple_assign_lhs (new_stmt));
5047 : }
5048 0 : vect_clobber_variable (vinfo, stmt_info, gsi, new_temp);
5049 : }
5050 : else
5051 16 : CONSTRUCTOR_APPEND_ELT (ret_ctor_elts, NULL_TREE, new_temp);
5052 16 : if ((j & (k - 1)) != k - 1)
5053 8 : continue;
5054 8 : vec_oprnd0 = build_constructor (vectype, ret_ctor_elts);
5055 8 : new_stmt
5056 8 : = gimple_build_assign (make_ssa_name (vec_dest), vec_oprnd0);
5057 8 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
5058 :
5059 8 : SLP_TREE_VEC_DEFS (slp_node)
5060 8 : .quick_push (gimple_assign_lhs (new_stmt));
5061 8 : continue;
5062 8 : }
5063 428 : else if (ratype)
5064 : {
5065 0 : tree t = build_fold_addr_expr (new_temp);
5066 0 : t = build2 (MEM_REF, vectype, t,
5067 0 : build_int_cst (TREE_TYPE (t), 0));
5068 0 : new_stmt = gimple_build_assign (make_ssa_name (vec_dest), t);
5069 0 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
5070 0 : vect_clobber_variable (vinfo, stmt_info, gsi, new_temp);
5071 : }
5072 428 : else if (!useless_type_conversion_p (vectype, rtype))
5073 : {
5074 0 : vec_oprnd0 = build1 (VIEW_CONVERT_EXPR, vectype, new_temp);
5075 0 : new_stmt
5076 0 : = gimple_build_assign (make_ssa_name (vec_dest), vec_oprnd0);
5077 0 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
5078 : }
5079 : }
5080 :
5081 434 : if (gimple_get_lhs (new_stmt))
5082 428 : SLP_TREE_VEC_DEFS (slp_node).quick_push (gimple_get_lhs (new_stmt));
5083 : }
5084 :
5085 1159 : for (i = 0; i < nargs; ++i)
5086 : {
5087 797 : vec<tree> oprndsi = vec_oprnds[i];
5088 797 : oprndsi.release ();
5089 : }
5090 362 : vargs.release ();
5091 :
5092 : /* Mark the clone as no longer being a candidate for GC. */
5093 362 : bestn->gc_candidate = false;
5094 :
5095 362 : return true;
5096 1412 : }
5097 :
5098 :
5099 : /* Function vect_gen_widened_results_half
5100 :
5101 : Create a vector stmt that computes one half of a widened result. The
5102 : operation is CH, the result variable is based on VEC_DEST, and the
5103 : arguments are VEC_OPRND0 and VEC_OPRND1; VEC_OPRND1 is dropped unless
5104 : OP_TYPE is binary_op. The new vector stmt gets a fresh SSA name as its
5105 : lhs and is inserted at GSI. STMT_INFO is the original scalar stmt that
5106 : we are vectorizing. Returns the new vector stmt. */
5107 :
5108 : static gimple *
5109 31908 : vect_gen_widened_results_half (vec_info *vinfo, code_helper ch,
5110 : tree vec_oprnd0, tree vec_oprnd1, int op_type,
5111 : tree vec_dest, gimple_stmt_iterator *gsi,
5112 : stmt_vec_info stmt_info)
5113 : {
5114 31908 : gimple *new_stmt;
5115 31908 : tree new_temp;
5116 :
5117 : /* Generate half of the widened result; only binary ops keep VEC_OPRND1. */
5118 31908 : if (op_type != binary_op)
5119 30798 : vec_oprnd1 = NULL;
5120 31908 : new_stmt = vect_gimple_build (vec_dest, ch, vec_oprnd0, vec_oprnd1);
5121 31908 : new_temp = make_ssa_name (vec_dest, new_stmt);
5122 31908 : gimple_set_lhs (new_stmt, new_temp);
5123 31908 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
5124 :
5125 31908 : return new_stmt;
5126 : }
5127 :
5128 :
5129 : /* Create vectorized demotion statements for vector operands from VEC_OPRNDS:
5130 : each adjacent pair is combined with CODE into the destination popped from
5131 : VEC_DSTS. For multi-step conversions store the resulting vectors and call
5132 : the function recursively. When NARROW_SRC_P is true, there's still a
5133 : conversion after narrowing, so don't store the vectors in the SLP_NODE. */
5134 :
5135 : static void
5136 12052 : vect_create_vectorized_demotion_stmts (vec_info *vinfo, vec<tree> *vec_oprnds,
5137 : int multi_step_cvt,
5138 : stmt_vec_info stmt_info,
5139 : vec<tree> &vec_dsts,
5140 : gimple_stmt_iterator *gsi,
5141 : slp_tree slp_node, code_helper code,
5142 : bool narrow_src_p)
5143 : {
5144 12052 : unsigned int i;
5145 12052 : tree vop0, vop1, new_tmp, vec_dest;
5146 :
5147 12052 : vec_dest = vec_dsts.pop (); /* Pushed back before returning. */
5148 :
5149 28513 : for (i = 0; i < vec_oprnds->length (); i += 2)
5150 : {
5151 : /* Create demotion operation from two adjacent input vectors. */
5152 16461 : vop0 = (*vec_oprnds)[i];
5153 16461 : vop1 = (*vec_oprnds)[i + 1]; /* NOTE(review): presumes an even operand count. */
5154 16461 : gimple *new_stmt = vect_gimple_build (vec_dest, code, vop0, vop1);
5155 16461 : new_tmp = make_ssa_name (vec_dest, new_stmt);
5156 16461 : gimple_set_lhs (new_stmt, new_tmp);
5157 16461 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
5158 16461 : if (multi_step_cvt || narrow_src_p)
5159 : /* Store the resulting vector for next recursive call,
5160 : or return the resulting vector_tmp for NARROW FLOAT_EXPR. */
5161 6745 : (*vec_oprnds)[i/2] = new_tmp;
5162 : else
5163 : {
5164 : /* This is the last step of the conversion sequence. Store the
5165 : vectors in SLP_NODE. */
5166 9716 : slp_node->push_vec_def (new_stmt);
5167 : }
5168 : }
5169 :
5170 : /* For multi-step demotion operations we first generate demotion operations
5171 : from the source type to the intermediate types, and then combine the
5172 : results (stored in VEC_OPRNDS) in a demotion operation to the destination
5173 : type. */
5174 12052 : if (multi_step_cvt)
5175 : {
5176 : /* At each level of recursion we have half of the operands we had at the
5177 : previous level. */
5178 3000 : vec_oprnds->truncate ((i+1)/2);
5179 3000 : vect_create_vectorized_demotion_stmts (vinfo, vec_oprnds,
5180 : multi_step_cvt - 1,
5181 : stmt_info, vec_dsts, gsi,
5182 3000 : slp_node, VEC_PACK_TRUNC_EXPR,
5183 : narrow_src_p);
5184 : }
5185 :
5186 12052 : vec_dsts.quick_push (vec_dest); /* Restore the destination popped on entry. */
5187 12052 : }
5188 :
5189 :
5190 : /* Create vectorized promotion statements for vector operands from VEC_OPRNDS0
5191 : and, when OP_TYPE is binary_op, VEC_OPRNDS1, on behalf of scalar statement
5192 : STMT_INFO. Each input yields two results, built with CH1 and CH2; on
5193 : return *VEC_OPRNDS0 is replaced by the vector of all results, in order. */
5194 :
5195 : static void
5196 11626 : vect_create_vectorized_promotion_stmts (vec_info *vinfo,
5197 : vec<tree> *vec_oprnds0,
5198 : vec<tree> *vec_oprnds1,
5199 : stmt_vec_info stmt_info, tree vec_dest,
5200 : gimple_stmt_iterator *gsi,
5201 : code_helper ch1,
5202 : code_helper ch2, int op_type)
5203 : {
5204 11626 : int i;
5205 11626 : tree vop0, vop1, new_tmp1, new_tmp2;
5206 11626 : gimple *new_stmt1, *new_stmt2;
5207 11626 : vec<tree> vec_tmp = vNULL;
5208 :
5209 11626 : vec_tmp.create (vec_oprnds0->length () * 2); /* Two results per input. */
5210 39206 : FOR_EACH_VEC_ELT (*vec_oprnds0, i, vop0)
5211 : {
5212 15954 : if (op_type == binary_op)
5213 555 : vop1 = (*vec_oprnds1)[i];
5214 : else
5215 : vop1 = NULL_TREE;
5216 :
5217 : /* Generate the two halves of promotion operation. */
5218 15954 : new_stmt1 = vect_gen_widened_results_half (vinfo, ch1, vop0, vop1,
5219 : op_type, vec_dest, gsi,
5220 : stmt_info);
5221 15954 : new_stmt2 = vect_gen_widened_results_half (vinfo, ch2, vop0, vop1,
5222 : op_type, vec_dest, gsi,
5223 : stmt_info);
5224 15954 : if (is_gimple_call (new_stmt1)) /* Pick the lhs accessor matching the stmt kind. */
5225 : {
5226 0 : new_tmp1 = gimple_call_lhs (new_stmt1);
5227 0 : new_tmp2 = gimple_call_lhs (new_stmt2);
5228 : }
5229 : else
5230 : {
5231 15954 : new_tmp1 = gimple_assign_lhs (new_stmt1);
5232 15954 : new_tmp2 = gimple_assign_lhs (new_stmt2);
5233 : }
5234 :
5235 : /* Store the results for the next step. */
5236 15954 : vec_tmp.quick_push (new_tmp1);
5237 15954 : vec_tmp.quick_push (new_tmp2);
5238 : }
5239 :
5240 11626 : vec_oprnds0->release (); /* Replace the inputs by the results. */
5241 11626 : *vec_oprnds0 = vec_tmp;
5242 11626 : }
5243 :
5244 : /* Create vectorized promotion stmts for widening stmts using only half the
5245 : potential vector size for input. The results replace *VEC_OPRNDS0. */
5246 : static void
5247 14 : vect_create_half_widening_stmts (vec_info *vinfo,
5248 : vec<tree> *vec_oprnds0,
5249 : vec<tree> *vec_oprnds1,
5250 : stmt_vec_info stmt_info, tree vec_dest,
5251 : gimple_stmt_iterator *gsi,
5252 : code_helper code1,
5253 : int op_type)
5254 : {
5255 14 : int i;
5256 14 : tree vop0, vop1;
5257 14 : gimple *new_stmt1;
5258 14 : gimple *new_stmt2;
5259 14 : gimple *new_stmt3;
5260 14 : vec<tree> vec_tmp = vNULL;
5261 :
5262 14 : vec_tmp.create (vec_oprnds0->length ()); /* One result per input. */
5263 28 : FOR_EACH_VEC_ELT (*vec_oprnds0, i, vop0)
5264 : {
5265 14 : tree new_tmp1, new_tmp2, new_tmp3, out_type;
5266 :
5267 14 : gcc_assert (op_type == binary_op); /* Only binary operations are handled here. */
5268 14 : vop1 = (*vec_oprnds1)[i];
5269 :
5270 : /* Widen the first vector input to the type of VEC_DEST. */
5271 14 : out_type = TREE_TYPE (vec_dest);
5272 14 : new_tmp1 = make_ssa_name (out_type);
5273 14 : new_stmt1 = gimple_build_assign (new_tmp1, NOP_EXPR, vop0);
5274 14 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt1, gsi);
5275 14 : if (VECTOR_TYPE_P (TREE_TYPE (vop1)))
5276 : {
5277 : /* Widen the second vector input. */
5278 14 : new_tmp2 = make_ssa_name (out_type);
5279 14 : new_stmt2 = gimple_build_assign (new_tmp2, NOP_EXPR, vop1);
5280 14 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt2, gsi);
5281 : /* Perform the operation with both vector inputs widened. */
5282 14 : new_stmt3 = vect_gimple_build (vec_dest, code1, new_tmp1, new_tmp2);
5283 : }
5284 : else
5285 : {
5286 : /* Perform the operation with the single vector input widened. */
5287 0 : new_stmt3 = vect_gimple_build (vec_dest, code1, new_tmp1, vop1);
5288 : }
5289 :
5290 14 : new_tmp3 = make_ssa_name (vec_dest, new_stmt3);
5291 14 : gimple_assign_set_lhs (new_stmt3, new_tmp3);
5292 14 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt3, gsi);
5293 :
5294 : /* Store the results for the next step. */
5295 14 : vec_tmp.quick_push (new_tmp3);
5296 : }
5297 :
5298 14 : vec_oprnds0->release (); /* Replace the inputs by the results. */
5299 14 : *vec_oprnds0 = vec_tmp;
5300 14 : }
5301 :
5302 :
5303 : /* Check if STMT_INFO performs a conversion operation that can be vectorized.
5304 : If COST_VEC is passed, calculate costs but don't change anything,
5305 : otherwise, vectorize STMT_INFO: create a vectorized stmt to replace
5306 : it, and insert it at GSI.
5307 : Return true if STMT_INFO is vectorizable in this way. */
5308 :
5309 : static bool
5310 2679063 : vectorizable_conversion (vec_info *vinfo,
5311 : stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
5312 : slp_tree slp_node,
5313 : stmt_vector_for_cost *cost_vec)
5314 : {
5315 2679063 : tree vec_dest, cvt_op = NULL_TREE;
5316 2679063 : tree scalar_dest;
5317 2679063 : tree op0, op1 = NULL_TREE;
5318 2679063 : tree_code tc1;
5319 2679063 : code_helper code, code1, code2;
5320 2679063 : code_helper codecvt1 = ERROR_MARK, codecvt2 = ERROR_MARK;
5321 2679063 : tree new_temp;
5322 2679063 : enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
5323 2679063 : poly_uint64 nunits_in;
5324 2679063 : poly_uint64 nunits_out;
5325 2679063 : tree vectype_out, vectype_in;
5326 2679063 : int i;
5327 2679063 : tree lhs_type, rhs_type;
5328 : /* For conversions between floating point and integer, there're 2 NARROW
5329 : cases. NARROW_SRC is for FLOAT_EXPR, means
5330 : integer --DEMOTION--> integer --FLOAT_EXPR--> floating point.
5331 : This is safe when the range of the source integer can fit into the lower
5332 : precision. NARROW_DST is for FIX_TRUNC_EXPR, means
5333 : floating point --FIX_TRUNC_EXPR--> integer --DEMOTION--> INTEGER.
5334 : For other conversions, when there's narrowing, NARROW_DST is used as
5335 : default. */
5336 2679063 : enum { NARROW_SRC, NARROW_DST, NONE, WIDEN } modifier;
5337 2679063 : vec<tree> vec_oprnds0 = vNULL;
5338 2679063 : vec<tree> vec_oprnds1 = vNULL;
5339 2679063 : tree vop0;
5340 2679063 : bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
5341 2679063 : loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5342 2679063 : int multi_step_cvt = 0;
5343 2679063 : vec<tree> interm_types = vNULL;
5344 2679063 : tree intermediate_type, cvt_type = NULL_TREE;
5345 2679063 : int op_type;
5346 2679063 : unsigned short fltsz;
5347 :
5348 : /* Is STMT a vectorizable conversion? */
5349 :
5350 2679063 : if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
5351 : return false;
5352 :
5353 2679063 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
5354 234683 : && cost_vec)
5355 : return false;
5356 :
5357 2444380 : gimple* stmt = stmt_info->stmt;
5358 2444380 : if (!(is_gimple_assign (stmt) || is_gimple_call (stmt)))
5359 : return false;
5360 :
5361 2385600 : if (gimple_get_lhs (stmt) == NULL_TREE
5362 2385600 : || TREE_CODE (gimple_get_lhs (stmt)) != SSA_NAME)
5363 815383 : return false;
5364 :
5365 1570217 : if (TREE_CODE (gimple_get_lhs (stmt)) != SSA_NAME)
5366 : return false;
5367 :
5368 1570217 : if (is_gimple_assign (stmt))
5369 : {
5370 1558236 : code = gimple_assign_rhs_code (stmt);
5371 1558236 : op_type = TREE_CODE_LENGTH ((tree_code) code);
5372 : }
5373 11981 : else if (gimple_call_internal_p (stmt))
5374 : {
5375 7862 : code = gimple_call_internal_fn (stmt);
5376 7862 : op_type = gimple_call_num_args (stmt);
5377 : }
5378 : else
5379 : return false;
5380 :
5381 1566098 : bool widen_arith = (code == WIDEN_MULT_EXPR
5382 1563777 : || code == WIDEN_LSHIFT_EXPR
5383 3129875 : || widening_fn_p (code));
5384 :
5385 1563777 : if (!widen_arith
5386 1563777 : && !CONVERT_EXPR_CODE_P (code)
5387 1401360 : && code != FIX_TRUNC_EXPR
5388 1399614 : && code != FLOAT_EXPR)
5389 : return false;
5390 :
5391 : /* Check types of lhs and rhs. */
5392 184740 : scalar_dest = gimple_get_lhs (stmt);
5393 184740 : lhs_type = TREE_TYPE (scalar_dest);
5394 184740 : vectype_out = SLP_TREE_VECTYPE (slp_node);
5395 :
5396 : /* Check the operands of the operation. */
5397 184740 : slp_tree slp_op0, slp_op1 = NULL;
5398 184740 : if (!vect_is_simple_use (vinfo, slp_node,
5399 : 0, &op0, &slp_op0, &dt[0], &vectype_in))
5400 : {
5401 0 : if (dump_enabled_p ())
5402 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5403 : "use not simple.\n");
5404 0 : return false;
5405 : }
5406 :
5407 184740 : rhs_type = TREE_TYPE (op0);
5408 182994 : if ((code != FIX_TRUNC_EXPR && code != FLOAT_EXPR)
5409 352912 : && !((INTEGRAL_TYPE_P (lhs_type)
5410 154789 : && INTEGRAL_TYPE_P (rhs_type))
5411 : || (SCALAR_FLOAT_TYPE_P (lhs_type)
5412 8825 : && SCALAR_FLOAT_TYPE_P (rhs_type))))
5413 : return false;
5414 :
5415 180182 : if (!VECTOR_BOOLEAN_TYPE_P (vectype_out)
5416 160063 : && INTEGRAL_TYPE_P (lhs_type)
5417 313164 : && !type_has_mode_precision_p (lhs_type))
5418 : {
5419 447 : if (dump_enabled_p ())
5420 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5421 : "type conversion to bit-precision unsupported\n");
5422 447 : return false;
5423 : }
5424 :
5425 179735 : if (op_type == binary_op)
5426 : {
5427 2321 : gcc_assert (code == WIDEN_MULT_EXPR
5428 : || code == WIDEN_LSHIFT_EXPR
5429 : || widening_fn_p (code));
5430 :
5431 2321 : op1 = is_gimple_assign (stmt) ? gimple_assign_rhs2 (stmt) :
5432 0 : gimple_call_arg (stmt, 0);
5433 2321 : tree vectype1_in;
5434 2321 : if (!vect_is_simple_use (vinfo, slp_node, 1,
5435 : &op1, &slp_op1, &dt[1], &vectype1_in))
5436 : {
5437 0 : if (dump_enabled_p ())
5438 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5439 : "use not simple.\n");
5440 0 : return false;
5441 : }
5442 : /* For WIDEN_MULT_EXPR, if OP0 is a constant, use the type of
5443 : OP1. */
5444 2321 : if (!vectype_in)
5445 102 : vectype_in = vectype1_in;
5446 : }
5447 :
5448 : /* If op0 is an external or constant def, infer the vector type
5449 : from the scalar type. */
5450 179735 : if (!vectype_in)
5451 19835 : vectype_in = get_vectype_for_scalar_type (vinfo, rhs_type, slp_node);
5452 179735 : if (!cost_vec)
5453 22892 : gcc_assert (vectype_in);
5454 179735 : if (!vectype_in)
5455 : {
5456 258 : if (dump_enabled_p ())
5457 2 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5458 : "no vectype for scalar type %T\n", rhs_type);
5459 :
5460 258 : return false;
5461 : }
5462 :
5463 358954 : if (VECTOR_BOOLEAN_TYPE_P (vectype_out)
5464 179477 : != VECTOR_BOOLEAN_TYPE_P (vectype_in))
5465 : {
5466 229 : if (dump_enabled_p ())
5467 36 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5468 : "can't convert between boolean and non "
5469 : "boolean vectors %T\n", rhs_type);
5470 :
5471 229 : return false;
5472 : }
5473 :
5474 179248 : nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
5475 179248 : nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
5476 179248 : if (known_eq (nunits_out, nunits_in))
5477 85339 : if (widen_arith)
5478 : modifier = WIDEN;
5479 : else
5480 179248 : modifier = NONE;
5481 93909 : else if (multiple_p (nunits_out, nunits_in))
5482 : modifier = NARROW_DST;
5483 : else
5484 : {
5485 52022 : gcc_checking_assert (multiple_p (nunits_in, nunits_out));
5486 : modifier = WIDEN;
5487 : }
5488 :
5489 179248 : bool found_mode = false;
5490 179248 : scalar_mode lhs_mode = SCALAR_TYPE_MODE (lhs_type);
5491 179248 : scalar_mode rhs_mode = SCALAR_TYPE_MODE (rhs_type);
5492 179248 : opt_scalar_mode rhs_mode_iter;
5493 179248 : auto_vec<std::pair<tree, tree_code>, 2> converts;
5494 179248 : bool evenodd_ok = false;
5495 :
5496 : /* Supportable by target? */
5497 179248 : switch (modifier)
5498 : {
5499 85097 : case NONE:
5500 85097 : if (code != FIX_TRUNC_EXPR
5501 84045 : && code != FLOAT_EXPR
5502 160172 : && !CONVERT_EXPR_CODE_P (code))
5503 : return false;
5504 85097 : gcc_assert (code.is_tree_code ());
5505 85097 : if (supportable_indirect_convert_operation (code,
5506 : vectype_out, vectype_in,
5507 : converts, op0, slp_op0))
5508 : {
5509 18908 : gcc_assert (converts.length () <= 2);
5510 18908 : if (converts.length () == 1)
5511 18834 : code1 = converts[0].second;
5512 : else
5513 : {
5514 74 : cvt_type = NULL_TREE;
5515 74 : multi_step_cvt = converts.length () - 1;
5516 74 : codecvt1 = converts[0].second;
5517 74 : code1 = converts[1].second;
5518 74 : interm_types.safe_push (converts[0].first);
5519 : }
5520 : break;
5521 : }
5522 :
5523 : /* FALLTHRU */
5524 66189 : unsupported:
5525 73023 : if (dump_enabled_p ())
5526 6078 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5527 : "conversion not supported by target.\n");
5528 : return false;
5529 :
5530 52264 : case WIDEN:
5531 52264 : if (known_eq (nunits_in, nunits_out))
5532 : {
5533 484 : if (!(code.is_tree_code ()
5534 242 : && supportable_half_widening_operation ((tree_code) code,
5535 : vectype_out, vectype_in,
5536 : &tc1)))
5537 69 : goto unsupported;
5538 173 : code1 = tc1;
5539 173 : gcc_assert (!(multi_step_cvt && op_type == binary_op));
5540 : break;
5541 : }
5542 : /* Elements in a vector can only be reordered if used in a reduction
5543 : operation only. */
5544 52022 : if (code == WIDEN_MULT_EXPR
5545 2079 : && loop_vinfo
5546 2032 : && !nested_in_vect_loop_p (LOOP_VINFO_LOOP (loop_vinfo), stmt_info)
5547 : /* For a SLP reduction we cannot swizzle lanes, detecting a
5548 : reduction chain isn't possible here. */
5549 54032 : && SLP_TREE_LANES (slp_node) == 1)
5550 : {
5551 : /* ??? There is no way to look for SLP uses, so work on
5552 : the stmt and what the stmt-based cycle detection gives us. */
5553 1908 : tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
5554 1908 : stmt_vec_info use_stmt_info
5555 1908 : = lhs ? loop_vinfo->lookup_single_use (lhs) : NULL;
5556 1908 : if (use_stmt_info
5557 1759 : && STMT_VINFO_REDUC_DEF (use_stmt_info))
5558 52022 : evenodd_ok = true;
5559 : }
5560 52022 : if (supportable_widening_operation (code, vectype_out, vectype_in,
5561 : evenodd_ok, &code1,
5562 : &code2, &multi_step_cvt,
5563 : &interm_types))
5564 : {
5565 : /* Binary widening operation can only be supported directly by the
5566 : architecture. */
5567 50102 : gcc_assert (!(multi_step_cvt && op_type == binary_op));
5568 : break;
5569 : }
5570 :
5571 1920 : if (code != FLOAT_EXPR
5572 2292 : || GET_MODE_SIZE (lhs_mode) <= GET_MODE_SIZE (rhs_mode))
5573 1734 : goto unsupported;
5574 :
5575 186 : fltsz = GET_MODE_SIZE (lhs_mode);
5576 273 : FOR_EACH_2XWIDER_MODE (rhs_mode_iter, rhs_mode)
5577 : {
5578 273 : rhs_mode = rhs_mode_iter.require ();
5579 546 : if (GET_MODE_SIZE (rhs_mode) > fltsz)
5580 : break;
5581 :
5582 273 : cvt_type
5583 273 : = build_nonstandard_integer_type (GET_MODE_BITSIZE (rhs_mode), 0);
5584 273 : cvt_type = get_same_sized_vectype (cvt_type, vectype_in);
5585 273 : if (cvt_type == NULL_TREE)
5586 0 : goto unsupported;
5587 :
5588 546 : if (GET_MODE_SIZE (rhs_mode) == fltsz)
5589 : {
5590 81 : tc1 = ERROR_MARK;
5591 81 : gcc_assert (code.is_tree_code ());
5592 81 : if (!supportable_convert_operation ((tree_code) code, vectype_out,
5593 : cvt_type, &tc1))
5594 22 : goto unsupported;
5595 59 : codecvt1 = tc1;
5596 : }
5597 192 : else if (!supportable_widening_operation (code, vectype_out,
5598 : cvt_type, evenodd_ok,
5599 : &codecvt1,
5600 : &codecvt2, &multi_step_cvt,
5601 : &interm_types))
5602 87 : continue;
5603 : else
5604 105 : gcc_assert (multi_step_cvt == 0);
5605 :
5606 164 : if (supportable_widening_operation (NOP_EXPR, cvt_type,
5607 : vectype_in, evenodd_ok, &code1,
5608 : &code2, &multi_step_cvt,
5609 : &interm_types))
5610 : {
5611 : found_mode = true;
5612 : break;
5613 : }
5614 : }
5615 :
5616 164 : if (!found_mode)
5617 0 : goto unsupported;
5618 :
5619 328 : if (GET_MODE_SIZE (rhs_mode) == fltsz)
5620 59 : codecvt2 = ERROR_MARK;
5621 : else
5622 : {
5623 105 : multi_step_cvt++;
5624 105 : interm_types.safe_push (cvt_type);
5625 105 : cvt_type = NULL_TREE;
5626 : }
5627 : break;
5628 :
5629 41887 : case NARROW_DST:
5630 41887 : gcc_assert (op_type == unary_op);
5631 41887 : if (supportable_narrowing_operation (code, vectype_out, vectype_in,
5632 : &code1, &multi_step_cvt,
5633 : &interm_types))
5634 : break;
5635 :
5636 15411 : if (GET_MODE_SIZE (lhs_mode) >= GET_MODE_SIZE (rhs_mode))
5637 984 : goto unsupported;
5638 :
5639 4153 : if (code == FIX_TRUNC_EXPR)
5640 : {
5641 107 : cvt_type
5642 107 : = build_nonstandard_integer_type (GET_MODE_BITSIZE (rhs_mode), 0);
5643 107 : cvt_type = get_same_sized_vectype (cvt_type, vectype_in);
5644 107 : if (cvt_type == NULL_TREE)
5645 0 : goto unsupported;
5646 107 : if (supportable_convert_operation ((tree_code) code, cvt_type, vectype_in,
5647 : &tc1))
5648 105 : codecvt1 = tc1;
5649 : else
5650 2 : goto unsupported;
5651 105 : if (supportable_narrowing_operation (NOP_EXPR, vectype_out, cvt_type,
5652 : &code1, &multi_step_cvt,
5653 : &interm_types))
5654 : break;
5655 : }
5656 : /* If op0 can be represented with low precision integer,
5657 : truncate it to cvt_type and the do FLOAT_EXPR. */
5658 4046 : else if (code == FLOAT_EXPR)
5659 : {
5660 137 : if (cost_vec)
5661 : {
5662 132 : wide_int op_min_value, op_max_value;
5663 132 : tree def;
5664 :
5665 : /* ??? Merge ranges in case of more than one lane. */
5666 132 : if (SLP_TREE_LANES (slp_op0) != 1
5667 130 : || !(def = vect_get_slp_scalar_def (slp_op0, 0))
5668 262 : || !vect_get_range_info (def, &op_min_value, &op_max_value))
5669 106 : goto unsupported;
5670 :
5671 26 : if ((wi::min_precision (op_max_value, SIGNED)
5672 26 : > GET_MODE_BITSIZE (lhs_mode))
5673 26 : || (wi::min_precision (op_min_value, SIGNED)
5674 24 : > GET_MODE_BITSIZE (lhs_mode)))
5675 2 : goto unsupported;
5676 132 : }
5677 :
5678 29 : cvt_type
5679 29 : = build_nonstandard_integer_type (GET_MODE_BITSIZE (lhs_mode), 0);
5680 29 : cvt_type = get_same_sized_vectype (cvt_type, vectype_out);
5681 29 : if (cvt_type == NULL_TREE)
5682 0 : goto unsupported;
5683 29 : if (!supportable_narrowing_operation (NOP_EXPR, cvt_type, vectype_in,
5684 : &code1, &multi_step_cvt,
5685 : &interm_types))
5686 2 : goto unsupported;
5687 27 : if (supportable_convert_operation ((tree_code) code, vectype_out,
5688 : cvt_type, &tc1))
5689 : {
5690 27 : codecvt1 = tc1;
5691 27 : modifier = NARROW_SRC;
5692 27 : break;
5693 : }
5694 : }
5695 :
5696 3913 : goto unsupported;
5697 :
5698 : default:
5699 : gcc_unreachable ();
5700 : }
5701 :
5702 106225 : if (modifier == WIDEN
5703 106225 : && loop_vinfo
5704 49285 : && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
5705 127264 : && (code1 == VEC_WIDEN_MULT_EVEN_EXPR
5706 21017 : || widening_evenodd_fn_p (code1)))
5707 : {
5708 22 : if (dump_enabled_p ())
5709 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5710 : "can't use a fully-masked loop because"
5711 : " widening operation on even/odd elements"
5712 : " mixes up lanes.\n");
5713 22 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
5714 : }
5715 :
5716 106225 : if (cost_vec) /* transformation not required. */
5717 : {
5718 83333 : if (!vect_maybe_update_slp_op_vectype (slp_op0, vectype_in)
5719 83333 : || !vect_maybe_update_slp_op_vectype (slp_op1, vectype_in))
5720 : {
5721 0 : if (dump_enabled_p ())
5722 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5723 : "incompatible vector types for invariants\n");
5724 0 : return false;
5725 : }
5726 83333 : DUMP_VECT_SCOPE ("vectorizable_conversion");
5727 83333 : unsigned int nvectors = vect_get_num_copies (vinfo, slp_node);
5728 83333 : if (modifier == NONE)
5729 : {
5730 14910 : SLP_TREE_TYPE (slp_node) = type_conversion_vec_info_type;
5731 14910 : vect_model_simple_cost (vinfo, (1 + multi_step_cvt),
5732 : slp_node, cost_vec);
5733 : }
5734 68423 : else if (modifier == NARROW_SRC || modifier == NARROW_DST)
5735 : {
5736 27826 : SLP_TREE_TYPE (slp_node) = type_demotion_vec_info_type;
5737 : /* The final packing step produces one vector result per copy. */
5738 27826 : vect_model_promotion_demotion_cost (slp_node, nvectors,
5739 : multi_step_cvt, cost_vec,
5740 : widen_arith);
5741 : }
5742 : else
5743 : {
5744 40597 : SLP_TREE_TYPE (slp_node) = type_promotion_vec_info_type;
5745 : /* The initial unpacking step produces two vector results
5746 : per copy. MULTI_STEP_CVT is 0 for a single conversion,
5747 : so >> MULTI_STEP_CVT divides by 2^(number of steps - 1). */
5748 40597 : vect_model_promotion_demotion_cost (slp_node,
5749 : nvectors >> multi_step_cvt,
5750 : multi_step_cvt, cost_vec,
5751 : widen_arith);
5752 : }
5753 83333 : interm_types.release ();
5754 83333 : return true;
5755 83333 : }
5756 :
5757 : /* Transform. */
5758 22892 : if (dump_enabled_p ())
5759 4287 : dump_printf_loc (MSG_NOTE, vect_location, "transform conversion.\n");
5760 :
5761 22892 : if (op_type == binary_op)
5762 : {
5763 508 : if (CONSTANT_CLASS_P (op0))
5764 0 : op0 = fold_convert (TREE_TYPE (op1), op0);
5765 508 : else if (CONSTANT_CLASS_P (op1))
5766 234 : op1 = fold_convert (TREE_TYPE (op0), op1);
5767 : }
5768 :
5769 : /* In case of multi-step conversion, we first generate conversion operations
5770 : to the intermediate types, and then from those types to the final one.
5771 : We create vector destinations for the intermediate type (TYPES) received
5772 : from supportable_*_operation, and store them in the correct order
5773 : for future use in vect_create_vectorized_*_stmts (). */
5774 22892 : auto_vec<tree> vec_dsts (multi_step_cvt + 1);
5775 22892 : bool widen_or_narrow_float_p
5776 22892 : = cvt_type && (modifier == WIDEN || modifier == NARROW_SRC);
5777 22892 : vec_dest = vect_create_destination_var (scalar_dest,
5778 : widen_or_narrow_float_p
5779 : ? cvt_type : vectype_out);
5780 22892 : vec_dsts.quick_push (vec_dest);
5781 :
5782 22892 : if (multi_step_cvt)
5783 : {
5784 9148 : for (i = interm_types.length () - 1;
5785 9148 : interm_types.iterate (i, &intermediate_type); i--)
5786 : {
5787 4819 : vec_dest = vect_create_destination_var (scalar_dest,
5788 : intermediate_type);
5789 4819 : vec_dsts.quick_push (vec_dest);
5790 : }
5791 : }
5792 :
5793 22892 : if (cvt_type)
5794 73 : vec_dest = vect_create_destination_var (scalar_dest,
5795 : widen_or_narrow_float_p
5796 : ? vectype_out : cvt_type);
5797 :
5798 22892 : switch (modifier)
5799 : {
5800 3998 : case NONE:
5801 3998 : vect_get_vec_defs (vinfo, slp_node, op0, &vec_oprnds0);
5802 : /* vec_dest is intermediate type operand when multi_step_cvt. */
5803 3998 : if (multi_step_cvt)
5804 : {
5805 21 : cvt_op = vec_dest;
5806 21 : vec_dest = vec_dsts[0];
5807 : }
5808 :
5809 8372 : FOR_EACH_VEC_ELT (vec_oprnds0, i, vop0)
5810 : {
5811 : /* Arguments are ready, create the new vector stmt. */
5812 4374 : gimple* new_stmt;
5813 4374 : if (multi_step_cvt)
5814 : {
5815 21 : gcc_assert (multi_step_cvt == 1);
5816 21 : new_stmt = vect_gimple_build (cvt_op, codecvt1, vop0);
5817 21 : new_temp = make_ssa_name (cvt_op, new_stmt);
5818 21 : gimple_assign_set_lhs (new_stmt, new_temp);
5819 21 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
5820 21 : vop0 = new_temp;
5821 : }
5822 4374 : new_stmt = vect_gimple_build (vec_dest, code1, vop0);
5823 4374 : new_temp = make_ssa_name (vec_dest, new_stmt);
5824 4374 : gimple_set_lhs (new_stmt, new_temp);
5825 4374 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
5826 :
5827 4374 : slp_node->push_vec_def (new_stmt);
5828 : }
5829 : break;
5830 :
5831 9842 : case WIDEN:
5832 : /* In case the vectorization factor (VF) is bigger than the number
5833 : of elements that we can fit in a vectype (nunits), we have to
5834 : generate more than one vector stmt - i.e - we need to "unroll"
5835 : the vector stmt by a factor VF/nunits. */
5836 9842 : vect_get_vec_defs (vinfo, slp_node, op0, &vec_oprnds0,
5837 9842 : code == WIDEN_LSHIFT_EXPR ? NULL_TREE : op1,
5838 : &vec_oprnds1);
5839 9842 : if (code == WIDEN_LSHIFT_EXPR)
5840 : {
5841 0 : int oprnds_size = vec_oprnds0.length ();
5842 0 : vec_oprnds1.create (oprnds_size);
5843 0 : for (i = 0; i < oprnds_size; ++i)
5844 0 : vec_oprnds1.quick_push (op1);
5845 : }
5846 : /* Arguments are ready. Create the new vector stmts. */
5847 21482 : for (i = multi_step_cvt; i >= 0; i--)
5848 : {
5849 11640 : tree this_dest = vec_dsts[i];
5850 11640 : code_helper c1 = code1, c2 = code2;
5851 11640 : if (i == 0 && codecvt2 != ERROR_MARK)
5852 : {
5853 48 : c1 = codecvt1;
5854 48 : c2 = codecvt2;
5855 : }
5856 11640 : if (known_eq (nunits_out, nunits_in))
5857 14 : vect_create_half_widening_stmts (vinfo, &vec_oprnds0, &vec_oprnds1,
5858 : stmt_info, this_dest, gsi, c1,
5859 : op_type);
5860 : else
5861 11626 : vect_create_vectorized_promotion_stmts (vinfo, &vec_oprnds0,
5862 : &vec_oprnds1, stmt_info,
5863 : this_dest, gsi,
5864 : c1, c2, op_type);
5865 : }
5866 :
5867 37556 : FOR_EACH_VEC_ELT (vec_oprnds0, i, vop0)
5868 : {
5869 27714 : gimple *new_stmt;
5870 27714 : if (cvt_type)
5871 : {
5872 120 : new_temp = make_ssa_name (vec_dest);
5873 120 : new_stmt = vect_gimple_build (new_temp, codecvt1, vop0);
5874 120 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
5875 : }
5876 : else
5877 27594 : new_stmt = SSA_NAME_DEF_STMT (vop0);
5878 :
5879 27714 : slp_node->push_vec_def (new_stmt);
5880 : }
5881 : break;
5882 :
5883 9052 : case NARROW_SRC:
5884 9052 : case NARROW_DST:
5885 : /* In case the vectorization factor (VF) is bigger than the number
5886 : of elements that we can fit in a vectype (nunits), we have to
5887 : generate more than one vector stmt - i.e - we need to "unroll"
5888 : the vector stmt by a factor VF/nunits. */
5889 9052 : vect_get_vec_defs (vinfo, slp_node, op0, &vec_oprnds0);
5890 : /* Arguments are ready. Create the new vector stmts. */
5891 9052 : if (cvt_type && modifier == NARROW_DST)
5892 153 : FOR_EACH_VEC_ELT (vec_oprnds0, i, vop0)
5893 : {
5894 124 : new_temp = make_ssa_name (vec_dest);
5895 124 : gimple *new_stmt = vect_gimple_build (new_temp, codecvt1, vop0);
5896 124 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
5897 124 : vec_oprnds0[i] = new_temp;
5898 : }
5899 :
5900 9052 : vect_create_vectorized_demotion_stmts (vinfo, &vec_oprnds0,
5901 : multi_step_cvt,
5902 : stmt_info, vec_dsts, gsi,
5903 : slp_node, code1,
5904 : modifier == NARROW_SRC);
5905 : /* After demoting op0 to cvt_type, convert it to dest. */
5906 9052 : if (cvt_type && code == FLOAT_EXPR)
5907 : {
5908 10 : for (unsigned int i = 0; i != vec_oprnds0.length() / 2; i++)
5909 : {
5910 : /* Arguments are ready, create the new vector stmt. */
5911 5 : gcc_assert (TREE_CODE_LENGTH ((tree_code) codecvt1) == unary_op);
5912 5 : gimple *new_stmt
5913 5 : = vect_gimple_build (vec_dest, codecvt1, vec_oprnds0[i]);
5914 5 : new_temp = make_ssa_name (vec_dest, new_stmt);
5915 5 : gimple_set_lhs (new_stmt, new_temp);
5916 5 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
5917 :
5918 : /* This is the last step of the conversion sequence. Store the
5919 : vectors in SLP_NODE or in vector info of the scalar statement
5920 : (or in STMT_VINFO_RELATED_STMT chain). */
5921 5 : slp_node->push_vec_def (new_stmt);
5922 : }
5923 : }
5924 : break;
5925 : }
5926 :
5927 22892 : vec_oprnds0.release ();
5928 22892 : vec_oprnds1.release ();
5929 22892 : interm_types.release ();
5930 :
5931 22892 : return true;
5932 179248 : }
5933 :
5934 : /* Return true if we can assume from the scalar form of STMT_INFO that
5935 : neither the scalar nor the vector forms will generate code. STMT_INFO
5936 : is known not to involve a data reference.

       The answer is true for an SSA_NAME copy, for a VIEW_CONVERT_EXPR, and
       for a conversion (CONVERT_EXPR_CODE_P) when tree_nop_conversion_p holds
       between the type of the lhs and the type of the first rhs operand.
       Everything else yields false, including a statement that is not a
       gassign and one that does carry a data reference.  */
5937 :
5938 : bool
5939 3155396 : vect_nop_conversion_p (stmt_vec_info stmt_info)
5940 : {
5941 3155396 : gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
       /* Only plain assignments without a data reference are candidates.  */
5942 2881896 : if (!stmt || STMT_VINFO_DATA_REF (stmt_info))
5943 : return false;
5944 :
5945 925259 : tree lhs = gimple_assign_lhs (stmt);
5946 925259 : tree_code code = gimple_assign_rhs_code (stmt);
5947 925259 : tree rhs = gimple_assign_rhs1 (stmt);
5948 :
       /* These two rhs codes are accepted without looking at the types.  */
5949 925259 : if (code == SSA_NAME || code == VIEW_CONVERT_EXPR)
5950 : return true;
5951 :
       /* A conversion qualifies only if it is a nop between the two types.  */
5952 922357 : if (CONVERT_EXPR_CODE_P (code))
5953 229058 : return tree_nop_conversion_p (TREE_TYPE (lhs), TREE_TYPE (rhs));
5954 :
5955 : return false;
5956 : }
5957 :
5958 : /* Function vectorizable_assignment.
5959 :
5960 : Check if STMT_INFO performs an assignment (copy) that can be vectorized.
5961 : If COST_VEC is passed, calculate costs but don't change anything,
5962 : otherwise, vectorize STMT_INFO: create a vectorized stmt to replace
5963 : it, and insert it at GSI.
5964 : Return true if STMT_INFO is vectorizable in this way.

       Accepted statements are gassigns to an SSA_NAME, without a data
       reference, whose rhs is a single operand, a PAREN_EXPR or a conversion.
       SLP_NODE supplies the output vector type and the operand definitions.
       With COST_VEC set (analysis) a successful call marks SLP_NODE as
       assignment_vec_info_type and, unless vect_nop_conversion_p holds for
       STMT_INFO, records a cost through vect_model_simple_cost.  Without
       COST_VEC (transform) one vector assignment is emitted per vector
       operand and pushed onto SLP_NODE.  */
5965 :
5966 : static bool
5967 2052043 : vectorizable_assignment (vec_info *vinfo,
5968 : stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
5969 : slp_tree slp_node,
5970 : stmt_vector_for_cost *cost_vec)
5971 : {
5972 2052043 : tree vec_dest;
5973 2052043 : tree scalar_dest;
5974 2052043 : tree op;
5975 2052043 : tree new_temp;
5976 2052043 : enum vect_def_type dt[1] = {vect_unknown_def_type};
5977 2052043 : int i;
5978 2052043 : vec<tree> vec_oprnds = vNULL;
5979 2052043 : tree vop;
5980 2052043 : bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
5981 2052043 : enum tree_code code;
5982 2052043 : tree vectype_in;
5983 :
       /* Irrelevant statements are only considered when VINFO is a
          bb_vec_info.  */
5984 2052043 : if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
5985 : return false;
5986 :
       /* Definitions other than vect_internal_def are rejected during
          analysis only; with COST_VEC null this test never fires.  */
5987 2052043 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
5988 234683 : && cost_vec)
5989 : return false;
5990 :
5991 : /* Is vectorizable assignment? */
5992 3712090 : gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
5993 1745244 : if (!stmt)
5994 : return false;
5995 :
5996 1745244 : scalar_dest = gimple_assign_lhs (stmt);
5997 1745244 : if (TREE_CODE (scalar_dest) != SSA_NAME)
5998 : return false;
5999 :
       /* Statements with a data reference are not handled here.  */
6000 931216 : if (STMT_VINFO_DATA_REF (stmt_info))
6001 : return false;
6002 :
       /* Only single-operand copies, PAREN_EXPR and conversions qualify.  */
6003 392112 : code = gimple_assign_rhs_code (stmt);
6004 392112 : if (!(gimple_assign_single_p (stmt)
6005 390583 : || code == PAREN_EXPR
6006 389407 : || CONVERT_EXPR_CODE_P (code)))
6007 : return false;
6008 :
       /* Output vector type recorded on the SLP node, and its element
          count.  */
6009 95799 : tree vectype = SLP_TREE_VECTYPE (slp_node);
6010 95799 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
6011 :
6012 95799 : slp_tree slp_op;
6013 95799 : if (!vect_is_simple_use (vinfo, slp_node, 0, &op, &slp_op,
6014 : &dt[0], &vectype_in))
6015 : {
6016 0 : if (dump_enabled_p ())
6017 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6018 : "use not simple.\n");
6019 0 : return false;
6020 : }
       /* No vector type came back for the operand: derive one from the
          operand's scalar type.  */
6021 95799 : if (!vectype_in)
6022 17667 : vectype_in = get_vectype_for_scalar_type (vinfo, TREE_TYPE (op), slp_node);
6023 :
6024 : /* We can handle VIEW_CONVERT conversions that do not change the number
6025 : of elements or the vector size or other conversions when the component
6026 : types are nop-convertible. */
6027 95799 : if (!vectype_in
6028 95521 : || maybe_ne (TYPE_VECTOR_SUBPARTS (vectype_in), nunits)
6029 88404 : || (code == VIEW_CONVERT_EXPR
6030 2802 : && maybe_ne (GET_MODE_SIZE (TYPE_MODE (vectype)),
6031 2802 : GET_MODE_SIZE (TYPE_MODE (vectype_in))))
6032 184203 : || (CONVERT_EXPR_CODE_P (code)
6033 85731 : && !tree_nop_conversion_p (TREE_TYPE (vectype),
6034 85731 : TREE_TYPE (vectype_in))))
6035 10334 : return false;
6036 :
       /* Exactly one side being a boolean vector type is not supported.  */
6037 256299 : if (VECTOR_BOOLEAN_TYPE_P (vectype) != VECTOR_BOOLEAN_TYPE_P (vectype_in))
6038 : {
6039 2 : if (dump_enabled_p ())
6040 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6041 : "can't convert between boolean and non "
6042 0 : "boolean vectors %T\n", TREE_TYPE (op));
6043 :
6044 2 : return false;
6045 : }
6046 :
6047 : /* We do not handle bit-precision changes. */
6048 85463 : if ((CONVERT_EXPR_CODE_P (code)
6049 2673 : || code == VIEW_CONVERT_EXPR)
6050 84191 : && ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
6051 82845 : && !type_has_mode_precision_p (TREE_TYPE (scalar_dest)))
6052 83876 : || (INTEGRAL_TYPE_P (TREE_TYPE (op))
6053 79069 : && !type_has_mode_precision_p (TREE_TYPE (op))))
6054 : /* But a conversion that does not change the bit-pattern is ok. */
6055 86193 : && !(INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest))
6056 730 : && INTEGRAL_TYPE_P (TREE_TYPE (op))
6057 730 : && (((TYPE_PRECISION (TREE_TYPE (scalar_dest))
6058 730 : > TYPE_PRECISION (TREE_TYPE (op)))
6059 415 : && TYPE_UNSIGNED (TREE_TYPE (op)))
6060 331 : || (TYPE_PRECISION (TREE_TYPE (scalar_dest))
6061 331 : == TYPE_PRECISION (TREE_TYPE (op))))))
6062 : {
6063 266 : if (dump_enabled_p ())
6064 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6065 : "type conversion to/from bit-precision "
6066 : "unsupported.\n");
6067 266 : return false;
6068 : }
6069 :
6070 85197 : if (cost_vec) /* transformation not required. */
6071 : {
6072 69173 : if (!vect_maybe_update_slp_op_vectype (slp_op, vectype_in))
6073 : {
6074 0 : if (dump_enabled_p ())
6075 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6076 : "incompatible vector types for invariants\n");
6077 0 : return false;
6078 : }
6079 69173 : SLP_TREE_TYPE (slp_node) = assignment_vec_info_type;
6080 69173 : DUMP_VECT_SCOPE ("vectorizable_assignment");
       /* No cost is recorded for statements vect_nop_conversion_p
          accepts.  */
6081 69173 : if (!vect_nop_conversion_p (stmt_info))
6082 962 : vect_model_simple_cost (vinfo, 1, slp_node, cost_vec);
6083 69173 : return true;
6084 : }
6085 :
6086 : /* Transform. */
6087 16024 : if (dump_enabled_p ())
6088 3625 : dump_printf_loc (MSG_NOTE, vect_location, "transform assignment.\n");
6089 :
6090 : /* Handle def. */
6091 16024 : vec_dest = vect_create_destination_var (scalar_dest, vectype);
6092 :
6093 : /* Handle use. */
6094 16024 : vect_get_vec_defs (vinfo, slp_node, op, &vec_oprnds);
6095 :
6096 : /* Arguments are ready. create the new vector stmt. */
6097 36189 : FOR_EACH_VEC_ELT (vec_oprnds, i, vop)
6098 : {
       /* Conversions are emitted as a VIEW_CONVERT_EXPR to VECTYPE; any
          other accepted rhs assigns the vector operand directly.  */
6099 20165 : if (CONVERT_EXPR_CODE_P (code)
6100 679 : || code == VIEW_CONVERT_EXPR)
6101 19620 : vop = build1 (VIEW_CONVERT_EXPR, vectype, vop);
6102 20165 : gassign *new_stmt = gimple_build_assign (vec_dest, vop);
6103 20165 : new_temp = make_ssa_name (vec_dest, new_stmt);
6104 20165 : gimple_assign_set_lhs (new_stmt, new_temp);
6105 20165 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
6106 20165 : slp_node->push_vec_def (new_stmt);
6107 : }
6108 :
6109 16024 : vec_oprnds.release ();
6110 16024 : return true;
6111 : }
6112 :
6113 :
6114 : /* Return TRUE if CODE (a shift operation) is supported for SCALAR_TYPE
6115 : either as shift by a scalar or by a vector.

       The vector type is obtained from VINFO via get_vectype_for_scalar_type;
       FALSE is returned when there is none.  The optab_scalar variant of the
       optab for CODE is tried first and the optab_vector variant second;
       either one being implementable for the vector type's mode suffices.  */
6116 :
6117 : bool
6118 298191 : vect_supportable_shift (vec_info *vinfo, enum tree_code code, tree scalar_type)
6119 : {
6120 298191 : optab optab;
6121 298191 : tree vectype;
6122 :
6123 298191 : vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
6124 298191 : if (!vectype)
6125 : return false;
6126 :
       /* First choice: vector shifted by a scalar amount.  */
6127 298191 : optab = optab_for_tree_code (code, vectype, optab_scalar);
6128 298191 : if (optab && can_implement_p (optab, TYPE_MODE (vectype)))
6129 : return true;
6130 :
       /* Second choice: vector shifted by a vector of amounts.  */
6131 262058 : optab = optab_for_tree_code (code, vectype, optab_vector);
6132 262058 : if (optab && can_implement_p (optab, TYPE_MODE (vectype)))
6133 : return true;
6134 :
6135 : return false;
6136 : }
6137 :
6138 :
6139 : /* Function vectorizable_shift.
6140 :
6141 : Check if STMT_INFO performs a shift operation that can be vectorized.
6142 : If COST_VEC is passed, calculate costs but don't change anything,
6143 : otherwise, vectorize STMT_INFO: create a vectorized stmt to replace
6144 : it, and insert it at GSI.
6145 : Return true if STMT_INFO is vectorizable in this way.

       Handled codes are LSHIFT_EXPR, RSHIFT_EXPR, LROTATE_EXPR and
       RROTATE_EXPR on an SSA_NAME lhs whose scalar type has mode precision.
       The function decides whether the shift amount can stay a scalar
       (SCALAR_SHIFT_ARG, using the optab_scalar optab) or has to be a vector
       (optab_vector optab), and rejects the statement when the chosen optab
       has no handler for the vector mode.  With COST_VEC set a successful
       call marks SLP_NODE as shift_vec_info_type and records a cost through
       vect_model_simple_cost; without it one shift/rotate is emitted per
       vector of operand 0 and pushed onto SLP_NODE.  */
6146 :
6147 : static bool
6148 726048 : vectorizable_shift (vec_info *vinfo,
6149 : stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
6150 : slp_tree slp_node,
6151 : stmt_vector_for_cost *cost_vec)
6152 : {
6153 726048 : tree vec_dest;
6154 726048 : tree scalar_dest;
6155 726048 : tree op0, op1 = NULL;
6156 726048 : tree vec_oprnd1 = NULL_TREE;
6157 726048 : tree vectype;
6158 726048 : enum tree_code code;
6159 726048 : machine_mode vec_mode;
6160 726048 : tree new_temp;
6161 726048 : optab optab;
6162 726048 : int icode;
6163 726048 : machine_mode optab_op2_mode;
6164 726048 : enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type};
6165 726048 : poly_uint64 nunits_in;
6166 726048 : poly_uint64 nunits_out;
6167 726048 : tree vectype_out;
6168 726048 : tree op1_vectype;
6169 726048 : int i;
6170 726048 : vec<tree> vec_oprnds0 = vNULL;
6171 726048 : vec<tree> vec_oprnds1 = vNULL;
6172 726048 : tree vop0, vop1;
6173 726048 : unsigned int k;
6174 726048 : bool scalar_shift_arg = true;
6175 726048 : bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
6176 726048 : bool incompatible_op1_vectype_p = false;
6177 :
       /* Irrelevant statements are only considered when VINFO is a
          bb_vec_info.  */
6178 726048 : if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
6179 : return false;
6180 :
       /* During analysis only vect_internal_def and vect_nested_cycle
          definitions are accepted; with COST_VEC null this test never
          fires.  */
6181 726048 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
6182 234683 : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle
6183 233193 : && cost_vec)
6184 : return false;
6185 :
6186 : /* Is STMT a vectorizable shift/rotate assignment? */
6187 1089472 : gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
6188 421739 : if (!stmt)
6189 : return false;
6190 :
6191 421739 : if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
6192 : return false;
6193 :
6194 421181 : code = gimple_assign_rhs_code (stmt);
6195 :
6196 421181 : if (!(code == LSHIFT_EXPR || code == RSHIFT_EXPR || code == LROTATE_EXPR
6197 : || code == RROTATE_EXPR))
6198 : return false;
6199 :
6200 64437 : scalar_dest = gimple_assign_lhs (stmt);
6201 64437 : vectype_out = SLP_TREE_VECTYPE (slp_node);
6202 64437 : if (!type_has_mode_precision_p (TREE_TYPE (scalar_dest)))
6203 : {
6204 0 : if (dump_enabled_p ())
6205 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6206 : "bit-precision shifts not supported.\n");
6207 0 : return false;
6208 : }
6209 :
6210 64437 : slp_tree slp_op0;
6211 64437 : if (!vect_is_simple_use (vinfo, slp_node,
6212 : 0, &op0, &slp_op0, &dt[0], &vectype))
6213 : {
6214 0 : if (dump_enabled_p ())
6215 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6216 : "use not simple.\n");
6217 0 : return false;
6218 : }
6219 : /* If op0 is an external or constant def, infer the vector type
6220 : from the scalar type. */
6221 64437 : if (!vectype)
6222 15480 : vectype = get_vectype_for_scalar_type (vinfo, TREE_TYPE (op0), slp_node);
       /* At transform time a vector type must exist; during analysis a
          missing one is an ordinary rejection.  */
6223 64437 : if (!cost_vec)
6224 8560 : gcc_assert (vectype);
6225 64437 : if (!vectype)
6226 : {
6227 0 : if (dump_enabled_p ())
6228 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6229 : "no vectype for scalar type\n");
6230 0 : return false;
6231 : }
6232 :
       /* Input and output vectors must have the same number of elements.  */
6233 64437 : nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6234 64437 : nunits_in = TYPE_VECTOR_SUBPARTS (vectype);
6235 64437 : if (maybe_ne (nunits_out, nunits_in))
6236 : return false;
6237 :
6238 64437 : stmt_vec_info op1_def_stmt_info;
6239 64437 : slp_tree slp_op1;
6240 64437 : if (!vect_is_simple_use (vinfo, slp_node, 1, &op1, &slp_op1,
6241 : &dt[1], &op1_vectype, &op1_def_stmt_info))
6242 : {
6243 0 : if (dump_enabled_p ())
6244 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6245 : "use not simple.\n");
6246 0 : return false;
6247 : }
6248 :
6249 : /* Determine whether the shift amount is a vector, or scalar. If the
6250 : shift/rotate amount is a vector, use the vector/vector shift optabs. */
6251 :
       /* With a single SLP lane, an internal, induction or nested-cycle
          shift amount is always treated as a vector.  */
6252 64437 : if ((dt[1] == vect_internal_def
6253 64437 : || dt[1] == vect_induction_def
6254 48112 : || dt[1] == vect_nested_cycle)
6255 16343 : && SLP_TREE_LANES (slp_node) == 1)
6256 : scalar_shift_arg = false;
6257 48149 : else if (dt[1] == vect_constant_def
6258 : || dt[1] == vect_external_def
6259 48149 : || dt[1] == vect_internal_def)
6260 : {
6261 : /* In SLP, need to check whether the shift count is the same,
6262 : in loops if it is a constant or invariant, it is always
6263 : a scalar shift. */
6264 48143 : vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
6265 48143 : stmt_vec_info slpstmt_info;
6266 :
6267 126221 : FOR_EACH_VEC_ELT (stmts, k, slpstmt_info)
6268 78078 : if (slpstmt_info)
6269 : {
6270 78078 : gassign *slpstmt = as_a <gassign *> (slpstmt_info->stmt);
6271 156156 : if (!operand_equal_p (gimple_assign_rhs2 (slpstmt), op1, 0))
6272 78078 : scalar_shift_arg = false;
6273 : }
6274 :
6275 : /* For internal SLP defs we have to make sure we see scalar stmts
6276 : for all vector elements.
6277 : ??? For different vectors we could resort to a different
6278 : scalar shift operand but code-generation below simply always
6279 : takes the first. */
6280 48143 : if (dt[1] == vect_internal_def
6281 48192 : && maybe_ne (nunits_out * vect_get_num_copies (vinfo, slp_node),
6282 49 : stmts.length ()))
6283 : scalar_shift_arg = false;
6284 :
6285 : /* If the shift amount is computed by a pattern stmt we cannot
6286 : use the scalar amount directly thus give up and use a vector
6287 : shift. */
6288 48143 : if (op1_def_stmt_info && is_pattern_stmt_p (op1_def_stmt_info))
6289 : scalar_shift_arg = false;
6290 : }
6291 : else
6292 : {
6293 6 : if (dump_enabled_p ())
6294 6 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6295 : "operand mode requires invariant argument.\n");
6296 6 : return false;
6297 : }
6298 :
6299 : /* Vector shifted by vector. */
       /* Remember the verdict reached above: the optab fallback further
          down may still clear SCALAR_SHIFT_ARG, and the transform code
          distinguishes the two situations.  */
6300 64469 : bool was_scalar_shift_arg = scalar_shift_arg;
6301 48134 : if (!scalar_shift_arg)
6302 : {
6303 16335 : optab = optab_for_tree_code (code, vectype, optab_vector);
6304 16335 : if (dump_enabled_p ())
6305 1205 : dump_printf_loc (MSG_NOTE, vect_location,
6306 : "vector/vector shift/rotate found.\n");
6307 :
6308 16335 : if (!op1_vectype)
6309 15 : op1_vectype = get_vectype_for_scalar_type (vinfo, TREE_TYPE (op1),
6310 : slp_op1);
       /* The amount vector has to match VECTYPE in element count and
          mode.  */
6311 16335 : incompatible_op1_vectype_p
6312 32670 : = (op1_vectype == NULL_TREE
6313 16335 : || maybe_ne (TYPE_VECTOR_SUBPARTS (op1_vectype),
6314 16335 : TYPE_VECTOR_SUBPARTS (vectype))
6315 32668 : || TYPE_MODE (op1_vectype) != TYPE_MODE (vectype));
       /* A mismatch is tolerated only for a constant operand node whose
          refcnt is 1; its scalar ops are converted in place during
          analysis below.  */
6316 16328 : if (incompatible_op1_vectype_p
6317 7 : && (SLP_TREE_DEF_TYPE (slp_op1) != vect_constant_def
6318 1 : || slp_op1->refcnt != 1))
6319 : {
6320 6 : if (dump_enabled_p ())
6321 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6322 : "unusable type for last operand in"
6323 : " vector/vector shift/rotate.\n");
6324 6 : return false;
6325 : }
6326 : }
6327 : /* See if the machine has a vector shifted by scalar insn and if not
6328 : then see if it has a vector shifted by vector insn. */
6329 : else
6330 : {
6331 48096 : optab = optab_for_tree_code (code, vectype, optab_scalar);
6332 48096 : if (optab
6333 48096 : && can_implement_p (optab, TYPE_MODE (vectype)))
6334 : {
6335 48096 : if (dump_enabled_p ())
6336 4946 : dump_printf_loc (MSG_NOTE, vect_location,
6337 : "vector/scalar shift/rotate found.\n");
6338 : }
6339 : else
6340 : {
6341 0 : optab = optab_for_tree_code (code, vectype, optab_vector);
6342 0 : if (optab
6343 0 : && can_implement_p (optab, TYPE_MODE (vectype)))
6344 : {
6345 0 : scalar_shift_arg = false;
6346 :
6347 0 : if (dump_enabled_p ())
6348 0 : dump_printf_loc (MSG_NOTE, vect_location,
6349 : "vector/vector shift/rotate found.\n");
6350 :
6351 0 : if (!op1_vectype)
6352 0 : op1_vectype = get_vectype_for_scalar_type (vinfo,
6353 0 : TREE_TYPE (op1),
6354 : slp_op1);
6355 :
6356 : /* Unlike the other binary operators, shifts/rotates have
6357 : the rhs being int, instead of the same type as the lhs,
6358 : so make sure the scalar is the right type if we are
6359 : dealing with vectors of long long/long/short/char. */
6360 0 : incompatible_op1_vectype_p
6361 0 : = (!op1_vectype
6362 0 : || !tree_nop_conversion_p (TREE_TYPE (vectype),
6363 0 : TREE_TYPE (op1)));
6364 0 : if (incompatible_op1_vectype_p
6365 0 : && dt[1] == vect_internal_def)
6366 : {
6367 0 : if (dump_enabled_p ())
6368 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6369 : "unusable type for last operand in"
6370 : " vector/vector shift/rotate.\n");
6371 0 : return false;
6372 : }
6373 : }
6374 : }
6375 : }
6376 :
6377 : /* Supportable by target? */
6378 64425 : if (!optab)
6379 : {
6380 0 : if (dump_enabled_p ())
6381 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6382 : "no shift optab for %s and %T.\n",
6383 : get_tree_code_name (code), vectype);
6384 0 : return false;
6385 : }
6386 64425 : vec_mode = TYPE_MODE (vectype);
6387 64425 : icode = (int) optab_handler (optab, vec_mode);
6388 64425 : if (icode == CODE_FOR_nothing)
6389 : {
6390 6110 : if (dump_enabled_p ())
6391 900 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6392 : "shift op not supported by target.\n");
6393 6110 : return false;
6394 : }
6395 : /* vector lowering cannot optimize vector shifts using word arithmetic. */
6396 58315 : if (vect_emulated_vector_p (vectype))
6397 : return false;
6398 :
6399 58315 : if (cost_vec) /* transformation not required. */
6400 : {
       /* Operand 0 always gets VECTYPE.  Operand 1 is updated only when
          it is used as a vector (or is an internal def) and its type is
          either compatible or a constant that will be converted.  */
6401 49755 : if (!vect_maybe_update_slp_op_vectype (slp_op0, vectype)
6402 49755 : || ((!scalar_shift_arg || dt[1] == vect_internal_def)
6403 8077 : && (!incompatible_op1_vectype_p
6404 1 : || dt[1] == vect_constant_def)
6405 8077 : && !vect_maybe_update_slp_op_vectype
6406 8077 : (slp_op1,
6407 : incompatible_op1_vectype_p ? vectype : op1_vectype)))
6408 : {
6409 0 : if (dump_enabled_p ())
6410 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6411 : "incompatible vector types for invariants\n");
6412 0 : return false;
6413 : }
6414 : /* Now adjust the constant shift amount in place. */
6415 49755 : if (incompatible_op1_vectype_p
6416 1 : && dt[1] == vect_constant_def)
6417 4 : for (unsigned i = 0;
6418 5 : i < SLP_TREE_SCALAR_OPS (slp_op1).length (); ++i)
6419 : {
6420 4 : SLP_TREE_SCALAR_OPS (slp_op1)[i]
6421 4 : = fold_convert (TREE_TYPE (vectype),
6422 : SLP_TREE_SCALAR_OPS (slp_op1)[i]);
6423 4 : gcc_assert ((TREE_CODE (SLP_TREE_SCALAR_OPS (slp_op1)[i])
6424 : == INTEGER_CST));
6425 : }
6426 49755 : SLP_TREE_TYPE (slp_node) = shift_vec_info_type;
6427 49755 : DUMP_VECT_SCOPE ("vectorizable_shift");
6428 49755 : vect_model_simple_cost (vinfo, 1, slp_node, cost_vec);
6429 49755 : return true;
6430 : }
6431 :
6432 : /* Transform. */
6433 :
6434 8560 : if (dump_enabled_p ())
6435 2033 : dump_printf_loc (MSG_NOTE, vect_location,
6436 : "transform binary/unary operation.\n");
6437 :
6438 : /* Handle def. */
6439 8560 : vec_dest = vect_create_destination_var (scalar_dest, vectype);
6440 :
6441 8560 : unsigned nvectors = vect_get_num_copies (vinfo, slp_node);
6442 8560 : if (scalar_shift_arg && dt[1] != vect_internal_def)
6443 : {
6444 : /* Vector shl and shr insn patterns can be defined with scalar
6445 : operand 2 (shift operand). In this case, use constant or loop
6446 : invariant op1 directly, without extending it to vector mode
6447 : first. */
6448 6398 : optab_op2_mode = insn_data[icode].operand[2].mode;
6449 6398 : if (!VECTOR_MODE_P (optab_op2_mode))
6450 : {
6451 6398 : if (dump_enabled_p ())
6452 1918 : dump_printf_loc (MSG_NOTE, vect_location,
6453 : "operand 1 using scalar mode.\n");
6454 6398 : vec_oprnd1 = op1;
6455 6398 : vec_oprnds1.create (nvectors);
6456 6398 : vec_oprnds1.quick_push (vec_oprnd1);
6457 : /* Store vec_oprnd1 for every vector stmt to be created.
6458 : We check during the analysis that all the shift arguments
6459 : are the same.
6460 : TODO: Allow different constants for different vector
6461 : stmts generated for an SLP instance. */
6462 14865 : for (k = 0; k < nvectors - 1; k++)
6463 2069 : vec_oprnds1.quick_push (vec_oprnd1);
6464 : }
6465 : }
6466 2162 : else if (!scalar_shift_arg && incompatible_op1_vectype_p)
6467 : {
6468 0 : if (was_scalar_shift_arg)
6469 : {
6470 : /* If the argument was the same in all lanes create the
6471 : correctly typed vector shift amount directly. Note
6472 : we made SLP scheduling think we use the original scalars,
6473 : so place the compensation code next to the shift which
6474 : is conservative. See PR119640 where it otherwise breaks. */
6475 0 : op1 = fold_convert (TREE_TYPE (vectype), op1);
6476 0 : op1 = vect_init_vector (vinfo, stmt_info, op1, TREE_TYPE (vectype),
6477 : gsi);
6478 0 : vec_oprnd1 = vect_init_vector (vinfo, stmt_info, op1, vectype,
6479 : gsi);
6480 0 : vec_oprnds1.create (nvectors);
6481 0 : for (k = 0; k < nvectors; k++)
6482 0 : vec_oprnds1.quick_push (vec_oprnd1);
6483 : }
6484 0 : else if (dt[1] == vect_constant_def)
6485 : /* The constant shift amount has been adjusted in place. */
6486 : ;
6487 : else
6488 0 : gcc_assert (TYPE_MODE (op1_vectype) == TYPE_MODE (vectype));
6489 : }
6490 :
6491 : /* vec_oprnd1 is available if operand 1 should be of a scalar-type
6492 : (a special case for certain kind of vector shifts); otherwise,
6493 : operand 1 should be of a vector type (the usual case). */
6494 2162 : vect_get_vec_defs (vinfo, slp_node,
6495 : op0, &vec_oprnds0,
6496 8560 : vec_oprnd1 ? NULL_TREE : op1, &vec_oprnds1);
6497 :
6498 : /* Arguments are ready. Create the new vector stmt. */
6499 22691 : FOR_EACH_VEC_ELT (vec_oprnds0, i, vop0)
6500 : {
6501 : /* For internal defs where we need to use a scalar shift arg
6502 : extract the first lane. */
6503 14131 : if (scalar_shift_arg && dt[1] == vect_internal_def)
6504 : {
       /* A BIT_FIELD_REF at bit offset zero, one element wide, reads
          the amount out of the first vector of operand 1.  */
6505 10 : vop1 = vec_oprnds1[0];
6506 10 : new_temp = make_ssa_name (TREE_TYPE (TREE_TYPE (vop1)));
6507 10 : gassign *new_stmt
6508 10 : = gimple_build_assign (new_temp,
6509 10 : build3 (BIT_FIELD_REF, TREE_TYPE (new_temp),
6510 : vop1,
6511 10 : TYPE_SIZE (TREE_TYPE (new_temp)),
6512 : bitsize_zero_node));
6513 10 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
6514 10 : vop1 = new_temp;
6515 10 : }
6516 : else
6517 14121 : vop1 = vec_oprnds1[i];
6518 14131 : gassign *new_stmt = gimple_build_assign (vec_dest, code, vop0, vop1);
6519 14131 : new_temp = make_ssa_name (vec_dest, new_stmt);
6520 14131 : gimple_assign_set_lhs (new_stmt, new_temp);
6521 14131 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
6522 14131 : slp_node->push_vec_def (new_stmt);
6523 : }
6524 :
6525 8560 : vec_oprnds0.release ();
6526 8560 : vec_oprnds1.release ();
6527 :
6528 8560 : return true;
6529 : }
6530 :
6531 : /* Function vectorizable_operation.
6532 :
6533 : Check if STMT_INFO performs a binary, unary or ternary operation that can
6534 : be vectorized.
6535 : If COST_VEC is passed, calculate costs but don't change anything,
6536 : otherwise, vectorize STMT_INFO: create a vectorized stmt to replace
6537 : it, and insert it at GSI.
6538 : Return true if STMT_INFO is vectorizable in this way. */
6539 :
6540 : static bool
6541 2687426 : vectorizable_operation (vec_info *vinfo,
6542 : stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
6543 : slp_tree slp_node,
6544 : stmt_vector_for_cost *cost_vec)
6545 : {
6546 2687426 : tree vec_dest;
6547 2687426 : tree scalar_dest;
6548 2687426 : tree op0, op1 = NULL_TREE, op2 = NULL_TREE;
6549 2687426 : tree vectype;
6550 2687426 : loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6551 2687426 : enum tree_code code, orig_code;
6552 2687426 : machine_mode vec_mode;
6553 2687426 : tree new_temp;
6554 2687426 : int op_type;
6555 2687426 : optab optab;
6556 2687426 : bool target_support_p;
6557 2687426 : enum vect_def_type dt[3]
6558 : = {vect_unknown_def_type, vect_unknown_def_type, vect_unknown_def_type};
6559 2687426 : poly_uint64 nunits_in;
6560 2687426 : poly_uint64 nunits_out;
6561 2687426 : tree vectype_out;
6562 2687426 : int i;
6563 2687426 : vec<tree> vec_oprnds0 = vNULL;
6564 2687426 : vec<tree> vec_oprnds1 = vNULL;
6565 2687426 : vec<tree> vec_oprnds2 = vNULL;
6566 2687426 : tree vop0, vop1, vop2;
6567 2687426 : bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
6568 :
6569 2687426 : if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
6570 : return false;
6571 :
6572 2687426 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
6573 234683 : && cost_vec)
6574 : return false;
6575 :
6576 : /* Is STMT a vectorizable binary/unary operation? */
6577 4416646 : gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
6578 2380627 : if (!stmt)
6579 : return false;
6580 :
6581 : /* Loads and stores are handled in vectorizable_{load,store}. */
6582 2380627 : if (STMT_VINFO_DATA_REF (stmt_info))
6583 : return false;
6584 :
6585 1027495 : orig_code = code = gimple_assign_rhs_code (stmt);
6586 :
6587 : /* Shifts are handled in vectorizable_shift. */
6588 1027495 : if (code == LSHIFT_EXPR
6589 : || code == RSHIFT_EXPR
6590 : || code == LROTATE_EXPR
6591 1027495 : || code == RROTATE_EXPR)
6592 : return false;
6593 :
6594 : /* Comparisons are handled in vectorizable_comparison. */
6595 971618 : if (TREE_CODE_CLASS (code) == tcc_comparison)
6596 : return false;
6597 :
6598 : /* Conditions are handled in vectorizable_condition. */
6599 787542 : if (code == COND_EXPR)
6600 : return false;
6601 :
6602 : /* For pointer addition and subtraction, we should use the normal
6603 : plus and minus for the vector operation. */
6604 761193 : if (code == POINTER_PLUS_EXPR)
6605 : code = PLUS_EXPR;
6606 742591 : if (code == POINTER_DIFF_EXPR)
6607 945 : code = MINUS_EXPR;
6608 :
6609 : /* Support only unary, binary or ternary operations. */
6610 761193 : op_type = TREE_CODE_LENGTH (code);
6611 761193 : if (op_type != unary_op && op_type != binary_op && op_type != ternary_op)
6612 : {
6613 0 : if (dump_enabled_p ())
6614 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6615 : "num. args = %d (not unary/binary/ternary op).\n",
6616 : op_type);
6617 0 : return false;
6618 : }
6619 :
6620 761193 : scalar_dest = gimple_assign_lhs (stmt);
6621 761193 : vectype_out = SLP_TREE_VECTYPE (slp_node);
6622 :
6623 : /* Most operations cannot handle bit-precision types without extra
6624 : truncations. */
6625 761193 : bool mask_op_p = VECTOR_BOOLEAN_TYPE_P (vectype_out);
6626 750151 : if (!mask_op_p
6627 750151 : && !type_has_mode_precision_p (TREE_TYPE (scalar_dest))
6628 : /* Exception are bitwise binary operations. */
6629 : && code != BIT_IOR_EXPR
6630 1400 : && code != BIT_XOR_EXPR
6631 894 : && code != BIT_AND_EXPR)
6632 : {
6633 690 : if (dump_enabled_p ())
6634 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6635 : "bit-precision arithmetic not supported.\n");
6636 690 : return false;
6637 : }
6638 :
6639 760503 : slp_tree slp_op0;
6640 760503 : if (!vect_is_simple_use (vinfo, slp_node,
6641 : 0, &op0, &slp_op0, &dt[0], &vectype))
6642 : {
6643 0 : if (dump_enabled_p ())
6644 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6645 : "use not simple.\n");
6646 0 : return false;
6647 : }
6648 760503 : bool is_invariant = (dt[0] == vect_external_def
6649 760503 : || dt[0] == vect_constant_def);
6650 : /* If op0 is an external or constant def, infer the vector type
6651 : from the scalar type. */
6652 760503 : if (!vectype)
6653 : {
6654 : /* For boolean type we cannot determine vectype by
6655 : invariant value (don't know whether it is a vector
6656 : of booleans or vector of integers). We use output
6657 : vectype because operations on boolean don't change
6658 : type. */
6659 66915 : if (VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (op0)))
6660 : {
6661 1418 : if (!VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (scalar_dest)))
6662 : {
6663 228 : if (dump_enabled_p ())
6664 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6665 : "not supported operation on bool value.\n");
6666 228 : return false;
6667 : }
6668 1190 : vectype = vectype_out;
6669 : }
6670 : else
6671 65497 : vectype = get_vectype_for_scalar_type (vinfo, TREE_TYPE (op0),
6672 : slp_node);
6673 : }
6674 760275 : if (!cost_vec)
6675 114588 : gcc_assert (vectype);
6676 760275 : if (!vectype)
6677 : {
6678 290 : if (dump_enabled_p ())
6679 2 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6680 : "no vectype for scalar type %T\n",
6681 2 : TREE_TYPE (op0));
6682 :
6683 290 : return false;
6684 : }
6685 :
6686 759985 : nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6687 759985 : nunits_in = TYPE_VECTOR_SUBPARTS (vectype);
6688 759985 : if (maybe_ne (nunits_out, nunits_in)
6689 759985 : || !tree_nop_conversion_p (TREE_TYPE (vectype_out), TREE_TYPE (vectype)))
6690 11731 : return false;
6691 :
6692 748254 : tree vectype2 = NULL_TREE, vectype3 = NULL_TREE;
6693 748254 : slp_tree slp_op1 = NULL, slp_op2 = NULL;
6694 748254 : if (op_type == binary_op || op_type == ternary_op)
6695 : {
6696 669385 : if (!vect_is_simple_use (vinfo, slp_node,
6697 : 1, &op1, &slp_op1, &dt[1], &vectype2))
6698 : {
6699 0 : if (dump_enabled_p ())
6700 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6701 : "use not simple.\n");
6702 0 : return false;
6703 : }
6704 669385 : is_invariant &= (dt[1] == vect_external_def
6705 669385 : || dt[1] == vect_constant_def);
6706 669385 : if (vectype2
6707 1136738 : && (maybe_ne (nunits_out, TYPE_VECTOR_SUBPARTS (vectype2))
6708 467353 : || !tree_nop_conversion_p (TREE_TYPE (vectype_out),
6709 467353 : TREE_TYPE (vectype2))))
6710 4 : return false;
6711 : }
6712 748250 : if (op_type == ternary_op)
6713 : {
6714 0 : if (!vect_is_simple_use (vinfo, slp_node,
6715 : 2, &op2, &slp_op2, &dt[2], &vectype3))
6716 : {
6717 0 : if (dump_enabled_p ())
6718 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6719 : "use not simple.\n");
6720 0 : return false;
6721 : }
6722 0 : is_invariant &= (dt[2] == vect_external_def
6723 0 : || dt[2] == vect_constant_def);
6724 0 : if (vectype3
6725 0 : && (maybe_ne (nunits_out, TYPE_VECTOR_SUBPARTS (vectype3))
6726 0 : || !tree_nop_conversion_p (TREE_TYPE (vectype_out),
6727 0 : TREE_TYPE (vectype3))))
6728 0 : return false;
6729 : }
6730 :
6731 : /* Multiple types in SLP are handled by creating the appropriate number of
6732 : vectorized stmts for each SLP node. */
6733 748250 : auto vec_num = vect_get_num_copies (vinfo, slp_node);
6734 :
6735 : /* Reject attempts to combine mask types with nonmask types, e.g. if
6736 : we have an AND between a (nonmask) boolean loaded from memory and
6737 : a (mask) boolean result of a comparison.
6738 :
6739 : TODO: We could easily fix these cases up using pattern statements. */
6740 748250 : if (VECTOR_BOOLEAN_TYPE_P (vectype) != mask_op_p
6741 1207654 : || (vectype2 && VECTOR_BOOLEAN_TYPE_P (vectype2) != mask_op_p)
6742 1496500 : || (vectype3 && VECTOR_BOOLEAN_TYPE_P (vectype3) != mask_op_p))
6743 : {
6744 0 : if (dump_enabled_p ())
6745 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6746 : "mixed mask and nonmask vector types\n");
6747 0 : return false;
6748 : }
6749 :
6750 : /* Supportable by target? */
6751 :
6752 748250 : vec_mode = TYPE_MODE (vectype);
6753 748250 : optab = optab_for_tree_code (code, vectype, optab_default);
6754 748250 : if (!optab)
6755 : {
6756 68451 : if (dump_enabled_p ())
6757 5961 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6758 : "no optab for %s and %T.\n",
6759 : get_tree_code_name (code), vectype);
6760 68451 : return false;
6761 : }
6762 679799 : target_support_p = can_implement_p (optab, vec_mode);
6763 :
6764 679799 : bool using_emulated_vectors_p = vect_emulated_vector_p (vectype);
6765 679799 : if (!target_support_p || using_emulated_vectors_p)
6766 : {
6767 28815 : if (dump_enabled_p ())
6768 1124 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6769 : "op not supported by target.\n");
6770 : /* When vec_mode is not a vector mode and we have verified that ops
6771 : we do not have to lower, like AND, are natively supported, let
6772 : those through even when the mode isn't word_mode. For ops we do
6773 : have to lower, the lowering code assumes we are dealing with
6774 : word_mode. */
6775 57630 : if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype))
6776 28673 : || !GET_MODE_SIZE (vec_mode).is_constant ()
6777 28673 : || (((code == PLUS_EXPR || code == MINUS_EXPR || code == NEGATE_EXPR)
6778 23691 : || !target_support_p)
6779 60880 : && maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD))
6780 : /* Check only during analysis. */
6781 40564 : || (cost_vec && !vect_can_vectorize_without_simd_p (code)))
6782 : {
6783 28253 : if (dump_enabled_p ())
6784 1122 : dump_printf (MSG_NOTE, "using word mode not possible.\n");
6785 28253 : return false;
6786 : }
6787 562 : if (dump_enabled_p ())
6788 2 : dump_printf_loc (MSG_NOTE, vect_location,
6789 : "proceeding using word mode.\n");
6790 : using_emulated_vectors_p = true;
6791 : }
6792 :
6793 651546 : int reduc_idx = SLP_TREE_REDUC_IDX (slp_node);
6794 651546 : vec_loop_masks *masks = (loop_vinfo ? &LOOP_VINFO_MASKS (loop_vinfo) : NULL);
6795 433464 : vec_loop_lens *lens = (loop_vinfo ? &LOOP_VINFO_LENS (loop_vinfo) : NULL);
6796 651546 : internal_fn cond_fn = get_conditional_internal_fn (code);
6797 651546 : internal_fn cond_len_fn = get_conditional_len_internal_fn (code);
6798 :
6799 : /* If operating on inactive elements could generate spurious traps,
6800 : we need to restrict the operation to active lanes. Note that this
6801 : specifically doesn't apply to unhoisted invariants, since they
6802 : operate on the same value for every lane.
6803 :
6804 : Similarly, if this operation is part of a reduction, a fully-masked
6805 : loop should only change the active lanes of the reduction chain,
6806 : keeping the inactive lanes as-is. */
6807 624939 : bool mask_out_inactive = ((!is_invariant && gimple_could_trap_p (stmt))
6808 1212949 : || reduc_idx >= 0);
6809 :
6810 651546 : if (cost_vec) /* transformation not required. */
6811 : {
6812 536958 : if (loop_vinfo
6813 330247 : && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
6814 88784 : && mask_out_inactive)
6815 : {
6816 20416 : if (cond_len_fn != IFN_LAST
6817 20416 : && direct_internal_fn_supported_p (cond_len_fn, vectype,
6818 : OPTIMIZE_FOR_SPEED))
6819 0 : vect_record_loop_len (loop_vinfo, lens, vec_num, vectype,
6820 : 1);
6821 20416 : else if (cond_fn != IFN_LAST
6822 20416 : && direct_internal_fn_supported_p (cond_fn, vectype,
6823 : OPTIMIZE_FOR_SPEED))
6824 8514 : vect_record_loop_mask (loop_vinfo, masks, vec_num,
6825 : vectype, NULL);
6826 : else
6827 : {
6828 11902 : if (dump_enabled_p ())
6829 610 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6830 : "can't use a fully-masked loop because no"
6831 : " conditional operation is available.\n");
6832 11902 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
6833 : }
6834 : }
6835 :
6836 : /* Put types on constant and invariant SLP children. */
6837 536958 : if (!vect_maybe_update_slp_op_vectype (slp_op0, vectype)
6838 536912 : || !vect_maybe_update_slp_op_vectype (slp_op1, vectype)
6839 1073777 : || !vect_maybe_update_slp_op_vectype (slp_op2, vectype))
6840 : {
6841 139 : if (dump_enabled_p ())
6842 3 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6843 : "incompatible vector types for invariants\n");
6844 139 : return false;
6845 : }
6846 :
6847 536819 : SLP_TREE_TYPE (slp_node) = op_vec_info_type;
6848 536819 : DUMP_VECT_SCOPE ("vectorizable_operation");
6849 536819 : vect_model_simple_cost (vinfo, 1, slp_node, cost_vec);
6850 536819 : if (using_emulated_vectors_p)
6851 : {
6852 : /* The above vect_model_simple_cost call handles constants
6853 : in the prologue and (mis-)costs one of the stmts as
6854 : vector stmt. See below for the actual lowering that will
6855 : be applied. */
6856 560 : unsigned n = vect_get_num_copies (vinfo, slp_node);
6857 560 : switch (code)
6858 : {
6859 201 : case PLUS_EXPR:
6860 201 : n *= 5;
6861 201 : break;
6862 328 : case MINUS_EXPR:
6863 328 : n *= 6;
6864 328 : break;
6865 0 : case NEGATE_EXPR:
6866 0 : n *= 4;
6867 0 : break;
6868 : default:
6869 : /* Bit operations do not have extra cost and are accounted
6870 : as vector stmt by vect_model_simple_cost. */
6871 : n = 0;
6872 : break;
6873 : }
6874 529 : if (n != 0)
6875 : {
6876 : /* We also need to materialize two large constants. */
6877 529 : record_stmt_cost (cost_vec, 2, scalar_stmt, stmt_info,
6878 : 0, vect_prologue);
6879 529 : record_stmt_cost (cost_vec, n, scalar_stmt, stmt_info,
6880 : 0, vect_body);
6881 : }
6882 : }
6883 536819 : return true;
6884 : }
6885 :
6886 : /* Transform. */
6887 :
6888 114588 : if (dump_enabled_p ())
6889 16486 : dump_printf_loc (MSG_NOTE, vect_location,
6890 : "transform binary/unary operation.\n");
6891 :
6892 114588 : bool masked_loop_p = loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
6893 103217 : bool len_loop_p = loop_vinfo && LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo);
6894 :
6895 : /* POINTER_DIFF_EXPR has pointer arguments which are vectorized as
6896 : vectors with unsigned elements, but the result is signed. So, we
6897 : need to compute the MINUS_EXPR into vectype temporary and
6898 : VIEW_CONVERT_EXPR it into the final vectype_out result. */
6899 114588 : tree vec_cvt_dest = NULL_TREE;
6900 114588 : if (orig_code == POINTER_DIFF_EXPR)
6901 : {
6902 110 : vec_dest = vect_create_destination_var (scalar_dest, vectype);
6903 110 : vec_cvt_dest = vect_create_destination_var (scalar_dest, vectype_out);
6904 : }
6905 : /* For reduction operations with undefined overflow behavior make sure to
6906 : pun them to unsigned since we change the order of evaluation.
6907 : ??? Avoid for in-order reductions? */
6908 114478 : else if (arith_code_with_undefined_signed_overflow (orig_code)
6909 97852 : && ANY_INTEGRAL_TYPE_P (vectype)
6910 47782 : && TYPE_OVERFLOW_UNDEFINED (vectype)
6911 140087 : && SLP_TREE_REDUC_IDX (slp_node) != -1)
6912 : {
6913 2465 : gcc_assert (orig_code == PLUS_EXPR || orig_code == MINUS_EXPR
6914 : || orig_code == MULT_EXPR || orig_code == POINTER_PLUS_EXPR);
6915 2465 : vec_cvt_dest = vect_create_destination_var (scalar_dest, vectype_out);
6916 2465 : vectype = unsigned_type_for (vectype);
6917 2465 : vec_dest = vect_create_destination_var (scalar_dest, vectype);
6918 : }
6919 : /* Handle def. */
6920 : else
6921 112013 : vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6922 :
6923 114588 : vect_get_vec_defs (vinfo, slp_node,
6924 : op0, &vec_oprnds0, op1, &vec_oprnds1, op2, &vec_oprnds2);
6925 : /* Arguments are ready. Create the new vector stmt. */
6926 253441 : FOR_EACH_VEC_ELT (vec_oprnds0, i, vop0)
6927 : {
6928 138853 : gimple *new_stmt = NULL;
6929 277706 : vop1 = ((op_type == binary_op || op_type == ternary_op)
6930 138853 : ? vec_oprnds1[i] : NULL_TREE);
6931 138853 : vop2 = ((op_type == ternary_op) ? vec_oprnds2[i] : NULL_TREE);
6932 :
6933 138853 : if (vec_cvt_dest
6934 138853 : && !useless_type_conversion_p (vectype, TREE_TYPE (vop0)))
6935 : {
6936 2918 : new_temp = build1 (VIEW_CONVERT_EXPR, vectype, vop0);
6937 2918 : new_stmt = gimple_build_assign (vec_dest, VIEW_CONVERT_EXPR,
6938 : new_temp);
6939 2918 : new_temp = make_ssa_name (vec_dest, new_stmt);
6940 2918 : gimple_assign_set_lhs (new_stmt, new_temp);
6941 2918 : vect_finish_stmt_generation (vinfo, stmt_info,
6942 : new_stmt, gsi);
6943 2918 : vop0 = new_temp;
6944 : }
6945 138853 : if (vop1
6946 136294 : && vec_cvt_dest
6947 141896 : && !useless_type_conversion_p (vectype, TREE_TYPE (vop1)))
6948 : {
6949 2918 : new_temp = build1 (VIEW_CONVERT_EXPR, vectype, vop1);
6950 2918 : new_stmt = gimple_build_assign (vec_dest, VIEW_CONVERT_EXPR,
6951 : new_temp);
6952 2918 : new_temp = make_ssa_name (vec_dest, new_stmt);
6953 2918 : gimple_assign_set_lhs (new_stmt, new_temp);
6954 2918 : vect_finish_stmt_generation (vinfo, stmt_info,
6955 : new_stmt, gsi);
6956 2918 : vop1 = new_temp;
6957 : }
6958 138853 : if (vop2
6959 0 : && vec_cvt_dest
6960 138853 : && !useless_type_conversion_p (vectype, TREE_TYPE (vop2)))
6961 : {
6962 0 : new_temp = build1 (VIEW_CONVERT_EXPR, vectype, vop2);
6963 0 : new_stmt = gimple_build_assign (vec_dest, VIEW_CONVERT_EXPR,
6964 : new_temp);
6965 0 : new_temp = make_ssa_name (vec_dest, new_stmt);
6966 0 : gimple_assign_set_lhs (new_stmt, new_temp);
6967 0 : vect_finish_stmt_generation (vinfo, stmt_info,
6968 : new_stmt, gsi);
6969 0 : vop2 = new_temp;
6970 : }
6971 :
6972 138853 : if (using_emulated_vectors_p)
6973 : {
6974 : /* Lower the operation. This follows vector lowering. */
6975 2 : tree word_type = build_nonstandard_integer_type
6976 2 : (GET_MODE_BITSIZE (vec_mode).to_constant (), 1);
6977 2 : tree wvop0 = make_ssa_name (word_type);
6978 2 : new_stmt = gimple_build_assign (wvop0, VIEW_CONVERT_EXPR,
6979 : build1 (VIEW_CONVERT_EXPR,
6980 : word_type, vop0));
6981 2 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
6982 2 : tree wvop1 = NULL_TREE;
6983 2 : if (vop1)
6984 : {
6985 2 : wvop1 = make_ssa_name (word_type);
6986 2 : new_stmt = gimple_build_assign (wvop1, VIEW_CONVERT_EXPR,
6987 : build1 (VIEW_CONVERT_EXPR,
6988 : word_type, vop1));
6989 2 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
6990 : }
6991 :
6992 2 : tree result_low;
6993 2 : if (code == PLUS_EXPR || code == MINUS_EXPR || code == NEGATE_EXPR)
6994 : {
6995 1 : unsigned int width = vector_element_bits (vectype);
6996 1 : tree inner_type = TREE_TYPE (vectype);
6997 1 : HOST_WIDE_INT max = GET_MODE_MASK (TYPE_MODE (inner_type));
6998 1 : tree low_bits
6999 1 : = build_replicated_int_cst (word_type, width, max >> 1);
7000 1 : tree high_bits
7001 2 : = build_replicated_int_cst (word_type,
7002 1 : width, max & ~(max >> 1));
7003 1 : tree signs;
7004 1 : if (code == PLUS_EXPR || code == MINUS_EXPR)
7005 : {
7006 1 : signs = make_ssa_name (word_type);
7007 1 : new_stmt = gimple_build_assign (signs,
7008 : BIT_XOR_EXPR, wvop0, wvop1);
7009 1 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
7010 1 : tree b_low = make_ssa_name (word_type);
7011 1 : new_stmt = gimple_build_assign (b_low, BIT_AND_EXPR,
7012 : wvop1, low_bits);
7013 1 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
7014 1 : tree a_low = make_ssa_name (word_type);
7015 1 : if (code == PLUS_EXPR)
7016 1 : new_stmt = gimple_build_assign (a_low, BIT_AND_EXPR,
7017 : wvop0, low_bits);
7018 : else
7019 0 : new_stmt = gimple_build_assign (a_low, BIT_IOR_EXPR,
7020 : wvop0, high_bits);
7021 1 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
7022 1 : if (code == MINUS_EXPR)
7023 : {
7024 0 : new_stmt = gimple_build_assign (NULL_TREE,
7025 : BIT_NOT_EXPR, signs);
7026 0 : signs = make_ssa_name (word_type);
7027 0 : gimple_assign_set_lhs (new_stmt, signs);
7028 0 : vect_finish_stmt_generation (vinfo, stmt_info,
7029 : new_stmt, gsi);
7030 : }
7031 1 : new_stmt = gimple_build_assign (NULL_TREE, BIT_AND_EXPR,
7032 : signs, high_bits);
7033 1 : signs = make_ssa_name (word_type);
7034 1 : gimple_assign_set_lhs (new_stmt, signs);
7035 1 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
7036 1 : result_low = make_ssa_name (word_type);
7037 1 : new_stmt = gimple_build_assign (result_low, code,
7038 : a_low, b_low);
7039 1 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
7040 : }
7041 : else /* if (code == NEGATE_EXPR) */
7042 : {
7043 0 : tree a_low = make_ssa_name (word_type);
7044 0 : new_stmt = gimple_build_assign (a_low, BIT_AND_EXPR,
7045 : wvop0, low_bits);
7046 0 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
7047 0 : signs = make_ssa_name (word_type);
7048 0 : new_stmt = gimple_build_assign (signs, BIT_NOT_EXPR, wvop0);
7049 0 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
7050 0 : new_stmt = gimple_build_assign (NULL_TREE, BIT_AND_EXPR,
7051 : signs, high_bits);
7052 0 : signs = make_ssa_name (word_type);
7053 0 : gimple_assign_set_lhs (new_stmt, signs);
7054 0 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
7055 0 : result_low = make_ssa_name (word_type);
7056 0 : new_stmt = gimple_build_assign (result_low,
7057 : MINUS_EXPR, high_bits, a_low);
7058 0 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
7059 : }
7060 1 : new_stmt = gimple_build_assign (NULL_TREE, BIT_XOR_EXPR,
7061 : result_low, signs);
7062 1 : result_low = make_ssa_name (word_type);
7063 1 : gimple_assign_set_lhs (new_stmt, result_low);
7064 1 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
7065 : }
7066 : else
7067 : {
7068 1 : new_stmt = gimple_build_assign (NULL_TREE, code, wvop0, wvop1);
7069 1 : result_low = make_ssa_name (word_type);
7070 1 : gimple_assign_set_lhs (new_stmt, result_low);
7071 1 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
7072 :
7073 : }
7074 2 : new_stmt = gimple_build_assign (NULL_TREE, VIEW_CONVERT_EXPR,
7075 : build1 (VIEW_CONVERT_EXPR,
7076 : vectype, result_low));
7077 2 : new_temp = make_ssa_name (vectype);
7078 2 : gimple_assign_set_lhs (new_stmt, new_temp);
7079 2 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
7080 : }
7081 138851 : else if ((masked_loop_p || len_loop_p) && mask_out_inactive)
7082 : {
7083 16 : tree mask;
7084 16 : if (masked_loop_p)
7085 16 : mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
7086 : vec_num, vectype, i);
7087 : else
7088 : /* Dummy mask. */
7089 0 : mask = build_minus_one_cst (truth_type_for (vectype));
7090 16 : auto_vec<tree> vops (6);
7091 16 : vops.quick_push (mask);
7092 16 : vops.quick_push (vop0);
7093 16 : if (vop1)
7094 16 : vops.quick_push (vop1);
7095 16 : if (vop2)
7096 0 : vops.quick_push (vop2);
7097 16 : if (reduc_idx >= 0)
7098 : {
7099 : /* Perform the operation on active elements only and take
7100 : inactive elements from the reduction chain input. */
7101 8 : gcc_assert (!vop2);
7102 8 : vops.quick_push (reduc_idx == 1 ? vop1 : vop0);
7103 : }
7104 : else
7105 : {
7106 8 : auto else_value = targetm.preferred_else_value
7107 8 : (cond_fn, vectype, vops.length () - 1, &vops[1]);
7108 8 : vops.quick_push (else_value);
7109 : }
7110 16 : if (len_loop_p)
7111 : {
7112 0 : tree len = vect_get_loop_len (loop_vinfo, gsi, lens,
7113 0 : vec_num, vectype, i, 1, true);
7114 0 : signed char biasval
7115 0 : = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
7116 0 : tree bias = build_int_cst (intQI_type_node, biasval);
7117 0 : vops.quick_push (len);
7118 0 : vops.quick_push (bias);
7119 : }
7120 16 : gcall *call
7121 16 : = gimple_build_call_internal_vec (masked_loop_p ? cond_fn
7122 : : cond_len_fn,
7123 : vops);
7124 16 : new_temp = make_ssa_name (vec_dest, call);
7125 16 : gimple_call_set_lhs (call, new_temp);
7126 16 : gimple_call_set_nothrow (call, true);
7127 16 : vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
7128 16 : new_stmt = call;
7129 16 : }
7130 : else
7131 : {
7132 138835 : tree mask = NULL_TREE;
7133 : /* When combining two masks check if either of them is elsewhere
7134 : combined with a loop mask, if that's the case we can mark that the
7135 : new combined mask doesn't need to be combined with a loop mask. */
7136 138835 : if (masked_loop_p
7137 138835 : && code == BIT_AND_EXPR
7138 138835 : && VECTOR_BOOLEAN_TYPE_P (vectype))
7139 : {
7140 8 : if (loop_vinfo->scalar_cond_masked_set.contains ({ op0, vec_num }))
7141 : {
7142 0 : mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
7143 : vec_num, vectype, i);
7144 :
7145 0 : vop0 = prepare_vec_mask (loop_vinfo, TREE_TYPE (mask), mask,
7146 : vop0, gsi);
7147 : }
7148 :
7149 8 : if (loop_vinfo->scalar_cond_masked_set.contains ({ op1, vec_num }))
7150 : {
7151 0 : mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
7152 : vec_num, vectype, i);
7153 :
7154 0 : vop1 = prepare_vec_mask (loop_vinfo, TREE_TYPE (mask), mask,
7155 : vop1, gsi);
7156 : }
7157 : }
7158 :
7159 138835 : new_stmt = gimple_build_assign (vec_dest, code, vop0, vop1, vop2);
7160 138835 : new_temp = make_ssa_name (vec_dest, new_stmt);
7161 138835 : gimple_assign_set_lhs (new_stmt, new_temp);
7162 138835 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
7163 138835 : if (using_emulated_vectors_p)
7164 : suppress_warning (new_stmt, OPT_Wvector_operation_performance);
7165 :
7166 : /* Enter the combined value into the vector cond hash so we don't
7167 : AND it with a loop mask again. */
7168 138835 : if (mask)
7169 0 : loop_vinfo->vec_cond_masked_set.add ({ new_temp, mask });
7170 : }
7171 :
7172 138853 : if (vec_cvt_dest)
7173 : {
7174 3043 : new_temp = build1 (VIEW_CONVERT_EXPR, vectype_out, new_temp);
7175 3043 : new_stmt = gimple_build_assign (vec_cvt_dest, VIEW_CONVERT_EXPR,
7176 : new_temp);
7177 3043 : new_temp = make_ssa_name (vec_cvt_dest, new_stmt);
7178 3043 : gimple_assign_set_lhs (new_stmt, new_temp);
7179 3043 : vect_finish_stmt_generation (vinfo, stmt_info,
7180 : new_stmt, gsi);
7181 : }
7182 :
7183 138853 : slp_node->push_vec_def (new_stmt);
7184 : }
7185 :
7186 114588 : vec_oprnds0.release ();
7187 114588 : vec_oprnds1.release ();
7188 114588 : vec_oprnds2.release ();
7189 :
7190 114588 : return true;
7191 : }
7192 :
7193 : /* A helper function to ensure data reference DR_INFO's base alignment. */
/* If DR_INFO (or, for a grouped access, the data reference of the first
   element of its group) has its base_misaligned flag set, the alignment
   of its base_decl is raised to DR_TARGET_ALIGNMENT multiplied by
   BITS_PER_UNIT and the flag is cleared.  When the flag is not set the
   function only asserts that the misalignment has been initialized.  */
7194 :
7195 : static void
7196 1961757 : ensure_base_align (dr_vec_info *dr_info)
7197 : {
7198 : /* Alignment is only analyzed for the first element of a DR group,
7199 : use that to look at base alignment we need to enforce. */
7200 1961757 : if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt))
7201 1426245 : dr_info = STMT_VINFO_DR_INFO (DR_GROUP_FIRST_ELEMENT (dr_info->stmt));
7202 :
/* From here on DR_INFO is the data reference whose flags we act on; its
   misalignment must no longer be DR_MISALIGNMENT_UNINITIALIZED.  */
7203 1961757 : gcc_assert (dr_info->misalignment != DR_MISALIGNMENT_UNINITIALIZED);
7204 :
7205 1961757 : if (dr_info->base_misaligned)
7206 : {
7207 169233 : tree base_decl = dr_info->base_decl;
7208 :
7209 : // We should only be able to increase the alignment of a base object if
7210 : // we know what its new alignment should be at compile time.
/* Hence the unconditional to_constant () below.  The value is scaled by
   BITS_PER_UNIT because it is compared against and stored into
   DECL_ALIGN.  */
7211 169233 : unsigned HOST_WIDE_INT align_base_to =
7212 169233 : DR_TARGET_ALIGNMENT (dr_info).to_constant () * BITS_PER_UNIT;
7213 :
/* Decls that are in the symbol table are realigned through their symtab
   node.  Any other decl gets DECL_ALIGN raised directly, and only when
   its current alignment is smaller; it is then also flagged with
   DECL_USER_ALIGN.  */
7214 169233 : if (decl_in_symtab_p (base_decl))
7215 4729 : symtab_node::get (base_decl)->increase_alignment (align_base_to);
7216 164504 : else if (DECL_ALIGN (base_decl) < align_base_to)
7217 : {
7218 131511 : SET_DECL_ALIGN (base_decl, align_base_to);
7219 131511 : DECL_USER_ALIGN (base_decl) = 1;
7220 : }
/* Clear the flag so that a later call for the same data reference does
   not redo the work.  */
7221 169233 : dr_info->base_misaligned = false;
7222 : }
7223 1961757 : }
7224 :
7225 :
7226 : /* Function get_group_alias_ptr_type.
7227 :
7228 : Return the alias type for the group starting at FIRST_STMT_INFO. */
/* The group is walked through DR_GROUP_NEXT_ELEMENT.  If the reference of
   any later member has an alias set different from that of the first
   member's reference, ptr_type_node is returned; otherwise the result is
   reference_alias_ptr_type of the first member's reference.  A group
   consisting of a single statement takes the latter path directly.  */
7229 :
7230 : static tree
7231 1631582 : get_group_alias_ptr_type (stmt_vec_info first_stmt_info)
7232 : {
7233 1631582 : struct data_reference *first_dr, *next_dr;
7234 :
7235 1631582 : first_dr = STMT_VINFO_DATA_REF (first_stmt_info);
7236 1631582 : stmt_vec_info next_stmt_info = DR_GROUP_NEXT_ELEMENT (first_stmt_info);
/* Compare every remaining group member against the first one.  */
7237 3915464 : while (next_stmt_info)
7238 : {
7239 2416056 : next_dr = STMT_VINFO_DATA_REF (next_stmt_info);
7240 4832112 : if (get_alias_set (DR_REF (first_dr))
7241 2416056 : != get_alias_set (DR_REF (next_dr)))
7242 : {
/* One mismatch is enough: report it in the dump file and give up on a
   reference-specific type in favor of the generic ptr_type_node.  */
7243 132174 : if (dump_enabled_p ())
7244 30 : dump_printf_loc (MSG_NOTE, vect_location,
7245 : "conflicting alias set types.\n");
7246 132174 : return ptr_type_node;
7247 : }
7248 2283882 : next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
7249 : }
/* All members agree with the first one, so its reference determines the
   alias pointer type for the whole group.  */
7250 1499408 : return reference_alias_ptr_type (DR_REF (first_dr));
7251 : }
7252 :
7253 :
7254 : /* Function scan_operand_equal_p.
7255 :
7256 : Helper function for check_scan_store. Compare two references
7257 : with .GOMP_SIMD_LANE bases. */
/* Returns true when REF1 and REF2 decompose to the same base object, the
   same bit size and equivalent variable offsets, and false otherwise.
   References with reverse storage order, volatile references and
   references with a nonzero constant bit position are never considered
   equal.  Offsets are treated as equivalent if they are the same tree,
   compare equal with operand_equal_p, or compare equal after stripping
   one multiplication and one integral conversion that does not narrow,
   provided the multipliers (steps) are equal as well.  */
7258 :
7259 : static bool
7260 1284 : scan_operand_equal_p (tree ref1, tree ref2)
7261 : {
7262 1284 : tree ref[2] = { ref1, ref2 };
7263 1284 : poly_int64 bitsize[2], bitpos[2];
7264 : tree offset[2], base[2];
/* Step 1: split each reference into base, bit size, constant bit position
   and variable offset.  */
7265 3852 : for (int i = 0; i < 2; ++i)
7266 : {
7267 2568 : machine_mode mode;
7268 2568 : int unsignedp, reversep, volatilep = 0;
7269 2568 : base[i] = get_inner_reference (ref[i], &bitsize[i], &bitpos[i],
7270 : &offset[i], &mode, &unsignedp,
7271 : &reversep, &volatilep);
7272 2568 : if (reversep || volatilep || maybe_ne (bitpos[i], 0))
7273 0 : return false;
/* If the base is a MEM_REF through an SSA pointer and no variable offset
   was found, look at the pointer's definition.  For a pointer defined as
   ADDR_EXPR + SSA_NAME, use the object under the ADDR_EXPR as the base and
   the SSA_NAME addend as the offset.  This is only done when the MEM_REF
   itself has a zero offset; a nonzero one makes the references unequal.  */
7274 2568 : if (TREE_CODE (base[i]) == MEM_REF
7275 42 : && offset[i] == NULL_TREE
7276 2610 : && TREE_CODE (TREE_OPERAND (base[i], 0)) == SSA_NAME)
7277 : {
7278 42 : gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base[i], 0));
7279 42 : if (is_gimple_assign (def_stmt)
7280 42 : && gimple_assign_rhs_code (def_stmt) == POINTER_PLUS_EXPR
7281 42 : && TREE_CODE (gimple_assign_rhs1 (def_stmt)) == ADDR_EXPR
7282 84 : && TREE_CODE (gimple_assign_rhs2 (def_stmt)) == SSA_NAME)
7283 : {
7284 42 : if (maybe_ne (mem_ref_offset (base[i]), 0))
7285 : return false;
7286 42 : base[i] = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
7287 42 : offset[i] = gimple_assign_rhs2 (def_stmt);
7288 : }
7289 : }
7290 : }
7291 :
/* Step 2: the bases and the access sizes have to match.  */
7292 1284 : if (!operand_equal_p (base[0], base[1], 0))
7293 : return false;
7294 934 : if (maybe_ne (bitsize[0], bitsize[1]))
7295 : return false;
/* Step 3: compare the variable offsets.  Pointer-identical offsets (which
   includes both being NULL) need no further checks.  */
7296 934 : if (offset[0] != offset[1])
7297 : {
7298 916 : if (!offset[0] || !offset[1])
7299 : return false;
7300 916 : if (!operand_equal_p (offset[0], offset[1], 0))
7301 : {
/* The offsets are not structurally equal.  Normalize each one to a
   (stripped offset, step) pair and compare those instead.  */
7302 : tree step[2];
7303 0 : for (int i = 0; i < 2; ++i)
7304 : {
/* Peel a multiplication: either an SSA name defined by a MULT_EXPR whose
   second operand is an INTEGER_CST, or a MULT_EXPR tree.  Without one
   the step stays at one.  */
7305 0 : step[i] = integer_one_node;
7306 0 : if (TREE_CODE (offset[i]) == SSA_NAME)
7307 : {
7308 0 : gimple *def_stmt = SSA_NAME_DEF_STMT (offset[i]);
7309 0 : if (is_gimple_assign (def_stmt)
7310 0 : && gimple_assign_rhs_code (def_stmt) == MULT_EXPR
7311 0 : && (TREE_CODE (gimple_assign_rhs2 (def_stmt))
7312 : == INTEGER_CST))
7313 : {
7314 0 : step[i] = gimple_assign_rhs2 (def_stmt);
7315 0 : offset[i] = gimple_assign_rhs1 (def_stmt);
7316 : }
7317 : }
7318 0 : else if (TREE_CODE (offset[i]) == MULT_EXPR)
7319 : {
7320 0 : step[i] = TREE_OPERAND (offset[i], 1);
7321 0 : offset[i] = TREE_OPERAND (offset[i], 0);
7322 : }
/* Peel a conversion from what is left: a cast assignment defining the SSA
   name, or a conversion tree.  The operand replaces the offset only if
   both types are integral and the converted type has at least the
   precision of the operand type.  */
7323 0 : tree rhs1 = NULL_TREE;
7324 0 : if (TREE_CODE (offset[i]) == SSA_NAME)
7325 : {
7326 0 : gimple *def_stmt = SSA_NAME_DEF_STMT (offset[i]);
7327 0 : if (gimple_assign_cast_p (def_stmt))
7328 0 : rhs1 = gimple_assign_rhs1 (def_stmt);
7329 : }
7330 0 : else if (CONVERT_EXPR_P (offset[i]))
7331 0 : rhs1 = TREE_OPERAND (offset[i], 0);
7332 0 : if (rhs1
7333 0 : && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
7334 0 : && INTEGRAL_TYPE_P (TREE_TYPE (offset[i]))
7335 0 : && (TYPE_PRECISION (TREE_TYPE (offset[i]))
7336 0 : >= TYPE_PRECISION (TREE_TYPE (rhs1))))
7337 0 : offset[i] = rhs1;
7338 : }
/* Both the stripped offsets and the steps must agree.  */
7339 0 : if (!operand_equal_p (offset[0], offset[1], 0)
7340 0 : || !operand_equal_p (step[0], step[1], 0))
7341 0 : return false;
7342 : }
7343 : }
7344 : return true;
7345 : }
7346 :
7347 :
/* How a single step of a vectorized scan is carried out.  These are the
   per-step values that scan_store_can_perm_p records in its
   USE_WHOLE_VECTOR argument.  scan_store_kind_perm is the first
   enumerator and therefore has the value zero.  */
7348 : enum scan_store_kind {
7349 : /* Normal permutation. */
7350 : scan_store_kind_perm,
7351 :
7352 : /* Whole vector left shift permutation with zero init. */
/* Selected by scan_store_can_perm_p when the constant permutation is not
   available, vec_shl_optab is, and the initializer is an all-zero
   INTEGER_CST or REAL_CST.  */
7353 : scan_store_kind_lshift_zero,
7354 :
7355 : /* Whole vector left shift permutation and VEC_COND_EXPR. */
/* Selected by scan_store_can_perm_p in the same situation when the
   initializer is not such a zero constant and expand_vec_cond_expr_p
   holds for the vector type.  */
7356 : scan_store_kind_lshift_cond
7357 : };
7358 :
7359 : /* Function scan_store_can_perm_p.
7360 :
7361 : Verify if we can perform the needed permutations or whole vector shifts.
7362 : Return -1 on failure, otherwise exact log2 of vectype's nunits.
7363 : USE_WHOLE_VECTOR is a vector of enum scan_store_kind which operation
7364 : to do at each step. */
/* VECTYPE is the vector type to check; INIT is the scan initializer and
   is only inspected when a whole vector shift has to replace a
   permutation.  USE_WHOLE_VECTOR may be NULL, in which case nothing is
   recorded.  If every step can use a permutation it is left empty;
   otherwise it receives one entry per step, starting with step 0.

   Failure (-1) is returned when the number of units is not a compile
   time constant, when exact_log2 of it is not positive, when the final
   broadcast permutation is not supported, or when an unsupported
   intermediate permutation cannot be replaced by a whole vector shift.  */
7365 :
7366 : static int
7367 1024 : scan_store_can_perm_p (tree vectype, tree init,
7368 : vec<enum scan_store_kind> *use_whole_vector = NULL)
7369 : {
7370 1024 : enum machine_mode vec_mode = TYPE_MODE (vectype);
7371 1024 : unsigned HOST_WIDE_INT nunits;
7372 1024 : if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits))
7373 : return -1;
7374 1024 : int units_log2 = exact_log2 (nunits);
7375 1024 : if (units_log2 <= 0)
7376 : return -1;
7377 :
7378 : int i;
/* The whole vector shift flavor is determined at most once, the first
   time a permutation turns out to be unsupported, and then reused.  */
7379 : enum scan_store_kind whole_vector_shift_kind = scan_store_kind_perm;
/* Steps 0 .. units_log2 - 1 are the two-input permutations, step
   units_log2 is the final single-input broadcast.  */
7380 4784 : for (i = 0; i <= units_log2; ++i)
7381 : {
7382 3760 : unsigned HOST_WIDE_INT j, k;
7383 3760 : enum scan_store_kind kind = scan_store_kind_perm;
7384 3760 : vec_perm_builder sel (nunits, nunits, 1);
7385 3760 : sel.quick_grow (nunits);
7386 3760 : if (i == units_log2)
7387 : {
/* Final step: every lane selects the last element, nunits - 1.  */
7388 9728 : for (j = 0; j < nunits; ++j)
7389 8704 : sel[j] = nunits - 1;
7390 : }
7391 : else
7392 : {
/* Step I: the first 1 << I lanes select themselves, the remaining lanes
   select indices nunits, nunits + 1, ... (with two inputs these
   presumably address the leading elements of the second input).  */
7393 10416 : for (j = 0; j < (HOST_WIDE_INT_1U << i); ++j)
7394 7680 : sel[j] = j;
7395 26416 : for (k = 0; j < nunits; ++j, ++k)
7396 23680 : sel[j] = nunits + k;
7397 : }
7398 6496 : vec_perm_indices indices (sel, i == units_log2 ? 1 : 2, nunits);
7399 3760 : if (!can_vec_perm_const_p (vec_mode, vec_mode, indices))
7400 : {
/* There is no fallback for the final broadcast.  */
7401 0 : if (i == units_log2)
7402 : return -1;
7403 :
7404 0 : if (whole_vector_shift_kind == scan_store_kind_perm)
7405 : {
7406 0 : if (!can_implement_p (vec_shl_optab, vec_mode))
7407 : return -1;
7408 0 : whole_vector_shift_kind = scan_store_kind_lshift_zero;
7409 : /* Whole vector shifts shift in zeros, so if init is all zero
7410 : constant, there is no need to do anything further. */
7411 0 : if ((TREE_CODE (init) != INTEGER_CST
7412 0 : && TREE_CODE (init) != REAL_CST)
7413 0 : || !initializer_zerop (init))
7414 : {
/* Otherwise a vector conditional on VECTYPE must be expandable too.  */
7415 0 : tree masktype = truth_type_for (vectype);
7416 0 : if (!expand_vec_cond_expr_p (vectype, masktype))
7417 : return -1;
7418 : whole_vector_shift_kind = scan_store_kind_lshift_cond;
7419 : }
7420 : }
7421 0 : kind = whole_vector_shift_kind;
7422 : }
7423 3760 : if (use_whole_vector)
7424 : {
/* Recording starts lazily with the first step that is not a plain
   permutation: the vector is grown to I cleared entries for the earlier
   steps (a cleared entry presumably equals scan_store_kind_perm, whose
   value is zero), and from then on every step pushes its kind.  */
7425 1880 : if (kind != scan_store_kind_perm && use_whole_vector->is_empty ())
7426 0 : use_whole_vector->safe_grow_cleared (i, true);
7427 5640 : if (kind != scan_store_kind_perm || !use_whole_vector->is_empty ())
7428 0 : use_whole_vector->safe_push (kind);
7429 : }
7430 3760 : }
7431 :
7432 : return units_log2;
7433 : }
7434 :
7435 :
7436 : /* Function check_scan_store.
7437 :
7438 : Check magic stores for #pragma omp scan {in,ex}clusive reductions. */
7439 :
/* Analysis-time check of a store whose STMT_VINFO_SIMD_LANE_ACCESS_P is
   greater than 1 (asserted below).
   VINFO and STMT_INFO describe the scalar store, VECTYPE is the vector type
   to use for it, RHS_DT is the def type of the stored value, SLP_NODE is the
   SLP node of the store, MASK_NODE is non-null for a masked store and
   MEMORY_ACCESS_TYPE is the access kind chosen for it.
   Returns true when the store and the surrounding IL match one of the shapes
   sketched in the big comment below and the target checks at the end pass.
   Otherwise returns false, after printing a missed-optimization message when
   dumping is enabled.  For lane-access kind 2 the stored value is also
   recorded in loop_vinfo->scan_map, keyed by the base variable.  */
7440 : static bool
7441 1076 : check_scan_store (vec_info *vinfo, stmt_vec_info stmt_info, tree vectype,
7442 : enum vect_def_type rhs_dt, slp_tree slp_node,
7443 : slp_tree mask_node,
7444 : vect_memory_access_type memory_access_type)
7445 : {
7446 1076 : loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7447 1076 : dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
7448 1076 : tree ref_type;
7449 :
7450 1076 : gcc_assert (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) > 1);
/* Generic preconditions: a single-lane, unmasked, contiguous, ungrouped
   store whose base address is the address of a variable, with zero offset
   and zero DR_INIT, in a loop that is neither fully masked nor an epilogue,
   and whose reference alias set conflicts with the alias set of VECTYPE.
   ref_type is assigned as a side effect of evaluating the condition.  */
7451 1076 : if (SLP_TREE_LANES (slp_node) > 1
7452 1076 : || mask_node
7453 1076 : || memory_access_type != VMAT_CONTIGUOUS
7454 1076 : || TREE_CODE (DR_BASE_ADDRESS (dr_info->dr)) != ADDR_EXPR
7455 1076 : || !VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr_info->dr), 0))
7456 1076 : || loop_vinfo == NULL
7457 1076 : || LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
7458 1076 : || LOOP_VINFO_EPILOGUE_P (loop_vinfo)
7459 1076 : || STMT_VINFO_GROUPED_ACCESS (stmt_info)
7460 1076 : || !integer_zerop (get_dr_vinfo_offset (vinfo, dr_info))
7461 1076 : || !integer_zerop (DR_INIT (dr_info->dr))
7462 1076 : || !(ref_type = reference_alias_ptr_type (DR_REF (dr_info->dr)))
7463 2152 : || !alias_sets_conflict_p (get_alias_set (vectype),
7464 1076 : get_alias_set (TREE_TYPE (ref_type))))
7465 : {
7466 0 : if (dump_enabled_p ())
7467 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7468 : "unsupported OpenMP scan store.\n");
7469 0 : return false;
7470 : }
7471 :
7472 : /* We need to pattern match code built by OpenMP lowering and simplified
7473 : by following optimizations into something we can handle.
7474 : #pragma omp simd reduction(inscan,+:r)
7475 : for (...)
7476 : {
7477 : r += something ();
7478 : #pragma omp scan inclusive (r)
7479 : use (r);
7480 : }
7481 : shall have body with:
7482 : // Initialization for input phase, store the reduction initializer:
7483 : _20 = .GOMP_SIMD_LANE (simduid.3_14(D), 0);
7484 : _21 = .GOMP_SIMD_LANE (simduid.3_14(D), 1);
7485 : D.2042[_21] = 0;
7486 : // Actual input phase:
7487 : ...
7488 : r.0_5 = D.2042[_20];
7489 : _6 = _4 + r.0_5;
7490 : D.2042[_20] = _6;
7491 : // Initialization for scan phase:
7492 : _25 = .GOMP_SIMD_LANE (simduid.3_14(D), 2);
7493 : _26 = D.2043[_25];
7494 : _27 = D.2042[_25];
7495 : _28 = _26 + _27;
7496 : D.2043[_25] = _28;
7497 : D.2042[_25] = _28;
7498 : // Actual scan phase:
7499 : ...
7500 : r.1_8 = D.2042[_20];
7501 : ...
7502 : The "omp simd array" variable D.2042 holds the privatized copy used
7503 : inside of the loop and D.2043 is another one that holds copies of
7504 : the current original list item. The separate GOMP_SIMD_LANE ifn
7505 : kinds are there in order to allow optimizing the initializer store
7506 : and combiner sequence, e.g. if it is originally some C++ish user
7507 : defined reduction, but allow the vectorizer to pattern recognize it
7508 : and turn into the appropriate vectorized scan.
7509 :
7510 : For exclusive scan, this is slightly different:
7511 : #pragma omp simd reduction(inscan,+:r)
7512 : for (...)
7513 : {
7514 : use (r);
7515 : #pragma omp scan exclusive (r)
7516 : r += something ();
7517 : }
7518 : shall have body with:
7519 : // Initialization for input phase, store the reduction initializer:
7520 : _20 = .GOMP_SIMD_LANE (simduid.3_14(D), 0);
7521 : _21 = .GOMP_SIMD_LANE (simduid.3_14(D), 1);
7522 : D.2042[_21] = 0;
7523 : // Actual input phase:
7524 : ...
7525 : r.0_5 = D.2042[_20];
7526 : _6 = _4 + r.0_5;
7527 : D.2042[_20] = _6;
7528 : // Initialization for scan phase:
7529 : _25 = .GOMP_SIMD_LANE (simduid.3_14(D), 3);
7530 : _26 = D.2043[_25];
7531 : D.2044[_25] = _26;
7532 : _27 = D.2042[_25];
7533 : _28 = _26 + _27;
7534 : D.2043[_25] = _28;
7535 : // Actual scan phase:
7536 : ...
7537 : r.1_8 = D.2044[_20];
7538 : ... */
7539 :
/* Lane-access kind 2 is the initializer store.  Kinds 3 and 4 are the
   combiner stores handled after this block; any other value fails.  */
7540 1076 : if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 2)
7541 : {
7542 : /* Match the D.2042[_21] = 0; store above. Just require that
7543 : it is a constant or external definition store. */
7544 564 : if (rhs_dt != vect_constant_def && rhs_dt != vect_external_def)
7545 : {
7546 0 : fail_init:
7547 0 : if (dump_enabled_p ())
7548 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7549 : "unsupported OpenMP scan initializer store.\n");
7550 0 : return false;
7551 : }
7552 :
/* Record the stored value for this variable, creating the map on first
   use.  If the map slot is already non-null, i.e. presumably a second
   initializer store to the same variable, fail via fail_init.  */
7553 564 : if (! loop_vinfo->scan_map)
7554 322 : loop_vinfo->scan_map = new hash_map<tree, tree>;
7555 564 : tree var = TREE_OPERAND (DR_BASE_ADDRESS (dr_info->dr), 0);
7556 564 : tree &cached = loop_vinfo->scan_map->get_or_insert (var);
7557 564 : if (cached)
7558 0 : goto fail_init;
7559 564 : cached = gimple_assign_rhs1 (STMT_VINFO_STMT (stmt_info));
7560 :
7561 : /* These stores can be vectorized normally. */
7562 564 : return true;
7563 : }
7564 :
/* From here on this is a combiner store: the stored value must be an
   internal def and an SSA name.  The fail label below is the common exit
   for every later mismatch.  */
7565 512 : if (rhs_dt != vect_internal_def)
7566 : {
7567 0 : fail:
7568 0 : if (dump_enabled_p ())
7569 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7570 : "unsupported OpenMP scan combiner pattern.\n");
7571 0 : return false;
7572 : }
7573 :
7574 512 : gimple *stmt = STMT_VINFO_STMT (stmt_info);
7575 512 : tree rhs = gimple_assign_rhs1 (stmt);
7576 512 : if (TREE_CODE (rhs) != SSA_NAME)
7577 0 : goto fail;
7578 :
/* other_store_stmt will be the second store of the combiner sequence.
   inscan_var_store tells whether this store writes the variable carrying
   the "omp simd inscan" attribute.  */
7579 512 : gimple *other_store_stmt = NULL;
7580 512 : tree var = TREE_OPERAND (DR_BASE_ADDRESS (dr_info->dr), 0);
7581 512 : bool inscan_var_store
7582 512 : = lookup_attribute ("omp simd inscan", DECL_ATTRIBUTES (var)) != NULL;
7583 :
/* Kind 4, store to the non-inscan variable: the stored value must have
   exactly one other non-debug use, a binary assignment in the same block
   that defines an SSA name.  RHS is advanced to that result, whose single
   immediate use becomes other_store_stmt.  When the store is to the inscan
   variable nothing is done here and other_store_stmt is found further
   below.  */
7584 512 : if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4)
7585 : {
7586 252 : if (!inscan_var_store)
7587 : {
7588 126 : use_operand_p use_p;
7589 126 : imm_use_iterator iter;
7590 378 : FOR_EACH_IMM_USE_FAST (use_p, iter, rhs)
7591 : {
7592 252 : gimple *use_stmt = USE_STMT (use_p);
7593 252 : if (use_stmt == stmt || is_gimple_debug (use_stmt))
7594 126 : continue;
7595 126 : if (gimple_bb (use_stmt) != gimple_bb (stmt)
7596 126 : || !is_gimple_assign (use_stmt)
7597 126 : || gimple_assign_rhs_class (use_stmt) != GIMPLE_BINARY_RHS
7598 126 : || other_store_stmt
7599 252 : || TREE_CODE (gimple_assign_lhs (use_stmt)) != SSA_NAME)
7600 0 : goto fail;
7601 126 : other_store_stmt = use_stmt;
7602 0 : }
7603 126 : if (other_store_stmt == NULL)
7604 0 : goto fail;
7605 126 : rhs = gimple_assign_lhs (other_store_stmt);
7606 126 : if (!single_imm_use (rhs, &use_p, &other_store_stmt))
7607 0 : goto fail;
7608 : }
7609 : }
/* Kind 3: the stored value may have at most one other non-debug use; that
   use is taken as other_store_stmt and validated as a store later.  */
7610 260 : else if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 3)
7611 : {
7612 260 : use_operand_p use_p;
7613 260 : imm_use_iterator iter;
7614 1040 : FOR_EACH_IMM_USE_FAST (use_p, iter, rhs)
7615 : {
7616 520 : gimple *use_stmt = USE_STMT (use_p);
7617 520 : if (use_stmt == stmt || is_gimple_debug (use_stmt))
7618 260 : continue;
7619 260 : if (other_store_stmt)
7620 0 : goto fail;
7621 260 : other_store_stmt = use_stmt;
7622 260 : }
7623 : }
7624 : else
7625 0 : goto fail;
7626 :
/* RHS must be defined by a binary assignment in the same block; this is
   the combiner statement.  */
7627 512 : gimple *def_stmt = SSA_NAME_DEF_STMT (rhs);
7628 512 : if (gimple_bb (def_stmt) != gimple_bb (stmt)
7629 512 : || !is_gimple_assign (def_stmt)
7630 1024 : || gimple_assign_rhs_class (def_stmt) != GIMPLE_BINARY_RHS)
7631 0 : goto fail;
7632 :
7633 512 : enum tree_code code = gimple_assign_rhs_code (def_stmt);
7634 : /* For pointer addition, we should use the normal plus for the vector
7635 : operation. */
7636 512 : switch (code)
7637 : {
7638 0 : case POINTER_PLUS_EXPR:
7639 0 : code = PLUS_EXPR;
7640 0 : break;
7641 0 : case MULT_HIGHPART_EXPR:
7642 0 : goto fail;
7643 : default:
7644 : break;
7645 : }
/* Only binary, commutative operation codes are accepted.  */
7646 512 : if (TREE_CODE_LENGTH (code) != binary_op || !commutative_tree_code (code))
7647 0 : goto fail;
7648 :
7649 512 : tree rhs1 = gimple_assign_rhs1 (def_stmt);
7650 512 : tree rhs2 = gimple_assign_rhs2 (def_stmt);
7651 512 : if (TREE_CODE (rhs1) != SSA_NAME || TREE_CODE (rhs2) != SSA_NAME)
7652 0 : goto fail;
7653 :
/* Both combiner operands must come from loads in the same block that the
   vectorizer knows about and that have the same lane-access kind as this
   store.  */
7654 512 : gimple *load1_stmt = SSA_NAME_DEF_STMT (rhs1);
7655 512 : gimple *load2_stmt = SSA_NAME_DEF_STMT (rhs2);
7656 512 : if (gimple_bb (load1_stmt) != gimple_bb (stmt)
7657 512 : || !gimple_assign_load_p (load1_stmt)
7658 512 : || gimple_bb (load2_stmt) != gimple_bb (stmt)
7659 1024 : || !gimple_assign_load_p (load2_stmt))
7660 0 : goto fail;
7661 :
7662 512 : stmt_vec_info load1_stmt_info = loop_vinfo->lookup_stmt (load1_stmt);
7663 512 : stmt_vec_info load2_stmt_info = loop_vinfo->lookup_stmt (load2_stmt);
7664 512 : if (load1_stmt_info == NULL
7665 512 : || load2_stmt_info == NULL
7666 512 : || (STMT_VINFO_SIMD_LANE_ACCESS_P (load1_stmt_info)
7667 512 : != STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info))
7668 512 : || (STMT_VINFO_SIMD_LANE_ACCESS_P (load2_stmt_info)
7669 512 : != STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)))
7670 0 : goto fail;
7671 :
/* Kind 4, store to the inscan variable: other_store_stmt is still unset.
   Pick the load result to follow (rhs1 if the first load reads an inscan
   variable, rhs2 otherwise) and require that it has at most one non-debug
   use besides the combiner; that use becomes other_store_stmt.  */
7672 512 : if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4 && inscan_var_store)
7673 : {
7674 126 : dr_vec_info *load1_dr_info = STMT_VINFO_DR_INFO (load1_stmt_info);
7675 126 : if (TREE_CODE (DR_BASE_ADDRESS (load1_dr_info->dr)) != ADDR_EXPR
7676 126 : || !VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (load1_dr_info->dr), 0)))
7677 0 : goto fail;
7678 126 : tree var1 = TREE_OPERAND (DR_BASE_ADDRESS (load1_dr_info->dr), 0);
7679 126 : tree lrhs;
7680 126 : if (lookup_attribute ("omp simd inscan", DECL_ATTRIBUTES (var1)))
7681 : lrhs = rhs1;
7682 : else
7683 16 : lrhs = rhs2;
7684 126 : use_operand_p use_p;
7685 126 : imm_use_iterator iter;
7686 504 : FOR_EACH_IMM_USE_FAST (use_p, iter, lrhs)
7687 : {
7688 252 : gimple *use_stmt = USE_STMT (use_p);
7689 252 : if (use_stmt == def_stmt || is_gimple_debug (use_stmt))
7690 126 : continue;
7691 126 : if (other_store_stmt)
7692 0 : goto fail;
7693 126 : other_store_stmt = use_stmt;
7694 126 : }
7695 : }
7696 :
/* Whichever path found it, the companion statement must exist, be a store
   in the same block, be known to the vectorizer and have the same
   lane-access kind.  */
7697 512 : if (other_store_stmt == NULL)
7698 0 : goto fail;
7699 512 : if (gimple_bb (other_store_stmt) != gimple_bb (stmt)
7700 512 : || !gimple_store_p (other_store_stmt))
7701 0 : goto fail;
7702 :
7703 512 : stmt_vec_info other_store_stmt_info
7704 512 : = loop_vinfo->lookup_stmt (other_store_stmt);
7705 512 : if (other_store_stmt_info == NULL
7706 512 : || (STMT_VINFO_SIMD_LANE_ACCESS_P (other_store_stmt_info)
7707 512 : != STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)))
7708 0 : goto fail;
7709 :
/* Pair up stores and loads.  For kind 4 with a store to the non-inscan
   variable the two stores are swapped first.  Then the loads are swapped
   if needed so that load1 reads the location stmt1 writes; if neither
   load does, fail.  */
7710 512 : gimple *stmt1 = stmt;
7711 512 : gimple *stmt2 = other_store_stmt;
7712 512 : if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4 && !inscan_var_store)
7713 : std::swap (stmt1, stmt2);
7714 512 : if (scan_operand_equal_p (gimple_assign_lhs (stmt1),
7715 : gimple_assign_rhs1 (load2_stmt)))
7716 : {
7717 162 : std::swap (rhs1, rhs2);
7718 162 : std::swap (load1_stmt, load2_stmt);
7719 162 : std::swap (load1_stmt_info, load2_stmt_info);
7720 : }
7721 512 : if (!scan_operand_equal_p (gimple_assign_lhs (stmt1),
7722 : gimple_assign_rhs1 (load1_stmt)))
7723 0 : goto fail;
7724 :
/* Kind 3: load2 must read the location stmt2 writes.  Kind 4: load2 must
   instead read an "omp simd array" variable that has neither of the inscan
   attributes; that variable is remembered in var3.  */
7725 512 : tree var3 = NULL_TREE;
7726 512 : if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 3
7727 512 : && !scan_operand_equal_p (gimple_assign_lhs (stmt2),
7728 : gimple_assign_rhs1 (load2_stmt)))
7729 0 : goto fail;
7730 512 : else if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4)
7731 : {
7732 252 : dr_vec_info *load2_dr_info = STMT_VINFO_DR_INFO (load2_stmt_info);
7733 252 : if (TREE_CODE (DR_BASE_ADDRESS (load2_dr_info->dr)) != ADDR_EXPR
7734 252 : || !VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (load2_dr_info->dr), 0)))
7735 0 : goto fail;
7736 252 : var3 = TREE_OPERAND (DR_BASE_ADDRESS (load2_dr_info->dr), 0);
7737 252 : if (!lookup_attribute ("omp simd array", DECL_ATTRIBUTES (var3))
7738 252 : || lookup_attribute ("omp simd inscan", DECL_ATTRIBUTES (var3))
7739 504 : || lookup_attribute ("omp simd inscan exclusive",
7740 252 : DECL_ATTRIBUTES (var3)))
7741 0 : goto fail;
7742 : }
7743 :
/* The other store must also have a variable as its base.  Both stored-to
   variables must be "omp simd array" and exactly one of them must carry
   "omp simd inscan".  */
7744 512 : dr_vec_info *other_dr_info = STMT_VINFO_DR_INFO (other_store_stmt_info);
7745 512 : if (TREE_CODE (DR_BASE_ADDRESS (other_dr_info->dr)) != ADDR_EXPR
7746 512 : || !VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (other_dr_info->dr), 0)))
7747 0 : goto fail;
7748 :
7749 512 : tree var1 = TREE_OPERAND (DR_BASE_ADDRESS (dr_info->dr), 0);
7750 512 : tree var2 = TREE_OPERAND (DR_BASE_ADDRESS (other_dr_info->dr), 0);
7751 512 : if (!lookup_attribute ("omp simd array", DECL_ATTRIBUTES (var1))
7752 512 : || !lookup_attribute ("omp simd array", DECL_ATTRIBUTES (var2))
7753 1024 : || (!lookup_attribute ("omp simd inscan", DECL_ATTRIBUTES (var1)))
7754 512 : == (!lookup_attribute ("omp simd inscan", DECL_ATTRIBUTES (var2))))
7755 0 : goto fail;
7756 :
/* After this swap var1 is the variable without "omp simd inscan".  */
7757 512 : if (lookup_attribute ("omp simd inscan", DECL_ATTRIBUTES (var1)))
7758 256 : std::swap (var1, var2);
7759 :
/* Kind 4: that variable must be marked "omp simd inscan exclusive"; the
   initializer lookup below then uses var3 (the variable read by load2)
   instead.  */
7760 512 : if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4)
7761 : {
7762 252 : if (!lookup_attribute ("omp simd inscan exclusive",
7763 252 : DECL_ATTRIBUTES (var1)))
7764 0 : goto fail;
7765 252 : var1 = var3;
7766 : }
7767 :
/* An initializer must have been recorded for var1 by an earlier kind 2
   store (see the scan_map insertion above).  */
7768 512 : if (loop_vinfo->scan_map == NULL)
7769 0 : goto fail;
7770 512 : tree *init = loop_vinfo->scan_map->get (var1);
7771 512 : if (init == NULL)
7772 0 : goto fail;
7773 :
7774 : /* The IL is as expected, now check if we can actually vectorize it.
7775 : Inclusive scan:
7776 : _26 = D.2043[_25];
7777 : _27 = D.2042[_25];
7778 : _28 = _26 + _27;
7779 : D.2043[_25] = _28;
7780 : D.2042[_25] = _28;
7781 : should be vectorized as (where _40 is the vectorized rhs
7782 : from the D.2042[_21] = 0; store):
7783 : _30 = MEM <vector(8) int> [(int *)&D.2043];
7784 : _31 = MEM <vector(8) int> [(int *)&D.2042];
7785 : _32 = VEC_PERM_EXPR <_40, _31, { 0, 8, 9, 10, 11, 12, 13, 14 }>;
7786 : _33 = _31 + _32;
7787 : // _33 = { _31[0], _31[0]+_31[1], _31[1]+_31[2], ..., _31[6]+_31[7] };
7788 : _34 = VEC_PERM_EXPR <_40, _33, { 0, 1, 8, 9, 10, 11, 12, 13 }>;
7789 : _35 = _33 + _34;
7790 : // _35 = { _31[0], _31[0]+_31[1], _31[0]+.._31[2], _31[0]+.._31[3],
7791 : // _31[1]+.._31[4], ... _31[4]+.._31[7] };
7792 : _36 = VEC_PERM_EXPR <_40, _35, { 0, 1, 2, 3, 8, 9, 10, 11 }>;
7793 : _37 = _35 + _36;
7794 : // _37 = { _31[0], _31[0]+_31[1], _31[0]+.._31[2], _31[0]+.._31[3],
7795 : // _31[0]+.._31[4], ... _31[0]+.._31[7] };
7796 : _38 = _30 + _37;
7797 : _39 = VEC_PERM_EXPR <_38, _38, { 7, 7, 7, 7, 7, 7, 7, 7 }>;
7798 : MEM <vector(8) int> [(int *)&D.2043] = _39;
7799 : MEM <vector(8) int> [(int *)&D.2042] = _38;
7800 : Exclusive scan:
7801 : _26 = D.2043[_25];
7802 : D.2044[_25] = _26;
7803 : _27 = D.2042[_25];
7804 : _28 = _26 + _27;
7805 : D.2043[_25] = _28;
7806 : should be vectorized as (where _40 is the vectorized rhs
7807 : from the D.2042[_21] = 0; store):
7808 : _30 = MEM <vector(8) int> [(int *)&D.2043];
7809 : _31 = MEM <vector(8) int> [(int *)&D.2042];
7810 : _32 = VEC_PERM_EXPR <_40, _31, { 0, 8, 9, 10, 11, 12, 13, 14 }>;
7811 : _33 = VEC_PERM_EXPR <_40, _32, { 0, 8, 9, 10, 11, 12, 13, 14 }>;
7812 : _34 = _32 + _33;
7813 : // _34 = { 0, _31[0], _31[0]+_31[1], _31[1]+_31[2], _31[2]+_31[3],
7814 : // _31[3]+_31[4], ... _31[5]+.._31[6] };
7815 : _35 = VEC_PERM_EXPR <_40, _34, { 0, 1, 8, 9, 10, 11, 12, 13 }>;
7816 : _36 = _34 + _35;
7817 : // _36 = { 0, _31[0], _31[0]+_31[1], _31[0]+.._31[2], _31[0]+.._31[3],
7818 : // _31[1]+.._31[4], ... _31[3]+.._31[6] };
7819 : _37 = VEC_PERM_EXPR <_40, _36, { 0, 1, 2, 3, 8, 9, 10, 11 }>;
7820 : _38 = _36 + _37;
7821 : // _38 = { 0, _31[0], _31[0]+_31[1], _31[0]+.._31[2], _31[0]+.._31[3],
7822 : // _31[0]+.._31[4], ... _31[0]+.._31[6] };
7823 : _39 = _30 + _38;
7824 : _50 = _31 + _39;
7825 : _51 = VEC_PERM_EXPR <_50, _50, { 7, 7, 7, 7, 7, 7, 7, 7 }>;
7826 : MEM <vector(8) int> [(int *)&D.2044] = _39;
7827 : MEM <vector(8) int> [(int *)&D.2042] = _51; */
/* NOTE(review): the last line of the exclusive-scan sketch above names
   D.2042 as the destination of _51, but the scalar IL it is derived from
   stores to D.2044 and D.2043 only; this looks like a typo for D.2043 -
   confirm.  */
/* Target checks: the vector operation for CODE must be implementable in
   the mode of VECTYPE, and scan_store_can_perm_p must not return -1, which
   is treated as failure here.  */
7828 512 : enum machine_mode vec_mode = TYPE_MODE (vectype);
7829 512 : optab optab = optab_for_tree_code (code, vectype, optab_default);
7830 512 : if (!optab || !can_implement_p (optab, vec_mode))
7831 0 : goto fail;
7832 :
7833 512 : int units_log2 = scan_store_can_perm_p (vectype, *init);
7834 512 : if (units_log2 == -1)
7835 0 : goto fail;
7836 :
7837 : return true;
7838 : }
7839 :
7840 :
7841 : /* Function vectorizable_scan_store.
7842 :
7843 : Helper of vectorizable_store, arguments like on vectorizable_store.
7844 : Handle only the transformation, checking is done in check_scan_store. */
7845 :
/* Transform-phase code generation for a scan combiner store.
   VINFO and STMT_INFO describe the scalar store, SLP_NODE is its SLP node
   (the vector type is taken from it) and GSI is the insertion point passed
   on to vect_finish_stmt_generation.
   No validation is repeated here beyond assertions; the shape of the IL is
   expected to have been accepted already during analysis.
   Always returns true.  */
7846 : static bool
7847 512 : vectorizable_scan_store (vec_info *vinfo, stmt_vec_info stmt_info,
7848 : slp_tree slp_node, gimple_stmt_iterator *gsi)
7849 : {
7850 512 : loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7851 512 : dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
7852 512 : tree ref_type = reference_alias_ptr_type (DR_REF (dr_info->dr));
7853 512 : tree vectype = SLP_TREE_VECTYPE (slp_node);
7854 :
7855 512 : if (dump_enabled_p ())
7856 492 : dump_printf_loc (MSG_NOTE, vect_location,
7857 : "transform scan store.\n");
7858 :
7859 512 : gimple *stmt = STMT_VINFO_STMT (stmt_info);
7860 512 : tree rhs = gimple_assign_rhs1 (stmt);
7861 512 : gcc_assert (TREE_CODE (rhs) == SSA_NAME);
7862 :
7863 512 : tree var = TREE_OPERAND (DR_BASE_ADDRESS (dr_info->dr), 0);
7864 512 : bool inscan_var_store
7865 512 : = lookup_attribute ("omp simd inscan", DECL_ATTRIBUTES (var)) != NULL;
7866 :
/* Kind 4 with a store to the non-inscan variable: the stored value is not
   the combiner result itself.  Step to the lhs of its first non-debug use
   other than this store, so that def_stmt below is the combiner.  */
7867 512 : if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4 && !inscan_var_store)
7868 : {
7869 126 : use_operand_p use_p;
7870 126 : imm_use_iterator iter;
7871 252 : FOR_EACH_IMM_USE_FAST (use_p, iter, rhs)
7872 : {
7873 126 : gimple *use_stmt = USE_STMT (use_p);
7874 126 : if (use_stmt == stmt || is_gimple_debug (use_stmt))
7875 0 : continue;
7876 126 : rhs = gimple_assign_lhs (use_stmt);
7877 126 : break;
7878 126 : }
7879 : }
7880 :
/* Recover the combiner operation and the data references of its two load
   operands.  POINTER_PLUS_EXPR is mapped to PLUS_EXPR for the vector
   operation.  */
7881 512 : gimple *def_stmt = SSA_NAME_DEF_STMT (rhs);
7882 512 : enum tree_code code = gimple_assign_rhs_code (def_stmt);
7883 512 : if (code == POINTER_PLUS_EXPR)
7884 0 : code = PLUS_EXPR;
7885 512 : gcc_assert (TREE_CODE_LENGTH (code) == binary_op
7886 : && commutative_tree_code (code));
7887 512 : tree rhs1 = gimple_assign_rhs1 (def_stmt);
7888 512 : tree rhs2 = gimple_assign_rhs2 (def_stmt);
7889 512 : gcc_assert (TREE_CODE (rhs1) == SSA_NAME && TREE_CODE (rhs2) == SSA_NAME);
7890 512 : gimple *load1_stmt = SSA_NAME_DEF_STMT (rhs1);
7891 512 : gimple *load2_stmt = SSA_NAME_DEF_STMT (rhs2);
7892 512 : stmt_vec_info load1_stmt_info = loop_vinfo->lookup_stmt (load1_stmt);
7893 512 : stmt_vec_info load2_stmt_info = loop_vinfo->lookup_stmt (load2_stmt);
7894 512 : dr_vec_info *load1_dr_info = STMT_VINFO_DR_INFO (load1_stmt_info);
7895 512 : dr_vec_info *load2_dr_info = STMT_VINFO_DR_INFO (load2_stmt_info);
7896 512 : tree var1 = TREE_OPERAND (DR_BASE_ADDRESS (load1_dr_info->dr), 0);
7897 512 : tree var2 = TREE_OPERAND (DR_BASE_ADDRESS (load2_dr_info->dr), 0);
7898 :
/* If the first load reads an "omp simd inscan" variable, swap so that
   rhs1, var1 and load1_dr_info describe the other operand.  */
7899 512 : if (lookup_attribute ("omp simd inscan", DECL_ATTRIBUTES (var1)))
7900 : {
7901 436 : std::swap (rhs1, rhs2);
7902 436 : std::swap (var1, var2);
7903 436 : std::swap (load1_dr_info, load2_dr_info);
7904 : }
7905 :
/* Initializer value recorded for var1 in loop_vinfo->scan_map; it must be
   present.  */
7906 512 : tree *init = loop_vinfo->scan_map->get (var1);
7907 512 : gcc_assert (init);
7908 :
/* Build units_log2 + 1 permutation masks.  For i < units_log2, perms[i]
   takes the first 2^i elements from the first VEC_PERM_EXPR input and
   fills the remaining lanes with the leading elements of the second input.
   perms[units_log2] selects the last element into every lane.  A constant
   number of vector elements is required here.  */
7909 512 : unsigned HOST_WIDE_INT nunits;
7910 512 : if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits))
7911 : gcc_unreachable ();
7912 512 : auto_vec<enum scan_store_kind, 16> use_whole_vector;
7913 512 : int units_log2 = scan_store_can_perm_p (vectype, *init, &use_whole_vector);
7914 512 : gcc_assert (units_log2 > 0);
7915 512 : auto_vec<tree, 16> perms;
7916 512 : perms.quick_grow (units_log2 + 1);
7917 512 : tree zero_vec = NULL_TREE, masktype = NULL_TREE;
7918 2392 : for (int i = 0; i <= units_log2; ++i)
7919 : {
7920 1880 : unsigned HOST_WIDE_INT j, k;
7921 1880 : vec_perm_builder sel (nunits, nunits, 1);
7922 1880 : sel.quick_grow (nunits);
7923 1880 : if (i == units_log2)
7924 4864 : for (j = 0; j < nunits; ++j)
7925 4352 : sel[j] = nunits - 1;
7926 : else
7927 : {
7928 5208 : for (j = 0; j < (HOST_WIDE_INT_1U << i); ++j)
7929 3840 : sel[j] = j;
7930 13208 : for (k = 0; j < nunits; ++j, ++k)
7931 11840 : sel[j] = nunits + k;
7932 : }
7933 3248 : vec_perm_indices indices (sel, i == units_log2 ? 1 : 2, nunits);
/* When use_whole_vector was filled in and step i is not a plain
   permutation, create zero_vec (and masktype for the lshift_cond kind) on
   demand and build the mask with the unchecked generator.  */
7934 1880 : if (!use_whole_vector.is_empty ()
7935 0 : && use_whole_vector[i] != scan_store_kind_perm)
7936 : {
7937 0 : if (zero_vec == NULL_TREE)
7938 0 : zero_vec = build_zero_cst (vectype);
7939 0 : if (masktype == NULL_TREE
7940 0 : && use_whole_vector[i] == scan_store_kind_lshift_cond)
7941 0 : masktype = truth_type_for (vectype);
7942 0 : perms[i] = vect_gen_perm_mask_any (vectype, indices);
7943 : }
7944 : else
7945 1880 : perms[i] = vect_gen_perm_mask_checked (vectype, indices);
7946 1880 : }
7947 :
/* NOTE(review): loop_vinfo has already been dereferenced above, so the
   null test in this condition is redundant.  */
7948 512 : vec_loop_lens *loop_lens
7949 512 : = (loop_vinfo && LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)
7950 : ? &LOOP_VINFO_LENS (loop_vinfo)
7951 0 : : NULL);
7952 :
/* Vector stores are emitted as MEM_REFs off the base address of the data
   reference with a constant offset that is advanced by BUMP for each
   further vector copy.  */
7953 512 : tree vec_oprnd1 = NULL_TREE;
7954 512 : tree vec_oprnd2 = NULL_TREE;
7955 512 : tree vec_oprnd3 = NULL_TREE;
7956 512 : tree dataref_ptr = DR_BASE_ADDRESS (dr_info->dr);
7957 512 : tree dataref_offset = build_int_cst (ref_type, 0);
7958 512 : tree bump = vect_get_data_ptr_increment (vinfo, gsi, dr_info,
7959 : vectype, VMAT_CONTIGUOUS,
7960 : loop_lens);
/* For kind 4 with a store to the non-inscan variable, the second operand
   is loaded explicitly from the base of load1 (ldataref_ptr) inside the
   main loop instead of being taken from vectorized defs.  */
7961 512 : tree ldataref_ptr = NULL_TREE;
7962 512 : tree orig = NULL_TREE;
7963 512 : if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4 && !inscan_var_store)
7964 126 : ldataref_ptr = DR_BASE_ADDRESS (load1_dr_info->dr);
7965 : /* The initialization is invariant. */
7966 512 : vec_oprnd1 = vect_init_vector (vinfo, stmt_info, *init, vectype, NULL);
7967 512 : auto_vec<tree> vec_oprnds2;
7968 512 : auto_vec<tree> vec_oprnds3;
7969 512 : if (ldataref_ptr == NULL)
7970 : {
7971 : /* We want to lookup the vector operands of the reduction, not those
7972 : of the store - for SLP we have to use the proper SLP node for the
7973 : lookup, which should be the single child of the scan store. */
7974 386 : vect_get_vec_defs (vinfo, SLP_TREE_CHILDREN (slp_node)[0],
7975 : rhs1, &vec_oprnds2, rhs2, &vec_oprnds3);
7976 : /* ??? For SLP we do not key the def on 'rhs1' or 'rhs2' but get
7977 : them in SLP child order. So we have to swap here with logic
7978 : similar to above. */
/* NOTE(review): dr_info and var declared below shadow the function-level
   variables of the same names within this block.  */
7979 386 : stmt_vec_info load
7980 386 : = SLP_TREE_SCALAR_STMTS (SLP_TREE_CHILDREN
7981 386 : (SLP_TREE_CHILDREN (slp_node)[0])[0])[0];
7982 386 : dr_vec_info *dr_info = STMT_VINFO_DR_INFO (load);
7983 386 : tree var = TREE_OPERAND (DR_BASE_ADDRESS (dr_info->dr), 0);
/* NOTE(review): the swap statement below ends in a doubled semicolon; the
   extra one is an empty statement and has no effect.  */
7984 386 : if (lookup_attribute ("omp simd inscan", DECL_ATTRIBUTES (var)))
7985 820 : for (unsigned i = 0; i < vec_oprnds2.length (); ++i)
7986 494 : std::swap (vec_oprnds2[i], vec_oprnds3[i]);;
7987 : }
7988 : else
7989 126 : vect_get_vec_defs (vinfo, slp_node,
7990 : rhs2, &vec_oprnds3);
/* Main loop, one iteration per vector copy.  orig starts out as the first
   vec_oprnd3 and is replaced at the end of each iteration by the broadcast
   of the last element of that iteration's result.  */
7991 1248 : for (unsigned j = 0; j < vec_oprnds3.length (); j++)
7992 : {
7993 736 : if (ldataref_ptr == NULL)
7994 554 : vec_oprnd2 = vec_oprnds2[j];
7995 736 : vec_oprnd3 = vec_oprnds3[j];
7996 736 : if (j == 0)
7997 : orig = vec_oprnd3;
7998 224 : else if (!inscan_var_store)
7999 112 : dataref_offset = int_const_binop (PLUS_EXPR, dataref_offset, bump);
8000 :
8001 736 : if (ldataref_ptr)
8002 : {
8003 182 : vec_oprnd2 = make_ssa_name (vectype);
8004 182 : tree data_ref = fold_build2 (MEM_REF, vectype,
8005 : unshare_expr (ldataref_ptr),
8006 : dataref_offset);
8007 182 : vect_copy_ref_info (data_ref, DR_REF (load1_dr_info->dr));
8008 182 : gimple *g = gimple_build_assign (vec_oprnd2, data_ref);
8009 182 : vect_finish_stmt_generation (vinfo, stmt_info, g, gsi);
8010 : }
8011 :
/* Log-step scan: each step permutes the running value V with perms[i],
   using vec_oprnd1 (or zero_vec for non-perm kinds) as the first input,
   and combines the permuted value with V using CODE.  */
8012 736 : tree v = vec_oprnd2;
8013 3068 : for (int i = 0; i < units_log2; ++i)
8014 : {
8015 2332 : tree new_temp = make_ssa_name (vectype);
8016 2332 : gimple *g = gimple_build_assign (new_temp, VEC_PERM_EXPR,
8017 : (zero_vec
8018 0 : && (use_whole_vector[i]
8019 0 : != scan_store_kind_perm))
8020 : ? zero_vec : vec_oprnd1, v,
8021 2332 : perms[i]);
8022 2332 : vect_finish_stmt_generation (vinfo, stmt_info, g, gsi);
8023 :
8024 2332 : if (zero_vec && use_whole_vector[i] == scan_store_kind_lshift_cond)
8025 : {
8026 : /* Whole vector shift shifted in zero bits, but if *init
8027 : is not initializer_zerop, we need to replace those elements
8028 : with elements from vec_oprnd1. */
8029 0 : tree_vector_builder vb (masktype, nunits, 1);
8030 0 : for (unsigned HOST_WIDE_INT k = 0; k < nunits; ++k)
8031 0 : vb.quick_push (k < (HOST_WIDE_INT_1U << i)
8032 : ? boolean_false_node : boolean_true_node);
8033 :
8034 0 : tree new_temp2 = make_ssa_name (vectype);
8035 0 : g = gimple_build_assign (new_temp2, VEC_COND_EXPR, vb.build (),
8036 : new_temp, vec_oprnd1);
8037 0 : vect_finish_stmt_generation (vinfo, stmt_info,
8038 : g, gsi);
8039 0 : new_temp = new_temp2;
8040 0 : }
8041 :
8042 : /* For exclusive scan, perform the perms[i] permutation once
8043 : more. */
/* The test on v makes this trigger only once: v is replaced by the
   permuted value and step 0 is redone via the decrement of i.  */
8044 2332 : if (i == 0
8045 1100 : && STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4
8046 728 : && v == vec_oprnd2)
8047 : {
8048 364 : v = new_temp;
8049 364 : --i;
8050 364 : continue;
8051 : }
8052 :
8053 1968 : tree new_temp2 = make_ssa_name (vectype);
8054 1968 : g = gimple_build_assign (new_temp2, code, v, new_temp);
8055 1968 : vect_finish_stmt_generation (vinfo, stmt_info, g, gsi);
8056 :
8057 1968 : v = new_temp2;
8058 : }
8059 :
/* Combine the carried-in value orig with the scan of this copy.  */
8060 736 : tree new_temp = make_ssa_name (vectype);
8061 736 : gimple *g = gimple_build_assign (new_temp, code, orig, v);
8062 736 : vect_finish_stmt_generation (vinfo, stmt_info, g, gsi);
8063 :
8064 736 : tree last_perm_arg = new_temp;
8065 : /* For exclusive scan, new_temp computed above is the exclusive scan
8066 : prefix sum. Turn it into inclusive prefix sum for the broadcast
8067 : of the last element into orig. */
8068 736 : if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4)
8069 : {
8070 364 : last_perm_arg = make_ssa_name (vectype);
8071 364 : g = gimple_build_assign (last_perm_arg, code, new_temp, vec_oprnd2);
8072 364 : vect_finish_stmt_generation (vinfo, stmt_info, g, gsi);
8073 : }
8074 :
/* New carry: the last element of last_perm_arg in every lane.  */
8075 736 : orig = make_ssa_name (vectype);
8076 2208 : g = gimple_build_assign (orig, VEC_PERM_EXPR, last_perm_arg,
8077 736 : last_perm_arg, perms[units_log2]);
8078 736 : vect_finish_stmt_generation (vinfo, stmt_info, g, gsi);
8079 :
/* Store to the non-inscan variable: write the scan result of this copy
   right away at the current offset.  */
8080 736 : if (!inscan_var_store)
8081 : {
8082 368 : tree data_ref = fold_build2 (MEM_REF, vectype,
8083 : unshare_expr (dataref_ptr),
8084 : dataref_offset);
8085 368 : vect_copy_ref_info (data_ref, DR_REF (dr_info->dr));
8086 368 : g = gimple_build_assign (data_ref, new_temp);
8087 368 : vect_finish_stmt_generation (vinfo, stmt_info, g, gsi);
8088 : }
8089 : }
8090 :
/* Store to the inscan variable: after all copies have been processed,
   write the final orig to every vector copy of the destination.  */
8091 512 : if (inscan_var_store)
8092 624 : for (unsigned j = 0; j < vec_oprnds3.length (); j++)
8093 : {
8094 368 : if (j != 0)
8095 112 : dataref_offset = int_const_binop (PLUS_EXPR, dataref_offset, bump);
8096 :
8097 368 : tree data_ref = fold_build2 (MEM_REF, vectype,
8098 : unshare_expr (dataref_ptr),
8099 : dataref_offset);
8100 368 : vect_copy_ref_info (data_ref, DR_REF (dr_info->dr));
8101 368 : gimple *g = gimple_build_assign (data_ref, orig);
8102 368 : vect_finish_stmt_generation (vinfo, stmt_info, g, gsi);
8103 : }
8104 512 : return true;
8105 512 : }
8106 :
8107 :
8108 : /* Function vectorizable_store.
8109 :
8110 : Check if STMT_INFO defines a non scalar data-ref (array/pointer/structure)
8111 : that can be vectorized.
8112 : If COST_VEC is passed, calculate costs but don't change anything,
8113 : otherwise, vectorize STMT_INFO: create a vectorized stmt to replace
8114 : it, and insert it at GSI.
8115 : Return true if STMT_INFO is vectorizable in this way. */
8116 :
8117 : static bool
8118 2077735 : vectorizable_store (vec_info *vinfo,
8119 : stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
8120 : slp_tree slp_node,
8121 : stmt_vector_for_cost *cost_vec)
8122 : {
8123 2077735 : tree data_ref;
8124 2077735 : tree vec_oprnd = NULL_TREE;
8125 2077735 : tree elem_type;
8126 2077735 : loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
8127 2077735 : class loop *loop = NULL;
8128 2077735 : machine_mode vec_mode;
8129 2077735 : tree dummy;
8130 2077735 : enum vect_def_type rhs_dt = vect_unknown_def_type;
8131 2077735 : enum vect_def_type mask_dt = vect_unknown_def_type;
8132 2077735 : tree dataref_ptr = NULL_TREE;
8133 2077735 : tree dataref_offset = NULL_TREE;
8134 2077735 : gimple *ptr_incr = NULL;
8135 2077735 : int j;
8136 2077735 : stmt_vec_info first_stmt_info;
8137 2077735 : bool grouped_store;
8138 2077735 : unsigned int group_size, i;
8139 2077735 : unsigned int vec_num;
8140 2077735 : bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
8141 2077735 : tree aggr_type;
8142 2077735 : poly_uint64 vf;
8143 2077735 : vec_load_store_type vls_type;
8144 2077735 : tree ref_type;
8145 :
8146 2077735 : if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
8147 : return false;
8148 :
8149 2077735 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8150 234683 : && cost_vec)
8151 : return false;
8152 :
8153 : /* Is vectorizable store? */
8154 :
8155 1843052 : tree mask_vectype = NULL_TREE;
8156 1843052 : slp_tree mask_node = NULL;
8157 1843052 : if (gassign *assign = dyn_cast <gassign *> (stmt_info->stmt))
8158 : {
8159 1771827 : tree scalar_dest = gimple_assign_lhs (assign);
8160 1771827 : if (TREE_CODE (scalar_dest) == VIEW_CONVERT_EXPR
8161 1771827 : && is_pattern_stmt_p (stmt_info))
8162 1672 : scalar_dest = TREE_OPERAND (scalar_dest, 0);
8163 1771827 : if (TREE_CODE (scalar_dest) != ARRAY_REF
8164 1771827 : && TREE_CODE (scalar_dest) != BIT_FIELD_REF
8165 : && TREE_CODE (scalar_dest) != INDIRECT_REF
8166 : && TREE_CODE (scalar_dest) != COMPONENT_REF
8167 : && TREE_CODE (scalar_dest) != IMAGPART_EXPR
8168 : && TREE_CODE (scalar_dest) != REALPART_EXPR
8169 : && TREE_CODE (scalar_dest) != MEM_REF)
8170 : return false;
8171 : }
8172 : else
8173 : {
8174 729107 : gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
8175 12445 : if (!call || !gimple_call_internal_p (call))
8176 : return false;
8177 :
8178 8326 : internal_fn ifn = gimple_call_internal_fn (call);
8179 8326 : if (!internal_store_fn_p (ifn))
8180 : return false;
8181 :
8182 1899 : int mask_index = internal_fn_mask_index (ifn);
8183 1899 : if (mask_index >= 0)
8184 1899 : mask_index = vect_slp_child_index_for_operand (stmt_info, mask_index);
8185 1899 : if (mask_index >= 0
8186 1899 : && !vect_check_scalar_mask (vinfo, slp_node, mask_index,
8187 : &mask_node, &mask_dt,
8188 : &mask_vectype))
8189 : return false;
8190 : }
8191 :
8192 1361105 : tree vectype = SLP_TREE_VECTYPE (slp_node), rhs_vectype = NULL_TREE;
8193 1361105 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8194 :
8195 1361105 : if (loop_vinfo)
8196 : {
8197 226694 : loop = LOOP_VINFO_LOOP (loop_vinfo);
8198 226694 : vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8199 : }
8200 : else
8201 : vf = 1;
8202 1361105 : vec_num = vect_get_num_copies (vinfo, slp_node);
8203 :
8204 : /* FORNOW. This restriction should be relaxed. */
8205 1361105 : if (loop
8206 1361380 : && nested_in_vect_loop_p (loop, stmt_info)
8207 1361388 : && vec_num > 1)
8208 : {
8209 8 : if (dump_enabled_p ())
8210 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8211 : "multiple types in nested loop.\n");
8212 8 : return false;
8213 : }
8214 :
8215 1361097 : slp_tree op_node;
8216 1361097 : if (!vect_check_store_rhs (vinfo, stmt_info, slp_node,
8217 : &op_node, &rhs_dt, &rhs_vectype, &vls_type))
8218 : return false;
8219 :
8220 1361073 : elem_type = TREE_TYPE (vectype);
8221 1361073 : vec_mode = TYPE_MODE (vectype);
8222 :
8223 1361073 : if (!STMT_VINFO_DATA_REF (stmt_info))
8224 : return false;
8225 :
8226 1361073 : vect_load_store_data _ls_data{};
8227 1361073 : vect_load_store_data &ls = slp_node->get_data (_ls_data);
8228 1361073 : if (cost_vec
8229 1361073 : && !get_load_store_type (vinfo, stmt_info, vectype, slp_node, mask_node,
8230 : vls_type, &_ls_data))
8231 : return false;
8232 : /* Temporary aliases to analysis data, should not be modified through
8233 : these. */
8234 1360457 : const vect_memory_access_type memory_access_type = ls.memory_access_type;
8235 1360457 : const dr_alignment_support alignment_support_scheme
8236 : = ls.alignment_support_scheme;
8237 1360457 : const int misalignment = ls.misalignment;
8238 1360457 : const poly_int64 poffset = ls.poffset;
8239 :
8240 1360457 : if (slp_node->ldst_lanes
8241 0 : && memory_access_type != VMAT_LOAD_STORE_LANES)
8242 : {
8243 0 : if (dump_enabled_p ())
8244 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8245 : "discovered store-lane but cannot use it.\n");
8246 0 : return false;
8247 : }
8248 :
8249 1360457 : if (mask_node)
8250 : {
8251 1809 : if (memory_access_type == VMAT_CONTIGUOUS)
8252 : {
8253 616 : if (!VECTOR_MODE_P (vec_mode)
8254 3086 : || !can_vec_mask_load_store_p (vec_mode,
8255 1543 : TYPE_MODE (mask_vectype), false))
8256 114 : return false;
8257 : }
8258 266 : else if (memory_access_type != VMAT_LOAD_STORE_LANES
8259 266 : && (!mat_gather_scatter_p (memory_access_type)
8260 242 : || (memory_access_type == VMAT_GATHER_SCATTER_LEGACY
8261 170 : && !VECTOR_BOOLEAN_TYPE_P (mask_vectype))))
8262 : {
8263 24 : if (dump_enabled_p ())
8264 24 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8265 : "unsupported access type for masked store.\n");
8266 24 : return false;
8267 : }
8268 242 : else if (memory_access_type == VMAT_GATHER_SCATTER_EMULATED)
8269 : {
8270 72 : if (dump_enabled_p ())
8271 24 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8272 : "unsupported masked emulated scatter.\n");
8273 72 : return false;
8274 : }
8275 : }
8276 : else
8277 : {
8278 : /* FORNOW. In some cases can vectorize even if data-type not supported
8279 : (e.g. - array initialization with 0). */
8280 1358648 : if (!can_implement_p (mov_optab, vec_mode))
8281 : return false;
8282 : }
8283 :
8284 1360247 : dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info), *first_dr_info = NULL;
8285 1360247 : grouped_store = (STMT_VINFO_GROUPED_ACCESS (stmt_info)
8286 2514575 : && !mat_gather_scatter_p (memory_access_type));
8287 1154328 : if (grouped_store)
8288 : {
8289 1154328 : first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
8290 1154328 : first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
8291 1154328 : group_size = DR_GROUP_SIZE (first_stmt_info);
8292 : }
8293 : else
8294 : {
8295 1360247 : first_stmt_info = stmt_info;
8296 1360247 : first_dr_info = dr_info;
8297 : group_size = 1;
8298 : }
8299 :
8300 1360247 : if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) > 1 && cost_vec)
8301 : {
8302 1076 : if (!check_scan_store (vinfo, stmt_info, vectype, rhs_dt, slp_node,
8303 : mask_node, memory_access_type))
8304 : return false;
8305 : }
8306 :
8307 2719726 : bool costing_p = cost_vec;
8308 1359479 : if (costing_p) /* transformation not required. */
8309 : {
8310 814525 : if (loop_vinfo
8311 162742 : && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8312 76534 : check_load_store_for_partial_vectors (loop_vinfo, vectype, slp_node,
8313 : vls_type, group_size, &ls,
8314 : mask_node);
8315 :
8316 814525 : if (!vect_maybe_update_slp_op_vectype (op_node, vectype)
8317 814525 : || (mask_node
8318 1055 : && !vect_maybe_update_slp_op_vectype (mask_node,
8319 : mask_vectype)))
8320 : {
8321 0 : if (dump_enabled_p ())
8322 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8323 : "incompatible vector types for invariants\n");
8324 0 : return false;
8325 : }
8326 :
8327 814525 : if (dump_enabled_p ()
8328 : && memory_access_type != VMAT_ELEMENTWISE
8329 15095 : && memory_access_type != VMAT_STRIDED_SLP
8330 14419 : && memory_access_type != VMAT_INVARIANT
8331 828944 : && alignment_support_scheme != dr_aligned)
8332 4997 : dump_printf_loc (MSG_NOTE, vect_location,
8333 : "Vectorizing an unaligned access.\n");
8334 : }
8335 :
8336 : /* Transform. */
8337 :
8338 1360247 : ensure_base_align (dr_info);
8339 :
8340 1360247 : if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) >= 3)
8341 : {
8342 1024 : gcc_assert (memory_access_type == VMAT_CONTIGUOUS);
8343 1024 : gcc_assert (SLP_TREE_LANES (slp_node) == 1);
8344 1024 : if (costing_p)
8345 : {
8346 512 : unsigned int inside_cost = 0, prologue_cost = 0;
8347 512 : if (vls_type == VLS_STORE_INVARIANT)
8348 0 : prologue_cost += record_stmt_cost (cost_vec, 1, scalar_to_vec,
8349 : slp_node, 0, vect_prologue);
8350 512 : vect_get_store_cost (vinfo, stmt_info, slp_node, 1,
8351 : alignment_support_scheme, misalignment,
8352 : &inside_cost, cost_vec);
8353 :
8354 512 : if (dump_enabled_p ())
8355 492 : dump_printf_loc (MSG_NOTE, vect_location,
8356 : "vect_model_store_cost: inside_cost = %d, "
8357 : "prologue_cost = %d .\n",
8358 : inside_cost, prologue_cost);
8359 :
8360 512 : SLP_TREE_TYPE (slp_node) = store_vec_info_type;
8361 512 : slp_node->data = new vect_load_store_data (std::move (ls));
8362 :
8363 512 : return true;
8364 : }
8365 512 : return vectorizable_scan_store (vinfo, stmt_info, slp_node, gsi);
8366 : }
8367 :
8368 : /* FORNOW */
8369 1359223 : gcc_assert (!grouped_store
8370 : || !loop
8371 : || !nested_in_vect_loop_p (loop, stmt_info));
8372 :
8373 1359223 : grouped_store = false;
8374 1359223 : first_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[0];
8375 1359223 : gcc_assert (!STMT_VINFO_GROUPED_ACCESS (first_stmt_info)
8376 : || (DR_GROUP_FIRST_ELEMENT (first_stmt_info) == first_stmt_info));
8377 1359223 : first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
8378 :
8379 1359223 : ref_type = get_group_alias_ptr_type (first_stmt_info);
8380 :
8381 1359223 : if (!costing_p && dump_enabled_p ())
8382 12261 : dump_printf_loc (MSG_NOTE, vect_location, "transform store.\n");
8383 :
8384 1359223 : if (memory_access_type == VMAT_ELEMENTWISE
8385 1359223 : || memory_access_type == VMAT_STRIDED_SLP)
8386 : {
8387 29195 : unsigned inside_cost = 0, prologue_cost = 0;
8388 29195 : gimple_stmt_iterator incr_gsi;
8389 29195 : bool insert_after;
8390 29195 : tree offvar = NULL_TREE;
8391 29195 : tree ivstep;
8392 29195 : tree running_off;
8393 29195 : tree stride_base, stride_step, alias_off;
8394 29195 : tree vec_oprnd = NULL_TREE;
8395 29195 : tree dr_offset;
8396 : /* Checked by get_load_store_type. */
8397 29195 : unsigned int const_nunits = nunits.to_constant ();
8398 :
8399 29195 : gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8400 29195 : gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
8401 :
8402 29195 : dr_offset = get_dr_vinfo_offset (vinfo, first_dr_info);
8403 29195 : stride_base
8404 29195 : = fold_build_pointer_plus
8405 : (DR_BASE_ADDRESS (first_dr_info->dr),
8406 : size_binop (PLUS_EXPR,
8407 : convert_to_ptrofftype (dr_offset),
8408 : convert_to_ptrofftype (DR_INIT (first_dr_info->dr))));
8409 29195 : stride_step = fold_convert (sizetype, DR_STEP (first_dr_info->dr));
8410 :
8411 : /* For a store with loop-invariant (but other than power-of-2)
8412 : stride (i.e. not a grouped access) like so:
8413 :
8414 : for (i = 0; i < n; i += stride)
8415 : array[i] = ...;
8416 :
8417 : we generate a new induction variable and new stores from
8418 : the components of the (vectorized) rhs:
8419 :
8420 : for (j = 0; ; j += VF*stride)
8421 : vectemp = ...;
8422 : tmp1 = vectemp[0];
8423 : array[j] = tmp1;
8424 : tmp2 = vectemp[1];
8425 : array[j + stride] = tmp2;
8426 : ...
8427 : */
8428 :
8429 : /* ??? Modify local copies of alignment_support_scheme and
8430 : misalignment, but this part of analysis should be done
8431 : earlier and remembered, likewise the chosen load mode. */
8432 29195 : const dr_alignment_support tem = alignment_support_scheme;
8433 29195 : dr_alignment_support alignment_support_scheme = tem;
8434 29195 : const int tem2 = misalignment;
8435 29195 : int misalignment = tem2;
8436 :
8437 29195 : unsigned nstores = const_nunits;
8438 29195 : unsigned lnel = 1;
8439 29195 : tree ltype = elem_type;
8440 29195 : tree lvectype = vectype;
8441 29195 : HOST_WIDE_INT n = gcd (group_size, const_nunits);
8442 29195 : if (n == const_nunits)
8443 : {
8444 2939 : int mis_align = dr_misalignment (first_dr_info, vectype);
8445 : /* With VF > 1 we advance the DR by step, if that is constant
8446 : and only aligned when performed VF times, DR alignment
8447 : analysis can analyze this as aligned since it assumes
8448 : contiguous accesses. But that is not how we code generate
8449 : here, so adjust for this. */
8450 2939 : if (maybe_gt (vf, 1u)
8451 4464 : && !multiple_p (DR_STEP_ALIGNMENT (first_dr_info->dr),
8452 4235 : DR_TARGET_ALIGNMENT (first_dr_info)))
8453 229 : mis_align = -1;
8454 2939 : dr_alignment_support dr_align
8455 2939 : = vect_supportable_dr_alignment (vinfo, dr_info, vectype,
8456 : mis_align);
8457 2939 : if (dr_align == dr_aligned
8458 2939 : || dr_align == dr_unaligned_supported)
8459 : {
8460 29195 : nstores = 1;
8461 29195 : lnel = const_nunits;
8462 29195 : ltype = vectype;
8463 29195 : lvectype = vectype;
8464 29195 : alignment_support_scheme = dr_align;
8465 29195 : misalignment = mis_align;
8466 : }
8467 : }
8468 26256 : else if (n > 1)
8469 : {
8470 1967 : nstores = const_nunits / n;
8471 1967 : lnel = n;
8472 1967 : ltype = build_vector_type (elem_type, n);
8473 1967 : lvectype = vectype;
8474 1967 : int mis_align = dr_misalignment (first_dr_info, ltype);
8475 1967 : if (maybe_gt (vf, 1u)
8476 3934 : && !multiple_p (DR_STEP_ALIGNMENT (first_dr_info->dr),
8477 3292 : DR_TARGET_ALIGNMENT (first_dr_info)))
8478 642 : mis_align = -1;
8479 1967 : dr_alignment_support dr_align
8480 1967 : = vect_supportable_dr_alignment (vinfo, dr_info, ltype,
8481 : mis_align);
8482 1967 : alignment_support_scheme = dr_align;
8483 1967 : misalignment = mis_align;
8484 :
8485 : /* First check if vec_extract optab doesn't support extraction
8486 : of vector elts directly. */
8487 1967 : scalar_mode elmode = SCALAR_TYPE_MODE (elem_type);
8488 1967 : machine_mode vmode;
8489 3934 : if (!VECTOR_MODE_P (TYPE_MODE (vectype))
8490 2139 : || !related_vector_mode (TYPE_MODE (vectype), elmode,
8491 1967 : n).exists (&vmode)
8492 1773 : || (convert_optab_handler (vec_extract_optab,
8493 1773 : TYPE_MODE (vectype), vmode)
8494 : == CODE_FOR_nothing)
8495 1967 : || !(dr_align == dr_aligned
8496 172 : || dr_align == dr_unaligned_supported))
8497 : {
8498 : /* Try to avoid emitting an extract of vector elements
8499 : by performing the extracts using an integer type of the
8500 : same size, extracting from a vector of those and then
8501 : re-interpreting it as the original vector type if
8502 : supported. */
8503 1795 : unsigned lsize = n * GET_MODE_BITSIZE (elmode);
8504 1795 : unsigned int lnunits = const_nunits / n;
8505 : /* If we can't construct such a vector fall back to
8506 : element extracts from the original vector type and
8507 : element size stores. */
8508 1795 : if (int_mode_for_size (lsize, 0).exists (&elmode)
8509 1795 : && VECTOR_MODE_P (TYPE_MODE (vectype))
8510 1795 : && related_vector_mode (TYPE_MODE (vectype), elmode,
8511 1795 : lnunits).exists (&vmode)
8512 1767 : && (convert_optab_handler (vec_extract_optab,
8513 : vmode, elmode)
8514 : != CODE_FOR_nothing))
8515 : {
8516 1767 : nstores = lnunits;
8517 1767 : lnel = n;
8518 1767 : ltype = build_nonstandard_integer_type (lsize, 1);
8519 1767 : lvectype = build_vector_type (ltype, nstores);
8520 : }
8521 : /* Else fall back to vector extraction anyway.
8522 : Fewer stores are more important than avoiding spilling
8523 : of the vector we extract from. Compared to the
8524 : construction case in vectorizable_load no store-forwarding
8525 : issue exists here for reasonable archs. But only
8526 : if the store is supported. */
8527 28 : else if (!(dr_align == dr_aligned
8528 28 : || dr_align == dr_unaligned_supported))
8529 : {
8530 29195 : nstores = const_nunits;
8531 29195 : lnel = 1;
8532 29195 : ltype = elem_type;
8533 29195 : lvectype = vectype;
8534 : }
8535 : }
8536 : }
8537 :
8538 29195 : if (costing_p)
8539 : {
8540 : /* Record the decomposition type for target access during costing. */
8541 25791 : ls.ls_type = lvectype;
8542 25791 : ls.ls_eltype = ltype;
8543 : }
8544 : else
8545 3404 : gcc_assert (ls.ls_type == lvectype && ls.ls_eltype == ltype);
8546 :
8547 29195 : unsigned align;
8548 29195 : if (alignment_support_scheme == dr_aligned)
8549 1241 : align = known_alignment (DR_TARGET_ALIGNMENT (first_dr_info));
8550 : else
8551 27954 : align = dr_alignment (vect_dr_behavior (vinfo, first_dr_info));
8552 : /* Alignment is at most the access size if we do multiple stores. */
8553 29195 : if (nstores > 1)
8554 26256 : align = MIN (tree_to_uhwi (TYPE_SIZE_UNIT (ltype)), align);
8555 29195 : ltype = build_aligned_type (ltype, align * BITS_PER_UNIT);
8556 29195 : int ncopies = vec_num;
8557 :
8558 29195 : if (!costing_p)
8559 : {
8560 3404 : ivstep = stride_step;
8561 3404 : ivstep = fold_build2 (MULT_EXPR, TREE_TYPE (ivstep), ivstep,
8562 : build_int_cst (TREE_TYPE (ivstep), vf));
8563 :
8564 3404 : standard_iv_increment_position (loop, &incr_gsi, &insert_after);
8565 :
8566 3404 : stride_base = cse_and_gimplify_to_preheader (loop_vinfo, stride_base);
8567 3404 : ivstep = cse_and_gimplify_to_preheader (loop_vinfo, ivstep);
8568 3404 : create_iv (stride_base, PLUS_EXPR, ivstep, NULL, loop, &incr_gsi,
8569 : insert_after, &offvar, NULL);
8570 :
8571 3404 : stride_step = cse_and_gimplify_to_preheader (loop_vinfo, stride_step);
8572 : }
8573 :
8574 29195 : alias_off = build_int_cst (ref_type, 0);
8575 29195 : auto_vec<tree> vec_oprnds;
8576 : /* For costing some adjacent vector stores, we'd like to cost with
8577 : the total number of them once instead of cost each one by one. */
8578 29195 : unsigned int n_adjacent_stores = 0;
8579 29195 : running_off = offvar;
8580 29195 : if (!costing_p)
8581 3404 : vect_get_slp_defs (op_node, &vec_oprnds);
8582 29195 : unsigned int group_el = 0;
8583 29195 : unsigned HOST_WIDE_INT elsz
8584 29195 : = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
8585 69636 : for (j = 0; j < ncopies; j++)
8586 : {
8587 40441 : if (!costing_p)
8588 : {
8589 5265 : vec_oprnd = vec_oprnds[j];
8590 : /* Pun the vector to extract from if necessary. */
8591 5265 : if (lvectype != vectype)
8592 : {
8593 1008 : tree tem = make_ssa_name (lvectype);
8594 1008 : tree cvt = build1 (VIEW_CONVERT_EXPR, lvectype, vec_oprnd);
8595 1008 : gimple *pun = gimple_build_assign (tem, cvt);
8596 1008 : vect_finish_stmt_generation (vinfo, stmt_info, pun, gsi);
8597 1008 : vec_oprnd = tem;
8598 : }
8599 : }
8600 179845 : for (i = 0; i < nstores; i++)
8601 : {
8602 139404 : if (costing_p)
8603 : {
8604 123355 : n_adjacent_stores++;
8605 123355 : continue;
8606 : }
8607 16049 : tree newref, newoff;
8608 16049 : gimple *incr, *assign;
8609 16049 : tree size = TYPE_SIZE (ltype);
8610 : /* Extract the i'th component. */
8611 16049 : tree pos = fold_build2 (MULT_EXPR, bitsizetype,
8612 : bitsize_int (i), size);
8613 16049 : tree elem = fold_build3 (BIT_FIELD_REF, ltype, vec_oprnd,
8614 : size, pos);
8615 :
8616 16049 : elem = force_gimple_operand_gsi (gsi, elem, true, NULL_TREE, true,
8617 : GSI_SAME_STMT);
8618 :
8619 16049 : tree this_off = build_int_cst (TREE_TYPE (alias_off),
8620 16049 : group_el * elsz);
8621 16049 : newref = build2 (MEM_REF, ltype, running_off, this_off);
8622 16049 : vect_copy_ref_info (newref, DR_REF (first_dr_info->dr));
8623 :
8624 : /* And store it to *running_off. */
8625 16049 : assign = gimple_build_assign (newref, elem);
8626 16049 : vect_finish_stmt_generation (vinfo, stmt_info, assign, gsi);
8627 :
8628 16049 : group_el += lnel;
8629 16049 : if (group_el == group_size)
8630 : {
8631 14412 : newoff = copy_ssa_name (running_off, NULL);
8632 14412 : incr = gimple_build_assign (newoff, POINTER_PLUS_EXPR,
8633 : running_off, stride_step);
8634 14412 : vect_finish_stmt_generation (vinfo, stmt_info, incr, gsi);
8635 :
8636 14412 : running_off = newoff;
8637 14412 : group_el = 0;
8638 : }
8639 : }
8640 : }
8641 :
8642 29195 : if (costing_p)
8643 : {
8644 25791 : if (n_adjacent_stores > 0)
8645 : {
8646 : /* Take a single lane vector type store as scalar
8647 : store to avoid ICE like 110776. */
8648 25791 : if (VECTOR_TYPE_P (ltype)
8649 25791 : && maybe_ne (TYPE_VECTOR_SUBPARTS (ltype), 1U))
8650 1612 : vect_get_store_cost (vinfo, stmt_info, slp_node,
8651 : n_adjacent_stores, alignment_support_scheme,
8652 : misalignment, &inside_cost, cost_vec);
8653 : else
8654 24179 : inside_cost
8655 24179 : += record_stmt_cost (cost_vec, n_adjacent_stores,
8656 : scalar_store, slp_node, 0, vect_body);
8657 : /* Only need vector deconstruction when there is more
8658 : than one store. */
8659 25791 : if (nstores > 1)
8660 23763 : inside_cost
8661 23763 : += record_stmt_cost (cost_vec, ncopies,
8662 : vec_deconstruct, slp_node, 0, vect_body);
8663 : }
8664 25791 : if (dump_enabled_p ())
8665 676 : dump_printf_loc (MSG_NOTE, vect_location,
8666 : "vect_model_store_cost: inside_cost = %d, "
8667 : "prologue_cost = %d .\n",
8668 : inside_cost, prologue_cost);
8669 :
8670 25791 : SLP_TREE_TYPE (slp_node) = store_vec_info_type;
8671 25791 : slp_node->data = new vect_load_store_data (std::move (ls));
8672 : }
8673 :
8674 29195 : return true;
8675 29195 : }
8676 :
8677 1330028 : gcc_assert (alignment_support_scheme);
8678 1330028 : vec_loop_masks *loop_masks
8679 195617 : = (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8680 1330028 : ? &LOOP_VINFO_MASKS (loop_vinfo)
8681 11 : : NULL);
8682 11 : vec_loop_lens *loop_lens
8683 195617 : = (loop_vinfo && LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)
8684 : ? &LOOP_VINFO_LENS (loop_vinfo)
8685 0 : : NULL);
8686 :
8687 : /* The vect_transform_stmt and vect_analyze_stmt will go here but there
8688 : are some difference here. We cannot enable both the lens and masks
8689 : during transform but it is allowed during analysis.
8690 : Shouldn't go with length-based approach if fully masked. */
8691 1330028 : if (cost_vec == NULL)
8692 : /* The cost_vec is NULL during transform. */
8693 541806 : gcc_assert ((!loop_lens || !loop_masks));
8694 :
8695 : /* Targets with store-lane instructions must not require explicit
8696 : realignment. vect_supportable_dr_alignment always returns either
8697 : dr_aligned or dr_unaligned_supported for masked operations. */
8698 1330028 : gcc_assert ((memory_access_type != VMAT_LOAD_STORE_LANES
8699 : && !mask_node
8700 : && !loop_masks)
8701 : || alignment_support_scheme == dr_aligned
8702 : || alignment_support_scheme == dr_unaligned_supported);
8703 :
8704 1330028 : tree offset = NULL_TREE;
8705 1330028 : if (!known_eq (poffset, 0))
8706 4651 : offset = size_int (poffset);
8707 :
8708 1330028 : tree bump;
8709 1330028 : tree vec_offset = NULL_TREE;
8710 1330028 : if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
8711 : {
8712 1460 : aggr_type = NULL_TREE;
8713 1460 : bump = NULL_TREE;
8714 : }
8715 1328568 : else if (mat_gather_scatter_p (memory_access_type))
8716 : {
8717 0 : aggr_type = elem_type;
8718 0 : if (!costing_p)
8719 : {
8720 0 : tree vtype = ls.ls_type ? ls.ls_type : vectype;
8721 0 : vect_get_strided_load_store_ops (stmt_info, slp_node, vtype,
8722 : ls.strided_offset_vectype,
8723 : loop_vinfo, gsi,
8724 : &bump, &vec_offset, loop_lens);
8725 : }
8726 : }
8727 : else
8728 : {
8729 1328568 : if (memory_access_type == VMAT_LOAD_STORE_LANES)
8730 0 : aggr_type = build_array_type_nelts (elem_type, group_size * nunits);
8731 : else
8732 : aggr_type = vectype;
8733 1328568 : if (!costing_p)
8734 541333 : bump = vect_get_data_ptr_increment (vinfo, gsi, dr_info, aggr_type,
8735 : memory_access_type, loop_lens);
8736 : }
8737 :
8738 1330028 : if (loop_vinfo && mask_node && !costing_p)
8739 544 : LOOP_VINFO_HAS_MASK_STORE (loop_vinfo) = true;
8740 :
8741 : /* In case the vectorization factor (VF) is bigger than the number
8742 : of elements that we can fit in a vectype (nunits), we have to generate
8743 : more than one vector stmt - i.e - we need to "unroll" the
8744 : vector stmt by a factor VF/nunits. */
8745 :
8746 1330028 : auto_vec<tree> dr_chain (group_size);
8747 1330028 : auto_vec<tree> vec_masks;
8748 1330028 : tree vec_mask = NULL;
8749 1330028 : auto_delete_vec<auto_vec<tree>> gvec_oprnds (group_size);
8750 5993859 : for (i = 0; i < group_size; i++)
8751 3333803 : gvec_oprnds.quick_push (new auto_vec<tree> ());
8752 :
8753 1330028 : if (memory_access_type == VMAT_LOAD_STORE_LANES)
8754 : {
8755 0 : const internal_fn lanes_ifn = ls.lanes_ifn;
8756 :
8757 0 : if (costing_p)
8758 : /* Update all incoming store operand nodes, the general handling
8759 : above only handles the mask and the first store operand node. */
8760 0 : for (slp_tree child : SLP_TREE_CHILDREN (slp_node))
8761 0 : if (child != mask_node
8762 0 : && !vect_maybe_update_slp_op_vectype (child, vectype))
8763 : {
8764 0 : if (dump_enabled_p ())
8765 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8766 : "incompatible vector types for invariants\n");
8767 0 : return false;
8768 : }
8769 0 : unsigned inside_cost = 0, prologue_cost = 0;
8770 : /* For costing some adjacent vector stores, we'd like to cost with
8771 : the total number of them once instead of cost each one by one. */
8772 0 : unsigned int n_adjacent_stores = 0;
8773 0 : int ncopies = vec_num / group_size;
8774 0 : for (j = 0; j < ncopies; j++)
8775 : {
8776 0 : if (j == 0)
8777 : {
8778 0 : if (!costing_p)
8779 : {
8780 0 : if (mask_node)
8781 : {
8782 0 : vect_get_slp_defs (mask_node, &vec_masks);
8783 0 : vec_mask = vec_masks[0];
8784 : }
8785 0 : dataref_ptr
8786 0 : = vect_create_data_ref_ptr (vinfo, first_stmt_info,
8787 : aggr_type, NULL, offset, &dummy,
8788 : gsi, &ptr_incr, false, bump);
8789 : }
8790 : }
8791 0 : else if (!costing_p)
8792 : {
8793 0 : gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
8794 0 : if (mask_node)
8795 0 : vec_mask = vec_masks[j];
8796 0 : dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
8797 : stmt_info, bump);
8798 : }
8799 :
8800 0 : if (costing_p)
8801 : {
8802 0 : n_adjacent_stores += group_size;
8803 0 : continue;
8804 : }
8805 :
8806 : /* Get an array into which we can store the individual vectors. */
8807 0 : tree vec_array = create_vector_array (vectype, group_size);
8808 :
8809 : /* Invalidate the current contents of VEC_ARRAY. This should
8810 : become an RTL clobber too, which prevents the vector registers
8811 : from being upward-exposed. */
8812 0 : vect_clobber_variable (vinfo, stmt_info, gsi, vec_array);
8813 :
8814 : /* Store the individual vectors into the array. */
8815 0 : for (i = 0; i < group_size; i++)
8816 : {
8817 0 : slp_tree child;
8818 0 : if (i == 0 || !mask_node)
8819 0 : child = SLP_TREE_CHILDREN (slp_node)[i];
8820 : else
8821 0 : child = SLP_TREE_CHILDREN (slp_node)[i + 1];
8822 0 : vec_oprnd = SLP_TREE_VEC_DEFS (child)[j];
8823 0 : write_vector_array (vinfo, stmt_info, gsi, vec_oprnd, vec_array,
8824 : i);
8825 : }
8826 :
8827 0 : tree final_mask = NULL;
8828 0 : tree final_len = NULL;
8829 0 : tree bias = NULL;
8830 0 : if (loop_masks)
8831 0 : final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
8832 : ncopies, vectype, j);
8833 0 : if (vec_mask)
8834 0 : final_mask = prepare_vec_mask (loop_vinfo, mask_vectype, final_mask,
8835 : vec_mask, gsi);
8836 :
8837 0 : if (lanes_ifn == IFN_MASK_LEN_STORE_LANES)
8838 : {
8839 0 : if (loop_lens)
8840 0 : final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
8841 : ncopies, vectype, j, 1, true);
8842 : else
8843 0 : final_len = size_int (TYPE_VECTOR_SUBPARTS (vectype));
8844 0 : signed char biasval
8845 0 : = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
8846 0 : bias = build_int_cst (intQI_type_node, biasval);
8847 0 : if (!final_mask)
8848 : {
8849 0 : mask_vectype = truth_type_for (vectype);
8850 0 : final_mask = build_minus_one_cst (mask_vectype);
8851 : }
8852 : }
8853 :
8854 0 : gcall *call;
8855 0 : if (final_len && final_mask)
8856 : {
8857 : /* Emit:
8858 : MASK_LEN_STORE_LANES (DATAREF_PTR, ALIAS_PTR, VEC_MASK,
8859 : LEN, BIAS, VEC_ARRAY). */
8860 0 : unsigned int align = TYPE_ALIGN (TREE_TYPE (vectype));
8861 0 : tree alias_ptr = build_int_cst (ref_type, align);
8862 0 : call = gimple_build_call_internal (IFN_MASK_LEN_STORE_LANES, 6,
8863 : dataref_ptr, alias_ptr,
8864 : final_mask, final_len, bias,
8865 : vec_array);
8866 : }
8867 0 : else if (final_mask)
8868 : {
8869 : /* Emit:
8870 : MASK_STORE_LANES (DATAREF_PTR, ALIAS_PTR, VEC_MASK,
8871 : VEC_ARRAY). */
8872 0 : unsigned int align = TYPE_ALIGN (TREE_TYPE (vectype));
8873 0 : tree alias_ptr = build_int_cst (ref_type, align);
8874 0 : call = gimple_build_call_internal (IFN_MASK_STORE_LANES, 4,
8875 : dataref_ptr, alias_ptr,
8876 : final_mask, vec_array);
8877 : }
8878 : else
8879 : {
8880 : /* Emit:
8881 : MEM_REF[...all elements...] = STORE_LANES (VEC_ARRAY). */
8882 0 : data_ref = create_array_ref (aggr_type, dataref_ptr, ref_type);
8883 0 : call = gimple_build_call_internal (IFN_STORE_LANES, 1, vec_array);
8884 0 : gimple_call_set_lhs (call, data_ref);
8885 : }
8886 0 : gimple_call_set_nothrow (call, true);
8887 0 : vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
8888 :
8889 : /* Record that VEC_ARRAY is now dead. */
8890 0 : vect_clobber_variable (vinfo, stmt_info, gsi, vec_array);
8891 : }
8892 :
8893 0 : if (costing_p)
8894 : {
8895 0 : if (n_adjacent_stores > 0)
8896 0 : vect_get_store_cost (vinfo, stmt_info, slp_node, n_adjacent_stores,
8897 : alignment_support_scheme, misalignment,
8898 : &inside_cost, cost_vec);
8899 0 : if (dump_enabled_p ())
8900 0 : dump_printf_loc (MSG_NOTE, vect_location,
8901 : "vect_model_store_cost: inside_cost = %d, "
8902 : "prologue_cost = %d .\n",
8903 : inside_cost, prologue_cost);
8904 :
8905 0 : SLP_TREE_TYPE (slp_node) = store_vec_info_type;
8906 0 : slp_node->data = new vect_load_store_data (std::move (ls));
8907 : }
8908 :
8909 0 : return true;
8910 : }
8911 :
8912 1330028 : if (mat_gather_scatter_p (memory_access_type))
8913 : {
8914 1460 : gcc_assert (!grouped_store || ls.ls_type);
8915 1460 : if (ls.ls_type)
8916 0 : vectype = ls.ls_type;
8917 1460 : auto_vec<tree> vec_offsets;
8918 1460 : unsigned int inside_cost = 0, prologue_cost = 0;
8919 1460 : int num_stmts = vec_num;
8920 3319 : for (j = 0; j < num_stmts; j++)
8921 : {
8922 1859 : gimple *new_stmt;
8923 1859 : if (j == 0)
8924 : {
8925 1460 : if (costing_p && vls_type == VLS_STORE_INVARIANT)
8926 210 : prologue_cost += record_stmt_cost (cost_vec, 1, scalar_to_vec,
8927 : slp_node, 0, vect_prologue);
8928 : else if (!costing_p)
8929 : {
8930 : /* Since the store is not grouped, DR_GROUP_SIZE is 1, and
8931 : DR_CHAIN is of size 1. */
8932 473 : gcc_assert (group_size == 1);
8933 473 : vect_get_slp_defs (op_node, gvec_oprnds[0]);
8934 473 : if (mask_node)
8935 70 : vect_get_slp_defs (mask_node, &vec_masks);
8936 :
8937 473 : if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
8938 473 : vect_get_gather_scatter_ops (loop, slp_node,
8939 : &dataref_ptr, &vec_offsets);
8940 : else
8941 0 : dataref_ptr
8942 0 : = vect_create_data_ref_ptr (vinfo, first_stmt_info,
8943 : aggr_type, NULL, offset,
8944 : &dummy, gsi, &ptr_incr, false,
8945 : bump);
8946 : }
8947 : }
8948 399 : else if (!costing_p)
8949 : {
8950 35 : gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
8951 35 : if (!STMT_VINFO_GATHER_SCATTER_P (stmt_info))
8952 0 : dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
8953 : gsi, stmt_info, bump);
8954 : }
8955 :
8956 2577 : new_stmt = NULL;
8957 718 : if (!costing_p)
8958 : {
8959 508 : vec_oprnd = (*gvec_oprnds[0])[j];
8960 508 : if (mask_node)
8961 90 : vec_mask = vec_masks[j];
8962 : /* We should have caught mismatched types earlier. */
8963 508 : gcc_assert (ls.ls_type
8964 : || useless_type_conversion_p
8965 : (vectype, TREE_TYPE (vec_oprnd)));
8966 : }
8967 508 : tree final_mask = NULL_TREE;
8968 2367 : tree final_len = NULL_TREE;
8969 2367 : tree bias = NULL_TREE;
8970 508 : if (!costing_p)
8971 : {
8972 508 : if (loop_masks)
8973 0 : final_mask = vect_get_loop_mask (loop_vinfo, gsi,
8974 : loop_masks, num_stmts,
8975 : vectype, j);
8976 508 : if (vec_mask)
8977 90 : final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
8978 : final_mask, vec_mask, gsi);
8979 : }
8980 :
8981 1859 : unsigned align = get_object_alignment (DR_REF (first_dr_info->dr));
8982 1859 : tree alias_align_ptr = build_int_cst (ref_type, align);
8983 1859 : if (memory_access_type == VMAT_GATHER_SCATTER_IFN)
8984 : {
8985 0 : if (costing_p)
8986 : {
8987 0 : if (ls.supported_offset_vectype
8988 0 : && !tree_nop_conversion_p (ls.supported_offset_vectype,
8989 : vec_offset))
8990 0 : inside_cost
8991 0 : += record_stmt_cost (cost_vec, 1, vector_stmt,
8992 : slp_node, 0, vect_body);
8993 0 : if (ls.supported_scale)
8994 0 : inside_cost
8995 0 : += record_stmt_cost (cost_vec, 1, vector_stmt,
8996 : slp_node, 0, vect_body);
8997 :
8998 0 : unsigned int cnunits = vect_nunits_for_cost (vectype);
8999 0 : inside_cost
9000 0 : += record_stmt_cost (cost_vec, cnunits, scalar_store,
9001 : slp_node, 0, vect_body);
9002 1859 : continue;
9003 0 : }
9004 :
9005 0 : if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
9006 0 : vec_offset = vec_offsets[j];
9007 :
9008 0 : tree scale = size_int (SLP_TREE_GS_SCALE (slp_node));
9009 0 : bool strided = !VECTOR_TYPE_P (TREE_TYPE (vec_offset));
9010 :
9011 : /* Perform the offset conversion and scaling if necessary. */
9012 0 : if (!strided
9013 0 : && (ls.supported_offset_vectype || ls.supported_scale))
9014 : {
9015 0 : gimple_seq stmts = NULL;
9016 0 : if (ls.supported_offset_vectype)
9017 0 : vec_offset = gimple_convert
9018 0 : (&stmts, ls.supported_offset_vectype, vec_offset);
9019 0 : if (ls.supported_scale)
9020 : {
9021 : /* Only scale the vec_offset if we haven't already. */
9022 0 : if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
9023 0 : || j == 0)
9024 : {
9025 0 : tree mult_cst = build_int_cst
9026 0 : (TREE_TYPE (TREE_TYPE (vec_offset)),
9027 0 : SLP_TREE_GS_SCALE (slp_node) / ls.supported_scale);
9028 0 : tree mult = build_vector_from_val
9029 0 : (TREE_TYPE (vec_offset), mult_cst);
9030 0 : vec_offset = gimple_build
9031 0 : (&stmts, MULT_EXPR, TREE_TYPE (vec_offset),
9032 : vec_offset, mult);
9033 : }
9034 0 : scale = size_int (ls.supported_scale);
9035 : }
9036 0 : gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
9037 : }
9038 :
9039 0 : if (ls.gs.ifn == IFN_MASK_LEN_SCATTER_STORE)
9040 : {
9041 0 : if (loop_lens)
9042 0 : final_len = vect_get_loop_len (loop_vinfo, gsi,
9043 : loop_lens, num_stmts,
9044 : vectype, j, 1, true);
9045 : else
9046 0 : final_len = size_int (TYPE_VECTOR_SUBPARTS (vectype));
9047 :
9048 0 : signed char biasval
9049 0 : = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
9050 0 : bias = build_int_cst (intQI_type_node, biasval);
9051 0 : if (!final_mask)
9052 : {
9053 0 : mask_vectype = truth_type_for (vectype);
9054 0 : final_mask = build_minus_one_cst (mask_vectype);
9055 : }
9056 : }
9057 :
9058 0 : if (ls.ls_type)
9059 : {
9060 0 : gimple *conv_stmt
9061 0 : = gimple_build_assign (make_ssa_name (vectype),
9062 : VIEW_CONVERT_EXPR,
9063 : build1 (VIEW_CONVERT_EXPR, vectype,
9064 : vec_oprnd));
9065 0 : vect_finish_stmt_generation (vinfo, stmt_info, conv_stmt,
9066 : gsi);
9067 0 : vec_oprnd = gimple_get_lhs (conv_stmt);
9068 : }
9069 :
9070 0 : gcall *call;
9071 0 : if (final_len && final_mask)
9072 : {
9073 0 : if (VECTOR_TYPE_P (TREE_TYPE (vec_offset)))
9074 0 : call = gimple_build_call_internal (
9075 : IFN_MASK_LEN_SCATTER_STORE, 8, dataref_ptr,
9076 : alias_align_ptr,
9077 : vec_offset, scale, vec_oprnd, final_mask, final_len,
9078 : bias);
9079 : else
9080 : /* Non-vector offset indicates that prefer to take
9081 : MASK_LEN_STRIDED_STORE instead of the
9082 : IFN_MASK_SCATTER_STORE with direct stride arg.
9083 : Similar to the gather case we have checked the
9084 : alignment for a scatter already and assume
9085 : that the strided store has the same requirements. */
9086 0 : call = gimple_build_call_internal (
9087 : IFN_MASK_LEN_STRIDED_STORE, 6, dataref_ptr,
9088 : vec_offset, vec_oprnd, final_mask, final_len, bias);
9089 : }
9090 0 : else if (final_mask)
9091 0 : call = gimple_build_call_internal
9092 0 : (IFN_MASK_SCATTER_STORE, 6, dataref_ptr,
9093 : alias_align_ptr,
9094 : vec_offset, scale, vec_oprnd, final_mask);
9095 : else
9096 0 : call = gimple_build_call_internal (IFN_SCATTER_STORE, 5,
9097 : dataref_ptr,
9098 : alias_align_ptr,
9099 : vec_offset,
9100 : scale, vec_oprnd);
9101 0 : gimple_call_set_nothrow (call, true);
9102 0 : vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
9103 0 : new_stmt = call;
9104 : }
9105 1859 : else if (memory_access_type == VMAT_GATHER_SCATTER_LEGACY)
9106 : {
9107 : /* The builtin decls path for scatter is legacy, x86 only. */
9108 330 : gcc_assert (nunits.is_constant ()
9109 : && (!final_mask
9110 : || SCALAR_INT_MODE_P
9111 : (TYPE_MODE (TREE_TYPE (final_mask)))));
9112 330 : if (costing_p)
9113 : {
9114 199 : unsigned int cnunits = vect_nunits_for_cost (vectype);
9115 199 : inside_cost
9116 199 : += record_stmt_cost (cost_vec, cnunits, scalar_store,
9117 : slp_node, 0, vect_body);
9118 199 : continue;
9119 199 : }
9120 :
9121 131 : tree offset_vectype = TREE_TYPE (vec_offsets[0]);
9122 131 : poly_uint64 offset_nunits
9123 131 : = TYPE_VECTOR_SUBPARTS (offset_vectype);
9124 131 : if (known_eq (nunits, offset_nunits))
9125 : {
9126 55 : new_stmt = vect_build_one_scatter_store_call
9127 110 : (vinfo, stmt_info, slp_node, gsi,
9128 55 : ls.gs.decl, dataref_ptr, vec_offsets[j],
9129 : vec_oprnd, final_mask);
9130 55 : vect_finish_stmt_generation (vinfo, stmt_info,
9131 : new_stmt, gsi);
9132 : }
9133 76 : else if (known_eq (nunits, offset_nunits * 2))
9134 : {
9135 : /* We have a offset vector with half the number of
9136 : lanes but the builtins will store full vectype
9137 : data from the lower lanes. */
9138 30 : new_stmt = vect_build_one_scatter_store_call
9139 60 : (vinfo, stmt_info, slp_node, gsi, ls.gs.decl,
9140 30 : dataref_ptr, vec_offsets[2 * j],
9141 : vec_oprnd, final_mask);
9142 30 : vect_finish_stmt_generation (vinfo, stmt_info,
9143 : new_stmt, gsi);
9144 30 : int count = nunits.to_constant ();
9145 30 : vec_perm_builder sel (count, count, 1);
9146 30 : sel.quick_grow (count);
9147 382 : for (int i = 0; i < count; ++i)
9148 352 : sel[i] = i | (count / 2);
9149 30 : vec_perm_indices indices (sel, 2, count);
9150 30 : tree perm_mask
9151 30 : = vect_gen_perm_mask_checked (vectype, indices);
9152 30 : new_stmt = gimple_build_assign (NULL_TREE, VEC_PERM_EXPR,
9153 : vec_oprnd, vec_oprnd,
9154 : perm_mask);
9155 30 : vec_oprnd = make_ssa_name (vectype);
9156 30 : gimple_set_lhs (new_stmt, vec_oprnd);
9157 30 : vect_finish_stmt_generation (vinfo, stmt_info,
9158 : new_stmt, gsi);
9159 30 : if (final_mask)
9160 : {
9161 20 : new_stmt = gimple_build_assign (NULL_TREE,
9162 : VEC_UNPACK_HI_EXPR,
9163 : final_mask);
9164 20 : final_mask = make_ssa_name
9165 20 : (truth_type_for (offset_vectype));
9166 20 : gimple_set_lhs (new_stmt, final_mask);
9167 20 : vect_finish_stmt_generation (vinfo, stmt_info,
9168 : new_stmt, gsi);
9169 : }
9170 :
9171 30 : new_stmt = vect_build_one_scatter_store_call
9172 60 : (vinfo, stmt_info, slp_node, gsi, ls.gs.decl,
9173 30 : dataref_ptr, vec_offsets[2 * j + 1],
9174 : vec_oprnd, final_mask);
9175 30 : vect_finish_stmt_generation (vinfo, stmt_info,
9176 : new_stmt, gsi);
9177 30 : }
9178 46 : else if (known_eq (nunits * 2, offset_nunits))
9179 : {
9180 : /* We have an offset vector with double the number of
9181 : lanes. Select the low/high part accordingly. */
9182 46 : vec_offset = vec_offsets[j / 2];
9183 46 : if (j & 1)
9184 : {
9185 23 : int count = offset_nunits.to_constant ();
9186 23 : vec_perm_builder sel (count, count, 1);
9187 23 : sel.quick_grow (count);
9188 263 : for (int i = 0; i < count; ++i)
9189 240 : sel[i] = i | (count / 2);
9190 23 : vec_perm_indices indices (sel, 2, count);
9191 23 : tree perm_mask = vect_gen_perm_mask_checked
9192 23 : (TREE_TYPE (vec_offset), indices);
9193 23 : new_stmt = gimple_build_assign (NULL_TREE,
9194 : VEC_PERM_EXPR,
9195 : vec_offset,
9196 : vec_offset,
9197 : perm_mask);
9198 23 : vec_offset = make_ssa_name (TREE_TYPE (vec_offset));
9199 23 : gimple_set_lhs (new_stmt, vec_offset);
9200 23 : vect_finish_stmt_generation (vinfo, stmt_info,
9201 : new_stmt, gsi);
9202 23 : }
9203 :
9204 46 : new_stmt = vect_build_one_scatter_store_call
9205 46 : (vinfo, stmt_info, slp_node, gsi,
9206 : ls.gs.decl, dataref_ptr, vec_offset,
9207 : vec_oprnd, final_mask);
9208 46 : vect_finish_stmt_generation (vinfo, stmt_info,
9209 : new_stmt, gsi);
9210 : }
9211 : else
9212 0 : gcc_unreachable ();
9213 : }
9214 : else
9215 : {
9216 : /* Emulated scatter. */
9217 1529 : gcc_assert (!final_mask);
9218 1529 : if (costing_p)
9219 : {
9220 1152 : unsigned int cnunits = vect_nunits_for_cost (vectype);
9221 : /* For emulated scatter N offset vector element extracts
9222 : (we assume the scalar scaling and ptr + offset add is
9223 : consumed by the load). */
9224 1152 : inside_cost
9225 1152 : += record_stmt_cost (cost_vec, 1, vec_deconstruct,
9226 : slp_node, 0, vect_body);
9227 : /* N scalar stores plus extracting the elements. */
9228 1152 : inside_cost
9229 1152 : += record_stmt_cost (cost_vec, 1, vec_deconstruct,
9230 : slp_node, 0, vect_body);
9231 1152 : inside_cost
9232 1152 : += record_stmt_cost (cost_vec, cnunits, scalar_store,
9233 : slp_node, 0, vect_body);
9234 1152 : continue;
9235 1152 : }
9236 :
9237 377 : tree offset_vectype = TREE_TYPE (vec_offsets[0]);
9238 377 : unsigned HOST_WIDE_INT const_nunits = nunits.to_constant ();
9239 377 : unsigned HOST_WIDE_INT const_offset_nunits
9240 377 : = TYPE_VECTOR_SUBPARTS (offset_vectype).to_constant ();
9241 377 : vec<constructor_elt, va_gc> *ctor_elts;
9242 377 : vec_alloc (ctor_elts, const_nunits);
9243 377 : gimple_seq stmts = NULL;
9244 377 : tree elt_type = TREE_TYPE (vectype);
9245 377 : unsigned HOST_WIDE_INT elt_size
9246 377 : = tree_to_uhwi (TYPE_SIZE (elt_type));
9247 : /* We support offset vectors with more elements
9248 : than the data vector for now. */
9249 377 : unsigned HOST_WIDE_INT factor
9250 : = const_offset_nunits / const_nunits;
9251 377 : vec_offset = vec_offsets[j / factor];
9252 377 : unsigned elt_offset
9253 377 : = (j % factor) * const_nunits;
9254 377 : tree idx_type = TREE_TYPE (TREE_TYPE (vec_offset));
9255 377 : tree scale = size_int (SLP_TREE_GS_SCALE (slp_node));
9256 377 : tree ltype = build_aligned_type (TREE_TYPE (vectype), align);
9257 1531 : for (unsigned k = 0; k < const_nunits; ++k)
9258 : {
9259 : /* Compute the offsetted pointer. */
9260 1154 : tree boff = size_binop (MULT_EXPR, TYPE_SIZE (idx_type),
9261 : bitsize_int (k + elt_offset));
9262 1154 : tree idx
9263 2308 : = gimple_build (&stmts, BIT_FIELD_REF, idx_type,
9264 1154 : vec_offset, TYPE_SIZE (idx_type), boff);
9265 1154 : idx = gimple_convert (&stmts, sizetype, idx);
9266 1154 : idx = gimple_build (&stmts, MULT_EXPR, sizetype,
9267 : idx, scale);
9268 1154 : tree ptr
9269 1154 : = gimple_build (&stmts, PLUS_EXPR,
9270 1154 : TREE_TYPE (dataref_ptr),
9271 : dataref_ptr, idx);
9272 1154 : ptr = gimple_convert (&stmts, ptr_type_node, ptr);
9273 : /* Extract the element to be stored. */
9274 1154 : tree elt
9275 2308 : = gimple_build (&stmts, BIT_FIELD_REF,
9276 1154 : TREE_TYPE (vectype),
9277 1154 : vec_oprnd, TYPE_SIZE (elt_type),
9278 1154 : bitsize_int (k * elt_size));
9279 1154 : gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
9280 1154 : stmts = NULL;
9281 1154 : tree ref
9282 1154 : = build2 (MEM_REF, ltype, ptr,
9283 : build_int_cst (ref_type, 0));
9284 1154 : new_stmt = gimple_build_assign (ref, elt);
9285 1154 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
9286 : }
9287 :
9288 377 : slp_node->push_vec_def (new_stmt);
9289 : }
9290 : }
9291 :
9292 1460 : if (costing_p)
9293 : {
9294 987 : if (dump_enabled_p ())
9295 78 : dump_printf_loc (MSG_NOTE, vect_location,
9296 : "vect_model_store_cost: inside_cost = %d, "
9297 : "prologue_cost = %d .\n",
9298 : inside_cost, prologue_cost);
9299 987 : SLP_TREE_TYPE (slp_node) = store_vec_info_type;
9300 987 : slp_node->data = new vect_load_store_data (std::move (ls));
9301 : }
9302 :
9303 1460 : return true;
9304 1460 : }
9305 :
9306 1328568 : gcc_assert (memory_access_type == VMAT_CONTIGUOUS
9307 : || memory_access_type == VMAT_CONTIGUOUS_DOWN
9308 : || memory_access_type == VMAT_CONTIGUOUS_REVERSE);
9309 :
9310 1328568 : unsigned inside_cost = 0, prologue_cost = 0;
9311 : /* For costing some adjacent vector stores, we'd like to cost with
9312 : the total number of them once instead of cost each one by one. */
9313 1328568 : unsigned int n_adjacent_stores = 0;
9314 1328568 : auto_vec<tree> result_chain (group_size);
9315 1328568 : auto_vec<tree, 1> vec_oprnds;
9316 1328568 : gimple *new_stmt;
9317 1328568 : if (!costing_p)
9318 : {
9319 : /* Get vectorized arguments for SLP_NODE. */
9320 541333 : vect_get_slp_defs (op_node, &vec_oprnds);
9321 541333 : vec_oprnd = vec_oprnds[0];
9322 541333 : if (mask_node)
9323 : {
9324 474 : vect_get_slp_defs (mask_node, &vec_masks);
9325 474 : vec_mask = vec_masks[0];
9326 : }
9327 : }
9328 :
9329 : /* We should have caught mismatched types earlier. */
9330 541333 : gcc_assert (costing_p
9331 : || useless_type_conversion_p (vectype, TREE_TYPE (vec_oprnd)));
9332 1328568 : bool simd_lane_access_p
9333 1328568 : = STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) != 0;
9334 1328568 : if (!costing_p
9335 1328568 : && simd_lane_access_p
9336 4374 : && !loop_masks
9337 4374 : && TREE_CODE (DR_BASE_ADDRESS (first_dr_info->dr)) == ADDR_EXPR
9338 4374 : && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (first_dr_info->dr), 0))
9339 4374 : && integer_zerop (get_dr_vinfo_offset (vinfo, first_dr_info))
9340 4374 : && integer_zerop (DR_INIT (first_dr_info->dr))
9341 1332942 : && alias_sets_conflict_p (get_alias_set (aggr_type),
9342 4374 : get_alias_set (TREE_TYPE (ref_type))))
9343 : {
9344 4366 : dataref_ptr = unshare_expr (DR_BASE_ADDRESS (first_dr_info->dr));
9345 4366 : dataref_offset = build_int_cst (ref_type, 0);
9346 : }
9347 1324202 : else if (!costing_p)
9348 1073926 : dataref_ptr = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type,
9349 : simd_lane_access_p ? loop : NULL,
9350 : offset, &dummy, gsi, &ptr_incr,
9351 : simd_lane_access_p, bump);
9352 :
9353 1328568 : new_stmt = NULL;
9354 1328568 : gcc_assert (!grouped_store);
9355 2953517 : for (i = 0; i < vec_num; i++)
9356 : {
9357 1624949 : if (!costing_p)
9358 670460 : vec_oprnd = vec_oprnds[i];
9359 :
9360 1624949 : if (memory_access_type == VMAT_CONTIGUOUS_REVERSE)
9361 : {
9362 3331 : if (costing_p)
9363 2191 : inside_cost += record_stmt_cost (cost_vec, 1, vec_perm,
9364 : slp_node, 0, vect_body);
9365 : else
9366 : {
9367 1140 : tree perm_mask = perm_mask_for_reverse (vectype);
9368 1140 : tree new_temp = make_ssa_name (vectype);
9369 :
9370 : /* Generate the permute statement. */
9371 1140 : gimple *perm_stmt
9372 1140 : = gimple_build_assign (new_temp, VEC_PERM_EXPR, vec_oprnd,
9373 : vec_oprnd, perm_mask);
9374 1140 : vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
9375 :
9376 1140 : perm_stmt = SSA_NAME_DEF_STMT (new_temp);
9377 1624949 : vec_oprnd = new_temp;
9378 : }
9379 : }
9380 :
9381 1624949 : if (costing_p)
9382 : {
9383 954489 : n_adjacent_stores++;
9384 954489 : continue;
9385 : }
9386 :
9387 670460 : tree final_mask = NULL_TREE;
9388 670460 : tree final_len = NULL_TREE;
9389 670460 : tree bias = NULL_TREE;
9390 670460 : if (loop_masks)
9391 77 : final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
9392 : vec_num, vectype, i);
9393 670460 : if (vec_mask)
9394 695 : vec_mask = vec_masks[i];
9395 695 : if (vec_mask)
9396 695 : final_mask = prepare_vec_mask (loop_vinfo, mask_vectype, final_mask,
9397 : vec_mask, gsi);
9398 :
9399 670460 : if (i > 0)
9400 : /* Bump the vector pointer. */
9401 129127 : dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
9402 : stmt_info, bump);
9403 :
9404 670460 : unsigned misalign;
9405 670460 : unsigned HOST_WIDE_INT align;
9406 670460 : align = known_alignment (DR_TARGET_ALIGNMENT (first_dr_info));
9407 670460 : if (alignment_support_scheme == dr_aligned)
9408 : misalign = 0;
9409 308368 : else if (misalignment == DR_MISALIGNMENT_UNKNOWN)
9410 : {
9411 160848 : align = dr_alignment (vect_dr_behavior (vinfo, first_dr_info));
9412 160848 : misalign = 0;
9413 : }
9414 : else
9415 147520 : misalign = misalignment;
9416 670460 : if (dataref_offset == NULL_TREE
9417 665080 : && TREE_CODE (dataref_ptr) == SSA_NAME)
9418 182323 : set_ptr_info_alignment (get_ptr_info (dataref_ptr), align, misalign);
9419 670460 : align = least_bit_hwi (misalign | align);
9420 :
9421 : /* Compute IFN when LOOP_LENS or final_mask valid. */
9422 670460 : machine_mode vmode = TYPE_MODE (vectype);
9423 670460 : machine_mode new_vmode = vmode;
9424 670460 : internal_fn partial_ifn = IFN_LAST;
9425 670460 : if (loop_lens)
9426 : {
9427 0 : opt_machine_mode new_ovmode
9428 0 : = get_len_load_store_mode (vmode, false, &partial_ifn);
9429 0 : new_vmode = new_ovmode.require ();
9430 0 : unsigned factor
9431 0 : = (new_ovmode == vmode) ? 1 : GET_MODE_UNIT_SIZE (vmode);
9432 0 : final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
9433 : vec_num, vectype, i, factor, true);
9434 : }
9435 670460 : else if (final_mask)
9436 : {
9437 707 : if (!can_vec_mask_load_store_p (vmode,
9438 707 : TYPE_MODE (TREE_TYPE (final_mask)),
9439 : false, &partial_ifn))
9440 0 : gcc_unreachable ();
9441 : }
9442 :
9443 670460 : if (partial_ifn == IFN_MASK_LEN_STORE)
9444 : {
9445 0 : if (!final_len)
9446 : {
9447 : /* Pass VF value to 'len' argument of
9448 : MASK_LEN_STORE if LOOP_LENS is invalid. */
9449 0 : final_len = size_int (TYPE_VECTOR_SUBPARTS (vectype));
9450 : }
9451 0 : if (!final_mask)
9452 : {
9453 : /* Pass all ones value to 'mask' argument of
9454 : MASK_LEN_STORE if final_mask is invalid. */
9455 0 : mask_vectype = truth_type_for (vectype);
9456 0 : final_mask = build_minus_one_cst (mask_vectype);
9457 : }
9458 : }
9459 670460 : if (final_len)
9460 : {
9461 0 : signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
9462 0 : bias = build_int_cst (intQI_type_node, biasval);
9463 : }
9464 :
9465 : /* Arguments are ready. Create the new vector stmt. */
9466 670460 : if (final_len)
9467 : {
9468 0 : gcall *call;
9469 0 : tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
9470 : /* Need conversion if it's wrapped with VnQI. */
9471 0 : if (vmode != new_vmode)
9472 : {
9473 0 : tree new_vtype
9474 0 : = build_vector_type_for_mode (unsigned_intQI_type_node,
9475 : new_vmode);
9476 0 : tree var = vect_get_new_ssa_name (new_vtype, vect_simple_var);
9477 0 : vec_oprnd = build1 (VIEW_CONVERT_EXPR, new_vtype, vec_oprnd);
9478 0 : gassign *new_stmt
9479 0 : = gimple_build_assign (var, VIEW_CONVERT_EXPR, vec_oprnd);
9480 0 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
9481 0 : vec_oprnd = var;
9482 : }
9483 :
9484 0 : if (partial_ifn == IFN_MASK_LEN_STORE)
9485 0 : call = gimple_build_call_internal (IFN_MASK_LEN_STORE, 6,
9486 : dataref_ptr, ptr, final_mask,
9487 : final_len, bias, vec_oprnd);
9488 : else
9489 0 : call = gimple_build_call_internal (IFN_LEN_STORE, 5,
9490 : dataref_ptr, ptr, final_len,
9491 : bias, vec_oprnd);
9492 0 : gimple_call_set_nothrow (call, true);
9493 0 : vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
9494 0 : new_stmt = call;
9495 : }
9496 670460 : else if (final_mask)
9497 : {
9498 707 : tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
9499 707 : gcall *call
9500 707 : = gimple_build_call_internal (IFN_MASK_STORE, 4, dataref_ptr,
9501 : ptr, final_mask, vec_oprnd);
9502 707 : gimple_call_set_nothrow (call, true);
9503 707 : vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
9504 707 : new_stmt = call;
9505 : }
9506 : else
9507 : {
9508 669753 : data_ref = fold_build2 (MEM_REF, vectype, dataref_ptr,
9509 : dataref_offset ? dataref_offset
9510 : : build_int_cst (ref_type, 0));
9511 669753 : if (alignment_support_scheme == dr_aligned
9512 669753 : && align >= TYPE_ALIGN_UNIT (vectype))
9513 : ;
9514 : else
9515 307831 : TREE_TYPE (data_ref)
9516 615662 : = build_aligned_type (TREE_TYPE (data_ref),
9517 : align * BITS_PER_UNIT);
9518 669753 : vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
9519 669753 : new_stmt = gimple_build_assign (data_ref, vec_oprnd);
9520 669753 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
9521 : }
9522 : }
9523 :
9524 1328568 : if (costing_p)
9525 : {
9526 787235 : if (n_adjacent_stores > 0)
9527 787235 : vect_get_store_cost (vinfo, stmt_info, slp_node, n_adjacent_stores,
9528 : alignment_support_scheme, misalignment,
9529 : &inside_cost, cost_vec);
9530 :
9531 : /* When vectorizing a store into the function result assign
9532 : a penalty if the function returns in a multi-register location.
9533 : In this case we assume we'll end up with having to spill the
9534 : vector result and do piecewise loads as a conservative estimate. */
9535 787235 : tree base = get_base_address (STMT_VINFO_DATA_REF (stmt_info)->ref);
9536 787235 : if (base
9537 787235 : && (TREE_CODE (base) == RESULT_DECL
9538 736903 : || (DECL_P (base) && cfun_returns (base)))
9539 849239 : && !aggregate_value_p (base, cfun->decl))
9540 : {
9541 11073 : rtx reg = hard_function_value (TREE_TYPE (base), cfun->decl, 0, 1);
9542 : /* ??? Handle PARALLEL in some way. */
9543 11073 : if (REG_P (reg))
9544 : {
9545 10869 : int nregs = hard_regno_nregs (REGNO (reg), GET_MODE (reg));
9546 : /* Assume that a single reg-reg move is possible and cheap,
9547 : do not account for vector to gp register move cost. */
9548 10869 : if (nregs > 1)
9549 : {
9550 : /* Spill. */
9551 10038 : prologue_cost
9552 10038 : += record_stmt_cost (cost_vec, 1, vector_store,
9553 : slp_node, 0, vect_epilogue);
9554 : /* Loads. */
9555 10038 : prologue_cost
9556 10038 : += record_stmt_cost (cost_vec, nregs, scalar_load,
9557 : slp_node, 0, vect_epilogue);
9558 : }
9559 : }
9560 : }
9561 787235 : if (dump_enabled_p ())
9562 13849 : dump_printf_loc (MSG_NOTE, vect_location,
9563 : "vect_model_store_cost: inside_cost = %d, "
9564 : "prologue_cost = %d .\n",
9565 : inside_cost, prologue_cost);
9566 :
9567 787235 : SLP_TREE_TYPE (slp_node) = store_vec_info_type;
9568 787235 : slp_node->data = new vect_load_store_data (std::move (ls));
9569 : }
9570 :
9571 1328568 : return true;
9572 2689641 : }
9573 :
9574 : /* Given a vector type VECTYPE, turns permutation SEL into the equivalent
9575 : VECTOR_CST mask. No checks are made that the target platform supports the
9576 : mask, so callers may wish to test can_vec_perm_const_p separately, or use
9577 : vect_gen_perm_mask_checked. */
9578 :
9579 : tree
9580 61841 : vect_gen_perm_mask_any (tree vectype, const vec_perm_indices &sel)
9581 : {
9582 61841 : tree mask_type;
9583 :
9584 61841 : poly_uint64 nunits = sel.length ();
9585 61841 : gcc_assert (known_eq (nunits, TYPE_VECTOR_SUBPARTS (vectype)));
9586 :
9587 61841 : mask_type = build_vector_type (ssizetype, nunits);
9588 61841 : return vec_perm_indices_to_tree (mask_type, sel);
9589 : }
9590 :
9591 : /* Checked version of vect_gen_perm_mask_any. Asserts can_vec_perm_const_p,
9592 : i.e. that the target supports the pattern _for arbitrary input vectors_. */
9593 :
9594 : tree
9595 58978 : vect_gen_perm_mask_checked (tree vectype, const vec_perm_indices &sel)
9596 : {
9597 58978 : machine_mode vmode = TYPE_MODE (vectype);
9598 58978 : gcc_assert (can_vec_perm_const_p (vmode, vmode, sel));
9599 58978 : return vect_gen_perm_mask_any (vectype, sel);
9600 : }
9601 :
9602 : /* Given a vector variable X and Y, that was generated for the scalar
9603 : STMT_INFO, generate instructions to permute the vector elements of X and Y
9604 : using permutation mask MASK_VEC, insert them at *GSI and return the
9605 : permuted vector variable. */
9606 :
9607 : static tree
9608 1445 : permute_vec_elements (vec_info *vinfo,
9609 : tree x, tree y, tree mask_vec, stmt_vec_info stmt_info,
9610 : gimple_stmt_iterator *gsi)
9611 : {
9612 1445 : tree vectype = TREE_TYPE (x);
9613 1445 : tree perm_dest, data_ref;
9614 1445 : gimple *perm_stmt;
9615 :
9616 1445 : tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
9617 1445 : if (scalar_dest && TREE_CODE (scalar_dest) == SSA_NAME)
9618 1445 : perm_dest = vect_create_destination_var (scalar_dest, vectype);
9619 : else
9620 0 : perm_dest = vect_get_new_vect_var (vectype, vect_simple_var, NULL);
9621 1445 : data_ref = make_ssa_name (perm_dest);
9622 :
9623 : /* Generate the permute statement. */
9624 1445 : perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, x, y, mask_vec);
9625 1445 : vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt, gsi);
9626 :
9627 1445 : return data_ref;
9628 : }
9629 :
9630 : /* Hoist the definitions of all SSA uses on STMT_INFO out of the loop LOOP,
9631 : inserting them on the loops preheader edge. Returns true if we
9632 : were successful in doing so (and thus STMT_INFO can be moved then),
9633 : otherwise returns false. HOIST_P indicates if we want to hoist the
9634 : definitions of all SSA uses, it would be false when we are costing. */
9635 :
9636 : static bool
9637 4024 : hoist_defs_of_uses (gimple *stmt, class loop *loop, bool hoist_p)
9638 : {
9639 4024 : ssa_op_iter i;
9640 4024 : use_operand_p use_p;
9641 4024 : auto_vec<use_operand_p, 8> to_hoist;
9642 :
9643 7645 : FOR_EACH_SSA_USE_OPERAND (use_p, stmt, i, SSA_OP_USE)
9644 : {
9645 3649 : gimple *def_stmt = SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p));
9646 3649 : if (!gimple_nop_p (def_stmt)
9647 3649 : && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
9648 : {
9649 : /* Make sure we don't need to recurse. While we could do
9650 : so in simple cases when there are more complex use webs
9651 : we don't have an easy way to preserve stmt order to fulfil
9652 : dependencies within them. */
9653 111 : tree op2;
9654 111 : ssa_op_iter i2;
9655 111 : if (gimple_code (def_stmt) == GIMPLE_PHI
9656 111 : || (single_ssa_def_operand (def_stmt, SSA_OP_DEF)
9657 : == NULL_DEF_OPERAND_P))
9658 28 : return false;
9659 226 : FOR_EACH_SSA_TREE_OPERAND (op2, def_stmt, i2, SSA_OP_USE)
9660 : {
9661 143 : gimple *def_stmt2 = SSA_NAME_DEF_STMT (op2);
9662 143 : if (!gimple_nop_p (def_stmt2)
9663 143 : && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt2)))
9664 : return false;
9665 : }
9666 83 : to_hoist.safe_push (use_p);
9667 : }
9668 : }
9669 :
9670 7992 : if (to_hoist.is_empty ())
9671 : return true;
9672 :
9673 59 : if (!hoist_p)
9674 : return true;
9675 :
9676 : /* Instead of moving defs we copy them so we can zero their UID to not
9677 : confuse dominance queries in the preheader. */
9678 9 : gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
9679 36 : for (use_operand_p use_p : to_hoist)
9680 : {
9681 9 : gimple *def_stmt = SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p));
9682 9 : gimple *copy = gimple_copy (def_stmt);
9683 9 : gimple_set_uid (copy, 0);
9684 9 : def_operand_p def_p = single_ssa_def_operand (def_stmt, SSA_OP_DEF);
9685 9 : tree new_def = duplicate_ssa_name (DEF_FROM_PTR (def_p), copy);
9686 9 : update_stmt (copy);
9687 9 : def_p = single_ssa_def_operand (copy, SSA_OP_DEF);
9688 9 : SET_DEF (def_p, new_def);
9689 9 : SET_USE (use_p, new_def);
9690 9 : gsi_insert_before (&gsi, copy, GSI_SAME_STMT);
9691 : }
9692 :
9693 : return true;
9694 4024 : }
9695 :
9696 : /* vectorizable_load.
9697 :
9698 : Check if STMT_INFO reads a non scalar data-ref (array/pointer/structure)
9699 : that can be vectorized.
9700 : If COST_VEC is passed, calculate costs but don't change anything,
9701 : otherwise, vectorize STMT_INFO: create a vectorized stmt to replace
9702 : it, and insert it at GSI.
9703 : Return true if STMT_INFO is vectorizable in this way. */
9704 :
9705 : static bool
9706 2133523 : vectorizable_load (vec_info *vinfo,
9707 : stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
9708 : slp_tree slp_node,
9709 : stmt_vector_for_cost *cost_vec)
9710 : {
9711 2133523 : tree scalar_dest;
9712 2133523 : tree vec_dest = NULL;
9713 2133523 : tree data_ref = NULL;
9714 2133523 : loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
9715 2133523 : class loop *loop = NULL;
9716 2133523 : class loop *containing_loop = gimple_bb (stmt_info->stmt)->loop_father;
9717 2133523 : bool nested_in_vect_loop = false;
9718 2133523 : tree elem_type;
9719 : /* Avoid false positive uninitialized warning, see PR110652. */
9720 2133523 : tree new_temp = NULL_TREE;
9721 2133523 : machine_mode mode;
9722 2133523 : tree dummy;
9723 2133523 : tree dataref_ptr = NULL_TREE;
9724 2133523 : tree dataref_offset = NULL_TREE;
9725 2133523 : gimple *ptr_incr = NULL;
9726 2133523 : int i, j;
9727 2133523 : unsigned int group_size;
9728 2133523 : poly_uint64 group_gap_adj;
9729 2133523 : tree msq = NULL_TREE, lsq;
9730 2133523 : tree realignment_token = NULL_TREE;
9731 2133523 : gphi *phi = NULL;
9732 2133523 : bool grouped_load = false;
9733 2133523 : stmt_vec_info first_stmt_info;
9734 2133523 : stmt_vec_info first_stmt_info_for_drptr = NULL;
9735 2133523 : bool compute_in_loop = false;
9736 2133523 : class loop *at_loop;
9737 2133523 : int vec_num;
9738 2133523 : bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
9739 2133523 : poly_uint64 vf;
9740 2133523 : tree aggr_type;
9741 2133523 : tree ref_type;
9742 2133523 : enum vect_def_type mask_dt = vect_unknown_def_type;
9743 2133523 : enum vect_def_type els_dt = vect_unknown_def_type;
9744 :
9745 2133523 : if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
9746 : return false;
9747 :
9748 2133523 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
9749 234683 : && cost_vec)
9750 : return false;
9751 :
9752 1898840 : if (!STMT_VINFO_DATA_REF (stmt_info))
9753 : return false;
9754 :
9755 1523624 : tree mask_vectype = NULL_TREE;
9756 1523624 : tree els = NULL_TREE; tree els_vectype = NULL_TREE;
9757 :
9758 1523624 : int mask_index = -1;
9759 1523624 : int els_index = -1;
9760 1523624 : slp_tree mask_node = NULL;
9761 1523624 : slp_tree els_op = NULL;
9762 1523624 : if (gassign *assign = dyn_cast <gassign *> (stmt_info->stmt))
9763 : {
9764 1519173 : scalar_dest = gimple_assign_lhs (assign);
9765 1519173 : if (TREE_CODE (scalar_dest) != SSA_NAME)
9766 : return false;
9767 :
9768 705145 : tree_code code = gimple_assign_rhs_code (assign);
9769 705145 : if (code != ARRAY_REF
9770 705145 : && code != BIT_FIELD_REF
9771 705145 : && code != INDIRECT_REF
9772 486656 : && code != COMPONENT_REF
9773 486656 : && code != IMAGPART_EXPR
9774 351053 : && code != REALPART_EXPR
9775 351053 : && code != MEM_REF
9776 285 : && TREE_CODE_CLASS (code) != tcc_declaration)
9777 : return false;
9778 : }
9779 : else
9780 : {
9781 1430128 : gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
9782 4451 : if (!call || !gimple_call_internal_p (call))
9783 : return false;
9784 :
9785 4451 : internal_fn ifn = gimple_call_internal_fn (call);
9786 4451 : if (!internal_load_fn_p (ifn))
9787 : return false;
9788 :
9789 3096 : scalar_dest = gimple_call_lhs (call);
9790 3096 : if (!scalar_dest)
9791 : return false;
9792 :
9793 3096 : mask_index = internal_fn_mask_index (ifn);
9794 3096 : if (mask_index >= 0)
9795 3096 : mask_index = vect_slp_child_index_for_operand (stmt_info, mask_index);
9796 3096 : if (mask_index >= 0
9797 3096 : && !vect_check_scalar_mask (vinfo, slp_node, mask_index,
9798 : &mask_node, &mask_dt, &mask_vectype))
9799 : return false;
9800 :
9801 3096 : els_index = internal_fn_else_index (ifn);
9802 3096 : if (els_index >= 0)
9803 3096 : els_index = vect_slp_child_index_for_operand (stmt_info, els_index);
9804 3096 : if (els_index >= 0
9805 3096 : && !vect_is_simple_use (vinfo, slp_node, els_index,
9806 : &els, &els_op, &els_dt, &els_vectype))
9807 : return false;
9808 : }
9809 :
9810 708174 : tree vectype = SLP_TREE_VECTYPE (slp_node);
9811 708174 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9812 :
9813 708174 : if (loop_vinfo)
9814 : {
9815 494822 : loop = LOOP_VINFO_LOOP (loop_vinfo);
9816 494822 : nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
9817 494822 : vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9818 : }
9819 : else
9820 : vf = 1;
9821 :
9822 708174 : vec_num = vect_get_num_copies (vinfo, slp_node);
9823 :
9824 : /* FORNOW. This restriction should be relaxed. */
9825 708174 : if (nested_in_vect_loop && vec_num > 1)
9826 : {
9827 316 : if (dump_enabled_p ())
9828 66 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9829 : "multiple types in nested loop.\n");
9830 316 : return false;
9831 : }
9832 :
9833 707858 : elem_type = TREE_TYPE (vectype);
9834 707858 : mode = TYPE_MODE (vectype);
9835 :
9836 : /* FORNOW. In some cases can vectorize even if data-type not supported
9837 : (e.g. - data copies). */
9838 707858 : if (!can_implement_p (mov_optab, mode))
9839 : {
9840 0 : if (dump_enabled_p ())
9841 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9842 : "Aligned load, but unsupported type.\n");
9843 0 : return false;
9844 : }
9845 :
9846 : /* Check if the load is a part of an interleaving chain. */
9847 707858 : if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
9848 : {
9849 307918 : grouped_load = true;
9850 : /* FORNOW */
9851 307918 : gcc_assert (!nested_in_vect_loop);
9852 307918 : gcc_assert (!STMT_VINFO_GATHER_SCATTER_P (stmt_info));
9853 :
9854 307918 : first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
9855 307918 : group_size = DR_GROUP_SIZE (first_stmt_info);
9856 :
9857 : /* Invalidate assumptions made by dependence analysis when vectorization
9858 : on the unrolled body effectively re-orders stmts. */
9859 307918 : if (STMT_VINFO_MIN_NEG_DIST (stmt_info) != 0
9860 307918 : && maybe_gt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
9861 : STMT_VINFO_MIN_NEG_DIST (stmt_info)))
9862 : {
9863 12 : if (dump_enabled_p ())
9864 12 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9865 : "cannot perform implicit CSE when performing "
9866 : "group loads with negative dependence distance\n");
9867 12 : return false;
9868 : }
9869 : }
9870 : else
9871 : group_size = 1;
9872 :
9873 707846 : vect_load_store_data _ls_data{};
9874 707846 : vect_load_store_data &ls = slp_node->get_data (_ls_data);
9875 707846 : if (cost_vec
9876 707846 : && !get_load_store_type (vinfo, stmt_info, vectype, slp_node, mask_node,
9877 : VLS_LOAD, &ls))
9878 : return false;
9879 : /* Temporary aliases to analysis data, should not be modified through
9880 : these. */
9881 602405 : const vect_memory_access_type memory_access_type = ls.memory_access_type;
9882 602405 : const dr_alignment_support alignment_support_scheme
9883 : = ls.alignment_support_scheme;
9884 602405 : const int misalignment = ls.misalignment;
9885 602405 : const poly_int64 poffset = ls.poffset;
9886 602405 : const vec<int> &elsvals = ls.elsvals;
9887 :
9888 602405 : int maskload_elsval = 0;
9889 602405 : bool need_zeroing = false;
9890 :
9891 : /* We might need to explicitly zero inactive elements if there are
9892 : padding bits in the type that might leak otherwise.
9893 : Refer to PR115336. */
9894 602405 : tree scalar_type = TREE_TYPE (scalar_dest);
9895 602405 : bool type_mode_padding_p
9896 1204810 : = TYPE_PRECISION (scalar_type) < GET_MODE_PRECISION (GET_MODE_INNER (mode));
9897 :
9898 602405 : if (slp_node->ldst_lanes
9899 0 : && memory_access_type != VMAT_LOAD_STORE_LANES)
9900 : {
9901 0 : if (dump_enabled_p ())
9902 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9903 : "discovered load-lane but cannot use it.\n");
9904 0 : return false;
9905 : }
9906 :
9907 602405 : if (mask_node)
9908 : {
9909 2966 : if (memory_access_type == VMAT_CONTIGUOUS)
9910 : {
9911 2100 : machine_mode vec_mode = TYPE_MODE (vectype);
9912 721 : if (!VECTOR_MODE_P (vec_mode)
9913 4200 : || !can_vec_mask_load_store_p (vec_mode,
9914 2100 : TYPE_MODE (mask_vectype),
9915 : true, NULL, &ls.elsvals))
9916 351 : return false;
9917 : }
9918 866 : else if (memory_access_type == VMAT_ELEMENTWISE
9919 866 : || memory_access_type == VMAT_STRIDED_SLP)
9920 : {
9921 0 : if (dump_enabled_p ())
9922 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9923 : "unsupported masked strided access.\n");
9924 0 : return false;
9925 : }
9926 866 : else if (memory_access_type != VMAT_LOAD_STORE_LANES
9927 866 : && !mat_gather_scatter_p (memory_access_type))
9928 : {
9929 62 : if (dump_enabled_p ())
9930 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9931 : "unsupported access type for masked load.\n");
9932 62 : return false;
9933 : }
9934 804 : else if (memory_access_type == VMAT_GATHER_SCATTER_EMULATED)
9935 : {
9936 482 : if (dump_enabled_p ())
9937 28 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9938 : "unsupported masked emulated gather.\n");
9939 482 : return false;
9940 : }
9941 : }
9942 :
9943 601510 : bool costing_p = cost_vec;
9944 :
9945 601510 : if (costing_p) /* transformation not required. */
9946 : {
9947 434833 : if (loop_vinfo
9948 313803 : && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
9949 212288 : check_load_store_for_partial_vectors (loop_vinfo, vectype, slp_node,
9950 : VLS_LOAD, group_size, &ls,
9951 : mask_node, &ls.elsvals);
9952 :
9953 : /* If the type needs padding we must zero inactive elements.
9954 : Check if we can do that with a VEC_COND_EXPR and store the
9955 : elsval we choose in MASKLOAD_ELSVAL. */
9956 434833 : if (ls.elsvals.length ()
9957 60123 : && type_mode_padding_p
9958 7 : && !ls.elsvals.contains (MASK_LOAD_ELSE_ZERO)
9959 60123 : && !expand_vec_cond_expr_p (vectype, truth_type_for (vectype)))
9960 : {
9961 0 : if (dump_enabled_p ())
9962 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9963 : "cannot zero inactive elements.\n");
9964 0 : return false;
9965 : }
9966 :
9967 434833 : if (mask_node
9968 434833 : && !vect_maybe_update_slp_op_vectype (mask_node,
9969 : mask_vectype))
9970 : {
9971 0 : if (dump_enabled_p ())
9972 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9973 : "incompatible vector types for invariants\n");
9974 0 : return false;
9975 : }
9976 :
9977 434833 : if (dump_enabled_p ()
9978 25467 : && memory_access_type != VMAT_ELEMENTWISE
9979 25356 : && !mat_gather_scatter_p (memory_access_type)
9980 25041 : && memory_access_type != VMAT_STRIDED_SLP
9981 25041 : && memory_access_type != VMAT_INVARIANT
9982 458941 : && alignment_support_scheme != dr_aligned)
9983 9923 : dump_printf_loc (MSG_NOTE, vect_location,
9984 : "Vectorizing an unaligned access.\n");
9985 :
9986 434833 : if (memory_access_type == VMAT_LOAD_STORE_LANES)
9987 0 : vinfo->any_known_not_updated_vssa = true;
9988 : }
9989 :
9990 : /* For now just use the first available else value.
9991 : get_supported_else_vals tries MASK_LOAD_ELSE_ZERO first so we will
9992 : select it here if it is supported. */
9993 601510 : if (elsvals.length ())
9994 83242 : maskload_elsval = *elsvals.begin ();
9995 :
9996 601510 : if (dump_enabled_p () && !costing_p)
9997 16661 : dump_printf_loc (MSG_NOTE, vect_location, "transform load.\n");
9998 :
9999 : /* Transform. */
10000 :
10001 601510 : dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info), *first_dr_info = NULL;
10002 601510 : ensure_base_align (dr_info);
10003 :
10004 601510 : if (memory_access_type == VMAT_INVARIANT)
10005 : {
10006 4124 : gcc_assert (!grouped_load && !mask_node && !bb_vinfo);
10007 : /* If we have versioned for aliasing or the loop doesn't
10008 : have any data dependencies that would preclude this,
10009 : then we are sure this is a loop invariant load and
10010 : thus we can insert it on the preheader edge.
10011 : TODO: hoist_defs_of_uses should ideally be computed
10012 : once at analysis time, remembered and used in the
10013 : transform time. */
10014 8248 : bool hoist_p = (LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo)
10015 4124 : && !nested_in_vect_loop);
10016 :
10017 4124 : bool uniform_p = true;
10018 17270 : for (stmt_vec_info sinfo : SLP_TREE_SCALAR_STMTS (slp_node))
10019 : {
10020 : /* It is unsafe to hoist a conditional load over the conditions that
10021 : make it valid. With early breaks this means that any invariant load
10022 : can't be hoisted unless it's in the loop header or if we know
10023 : something else has verified the load is valid to do. Alignment
10024 : peeling would do this since getting through the prologue means the
10025 : load was done at least once and so the vector main body is free to
10026 : hoist it. However today GCC will hoist the load above the PFA
10027 : loop. As such that makes it still invalid and so we can't allow it
10028 : today. */
10029 4898 : if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
10030 1052 : && !DR_SCALAR_KNOWN_BOUNDS (STMT_VINFO_DR_INFO (sinfo))
10031 5918 : && gimple_bb (STMT_VINFO_STMT (vect_orig_stmt (sinfo)))
10032 1020 : != loop->header)
10033 : {
10034 920 : if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
10035 920 : && dump_enabled_p ())
10036 6 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10037 : "not hoisting invariant load due to early break"
10038 : "constraints\n");
10039 914 : else if (dump_enabled_p ())
10040 16 : dump_printf_loc (MSG_NOTE, vect_location,
10041 : "not hoisting invariant load due to early break"
10042 : "constraints\n");
10043 : hoist_p = false;
10044 : }
10045 :
10046 3978 : hoist_p = hoist_p && hoist_defs_of_uses (sinfo->stmt, loop, false);
10047 4898 : if (sinfo != SLP_TREE_SCALAR_STMTS (slp_node)[0])
10048 279 : uniform_p = false;
10049 : }
10050 4124 : if (costing_p)
10051 : {
10052 3289 : if (!uniform_p && (!hoist_p || !vf.is_constant ()))
10053 : {
10054 0 : if (dump_enabled_p ())
10055 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10056 : "not vectorizing non-uniform invariant "
10057 : "load\n");
10058 0 : return false;
10059 : }
10060 1433 : enum vect_cost_model_location cost_loc
10061 3289 : = hoist_p ? vect_prologue : vect_body;
10062 3289 : unsigned int cost = record_stmt_cost (cost_vec, 1, scalar_load,
10063 : slp_node, 0, cost_loc);
10064 3289 : cost += record_stmt_cost (cost_vec, 1, scalar_to_vec,
10065 : slp_node, 0, cost_loc);
10066 3289 : unsigned int prologue_cost = hoist_p ? cost : 0;
10067 1433 : unsigned int inside_cost = hoist_p ? 0 : cost;
10068 3289 : if (dump_enabled_p ())
10069 546 : dump_printf_loc (MSG_NOTE, vect_location,
10070 : "vect_model_load_cost: inside_cost = %d, "
10071 : "prologue_cost = %d .\n",
10072 : inside_cost, prologue_cost);
10073 3289 : SLP_TREE_TYPE (slp_node) = load_vec_info_type;
10074 3289 : slp_node->data = new vect_load_store_data (std::move (ls));
10075 3289 : return true;
10076 : }
10077 835 : if (hoist_p)
10078 : {
10079 : /* ??? For non-uniform lanes there could be still duplicates.
10080 : We're leaving those to post-vectorizer CSE for the moment. */
10081 638 : auto_vec<tree> scalar_defs (SLP_TREE_LANES (slp_node));
10082 2055 : for (stmt_vec_info sinfo : SLP_TREE_SCALAR_STMTS (slp_node))
10083 : {
10084 728 : gassign *stmt = as_a <gassign *> (sinfo->stmt);
10085 728 : if (dump_enabled_p ())
10086 352 : dump_printf_loc (MSG_NOTE, vect_location,
10087 : "hoisting out of the vectorized loop: %G",
10088 : (gimple *) stmt);
10089 728 : scalar_dest = copy_ssa_name (gimple_assign_lhs (stmt));
10090 728 : tree rhs = unshare_expr (gimple_assign_rhs1 (stmt));
10091 728 : edge pe = loop_preheader_edge (loop);
10092 728 : gphi *vphi = get_virtual_phi (loop->header);
10093 728 : tree vuse;
10094 728 : if (vphi)
10095 722 : vuse = PHI_ARG_DEF_FROM_EDGE (vphi, pe);
10096 : else
10097 6 : vuse = gimple_vuse (gsi_stmt (*gsi));
10098 728 : gimple *new_stmt = gimple_build_assign (scalar_dest, rhs);
10099 728 : gimple_set_vuse (new_stmt, vuse);
10100 728 : gsi_insert_on_edge_immediate (pe, new_stmt);
10101 728 : hoist_defs_of_uses (new_stmt, loop, true);
10102 728 : if (!useless_type_conversion_p (TREE_TYPE (vectype),
10103 728 : TREE_TYPE (scalar_dest)))
10104 : {
10105 12 : tree tem = make_ssa_name (TREE_TYPE (vectype));
10106 12 : new_stmt = gimple_build_assign (tem,
10107 : NOP_EXPR, scalar_dest);
10108 12 : gsi_insert_on_edge_immediate (pe, new_stmt);
10109 12 : scalar_dest = tem;
10110 : }
10111 728 : scalar_defs.quick_push (scalar_dest);
10112 728 : if (uniform_p)
10113 : break;
10114 : }
10115 638 : if (!uniform_p)
10116 : {
10117 51 : unsigned const_nunits
10118 51 : = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
10119 116 : for (j = 0; j < (int) vec_num; ++j)
10120 : {
10121 65 : vec<constructor_elt, va_gc> *v = NULL;
10122 65 : vec_safe_reserve (v, const_nunits, true);
10123 369 : for (unsigned i = 0; i < const_nunits; ++i)
10124 : {
10125 304 : unsigned def_idx
10126 304 : = (j * const_nunits + i) % SLP_TREE_LANES (slp_node);
10127 304 : CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
10128 : scalar_defs[def_idx]);
10129 : }
10130 65 : scalar_dest = build_constructor (vectype, v);
10131 65 : new_temp = vect_init_vector (vinfo, stmt_info, scalar_dest,
10132 : vectype, NULL);
10133 65 : slp_node->push_vec_def (new_temp);
10134 : }
10135 51 : return true;
10136 : }
10137 587 : new_temp = vect_init_vector (vinfo, stmt_info, scalar_dest,
10138 : vectype, NULL);
10139 638 : }
10140 : else
10141 : {
10142 197 : gcc_assert (uniform_p);
10143 197 : gimple_stmt_iterator gsi2 = *gsi;
10144 197 : gsi_next (&gsi2);
10145 197 : new_temp = vect_init_vector (vinfo, stmt_info, scalar_dest,
10146 : vectype, &gsi2);
10147 : }
10148 1642 : for (j = 0; j < (int) vec_num; ++j)
10149 858 : slp_node->push_vec_def (new_temp);
10150 : return true;
10151 : }
10152 :
10153 597386 : if (memory_access_type == VMAT_ELEMENTWISE
10154 597386 : || memory_access_type == VMAT_STRIDED_SLP)
10155 : {
10156 23519 : gimple_stmt_iterator incr_gsi;
10157 23519 : bool insert_after;
10158 23519 : tree offvar = NULL_TREE;
10159 23519 : tree ivstep;
10160 23519 : tree running_off;
10161 23519 : vec<constructor_elt, va_gc> *v = NULL;
10162 23519 : tree stride_base, stride_step = NULL_TREE, alias_off;
10163 : /* Checked by get_load_store_type. */
10164 23519 : unsigned int const_nunits = nunits.to_constant ();
10165 23519 : unsigned HOST_WIDE_INT cst_offset = 0;
10166 23519 : tree dr_offset;
10167 23519 : unsigned int inside_cost = 0;
10168 :
10169 23519 : gcc_assert (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo));
10170 23519 : gcc_assert (!nested_in_vect_loop);
10171 :
10172 23519 : if (grouped_load)
10173 : {
10174 : /* If we elided a consecutive load permutation, don't
10175 : use the original first statement (which could be elided)
10176 : but the one the load permutation starts with.
10177 : This ensures the stride_base below is correct. */
10178 10734 : if (!ls.subchain_p)
10179 10690 : first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
10180 : else
10181 44 : first_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[0];
10182 10734 : first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
10183 10734 : ref_type = get_group_alias_ptr_type (first_stmt_info);
10184 : }
10185 : else
10186 : {
10187 12785 : first_stmt_info = stmt_info;
10188 12785 : first_dr_info = dr_info;
10189 12785 : ref_type = reference_alias_ptr_type (DR_REF (dr_info->dr));
10190 : }
10191 :
10192 23519 : if (grouped_load)
10193 : {
10194 10734 : if (memory_access_type == VMAT_STRIDED_SLP)
10195 : {
10196 : /* If we elided a consecutive load permutation, adjust
10197 : the group size here. */
10198 4217 : if (!ls.subchain_p)
10199 4173 : group_size = DR_GROUP_SIZE (first_stmt_info);
10200 : else
10201 44 : group_size = SLP_TREE_LANES (slp_node);
10202 : }
10203 : else /* VMAT_ELEMENTWISE */
10204 6517 : group_size = SLP_TREE_LANES (slp_node);
10205 : }
10206 : else
10207 : group_size = 1;
10208 :
10209 23519 : if (!costing_p)
10210 : {
10211 3430 : dr_offset = get_dr_vinfo_offset (vinfo, first_dr_info);
10212 3430 : stride_base = fold_build_pointer_plus (
10213 : DR_BASE_ADDRESS (first_dr_info->dr),
10214 : size_binop (PLUS_EXPR, convert_to_ptrofftype (dr_offset),
10215 : convert_to_ptrofftype (DR_INIT (first_dr_info->dr))));
10216 3430 : stride_step = fold_convert (sizetype, DR_STEP (first_dr_info->dr));
10217 :
10218 : /* For a load with loop-invariant (but other than power-of-2)
10219 : stride (i.e. not a grouped access) like so:
10220 :
10221 : for (i = 0; i < n; i += stride)
10222 : ... = array[i];
10223 :
10224 : we generate a new induction variable and new accesses to
10225 : form a new vector (or vectors, depending on ncopies):
10226 :
10227 : for (j = 0; ; j += VF*stride)
10228 : tmp1 = array[j];
10229 : tmp2 = array[j + stride];
10230 : ...
10231 : vectemp = {tmp1, tmp2, ...}
10232 : */
10233 :
10234 3430 : ivstep = fold_build2 (MULT_EXPR, TREE_TYPE (stride_step), stride_step,
10235 : build_int_cst (TREE_TYPE (stride_step), vf));
10236 :
10237 3430 : standard_iv_increment_position (loop, &incr_gsi, &insert_after);
10238 :
10239 3430 : stride_base = cse_and_gimplify_to_preheader (loop_vinfo, stride_base);
10240 3430 : ivstep = cse_and_gimplify_to_preheader (loop_vinfo, ivstep);
10241 3430 : create_iv (stride_base, PLUS_EXPR, ivstep, NULL,
10242 : loop, &incr_gsi, insert_after,
10243 : &offvar, NULL);
10244 :
10245 3430 : stride_step = cse_and_gimplify_to_preheader (loop_vinfo, stride_step);
10246 : }
10247 :
10248 23519 : running_off = offvar;
10249 23519 : alias_off = build_int_cst (ref_type, 0);
10250 23519 : int nloads = const_nunits;
10251 23519 : int lnel = 1;
10252 23519 : tree ltype = TREE_TYPE (vectype);
10253 23519 : tree lvectype = vectype;
10254 23519 : auto_vec<tree> dr_chain;
10255 : /* ??? Modify local copies of alignment_support_scheme and
10256 : misalignment, but this part of analysis should be done
10257 : earlier and remembered, likewise the chosen load mode. */
10258 23519 : const dr_alignment_support tem = alignment_support_scheme;
10259 23519 : dr_alignment_support alignment_support_scheme = tem;
10260 23519 : const int tem2 = misalignment;
10261 23519 : int misalignment = tem2;
10262 23519 : if (memory_access_type == VMAT_STRIDED_SLP)
10263 : {
10264 17002 : HOST_WIDE_INT n = gcd (group_size, const_nunits);
10265 : /* Use the target vector type if the group size is a multiple
10266 : of it. */
10267 17002 : if (n == const_nunits)
10268 : {
10269 2247 : int mis_align = dr_misalignment (first_dr_info, vectype);
10270 : /* With VF > 1 we advance the DR by step, if that is constant
10271 : and only aligned when performed VF times, DR alignment
10272 : analysis can analyze this as aligned since it assumes
10273 : contiguous accesses. But that is not how we code generate
10274 : here, so adjust for this. */
10275 2247 : if (maybe_gt (vf, 1u)
10276 3613 : && !multiple_p (DR_STEP_ALIGNMENT (first_dr_info->dr),
10277 3401 : DR_TARGET_ALIGNMENT (first_dr_info)))
10278 212 : mis_align = -1;
10279 2247 : dr_alignment_support dr_align
10280 2247 : = vect_supportable_dr_alignment (vinfo, dr_info, vectype,
10281 : mis_align);
10282 2247 : if (dr_align == dr_aligned
10283 2247 : || dr_align == dr_unaligned_supported)
10284 : {
10285 17002 : nloads = 1;
10286 17002 : lnel = const_nunits;
10287 17002 : ltype = vectype;
10288 17002 : alignment_support_scheme = dr_align;
10289 17002 : misalignment = mis_align;
10290 : }
10291 : }
10292 : /* Else use the biggest vector we can load the group without
10293 : accessing excess elements. */
10294 14755 : else if (n > 1)
10295 : {
10296 1965 : tree ptype;
10297 1965 : tree vtype
10298 1965 : = vector_vector_composition_type (vectype, const_nunits / n,
10299 : &ptype);
10300 1965 : if (vtype != NULL_TREE)
10301 : {
10302 1927 : dr_alignment_support dr_align;
10303 1927 : int mis_align = 0;
10304 1927 : if (VECTOR_TYPE_P (ptype))
10305 : {
10306 1005 : mis_align = dr_misalignment (first_dr_info, ptype);
10307 1005 : if (maybe_gt (vf, 1u)
10308 1980 : && !multiple_p (DR_STEP_ALIGNMENT (first_dr_info->dr),
10309 1011 : DR_TARGET_ALIGNMENT (first_dr_info)))
10310 969 : mis_align = -1;
10311 1005 : dr_align
10312 1005 : = vect_supportable_dr_alignment (vinfo, dr_info, ptype,
10313 : mis_align);
10314 : }
10315 : else
10316 : dr_align = dr_unaligned_supported;
10317 1927 : if (dr_align == dr_aligned
10318 1927 : || dr_align == dr_unaligned_supported)
10319 : {
10320 1927 : nloads = const_nunits / n;
10321 1927 : lnel = n;
10322 1927 : lvectype = vtype;
10323 1927 : ltype = ptype;
10324 1927 : alignment_support_scheme = dr_align;
10325 1927 : misalignment = mis_align;
10326 : }
10327 : }
10328 : }
10329 17002 : unsigned align;
10330 17002 : if (alignment_support_scheme == dr_aligned)
10331 20 : align = known_alignment (DR_TARGET_ALIGNMENT (first_dr_info));
10332 : else
10333 16982 : align = dr_alignment (vect_dr_behavior (vinfo, first_dr_info));
10334 : /* Alignment is at most the access size if we do multiple loads. */
10335 17002 : if (nloads > 1)
10336 14755 : align = MIN (tree_to_uhwi (TYPE_SIZE_UNIT (ltype)), align);
10337 17002 : ltype = build_aligned_type (ltype, align * BITS_PER_UNIT);
10338 : }
10339 :
10340 23519 : if (costing_p)
10341 : {
10342 : /* Record the composition type for target access during costing. */
10343 20089 : ls.ls_type = lvectype;
10344 20089 : ls.ls_eltype = ltype;
10345 : }
10346 : else
10347 3430 : gcc_assert (ls.ls_type == lvectype && ls.ls_eltype == ltype);
10348 :
10349 : /* For SLP permutation support we need to load the whole group,
10350 : not only the number of vector stmts the permutation result
10351 : fits in. */
10352 23519 : int ncopies;
10353 23519 : if (ls.slp_perm)
10354 : {
10355 2869 : gcc_assert (memory_access_type != VMAT_ELEMENTWISE);
10356 : /* We don't yet generate SLP_TREE_LOAD_PERMUTATIONs for
10357 : variable VF. */
10358 2869 : unsigned int const_vf = vf.to_constant ();
10359 2869 : ncopies = CEIL (group_size * const_vf, const_nunits);
10360 2869 : dr_chain.create (ncopies);
10361 : }
10362 : else
10363 : ncopies = vec_num;
10364 :
10365 23519 : unsigned int group_el = 0;
10366 23519 : unsigned HOST_WIDE_INT
10367 23519 : elsz = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
10368 23519 : unsigned int n_groups = 0;
10369 : /* For costing some adjacent vector loads, we'd like to cost with
10370 : the total number of them once instead of cost each one by one. */
10371 23519 : unsigned int n_adjacent_loads = 0;
10372 56276 : for (j = 0; j < ncopies; j++)
10373 : {
10374 32757 : if (nloads > 1 && !costing_p)
10375 3145 : vec_alloc (v, nloads);
10376 : gimple *new_stmt = NULL;
10377 137922 : for (i = 0; i < nloads; i++)
10378 : {
10379 105165 : if (costing_p)
10380 : {
10381 : /* For VMAT_ELEMENTWISE, just cost it as scalar_load to
10382 : avoid ICE, see PR110776. */
10383 95086 : if (VECTOR_TYPE_P (ltype)
10384 5822 : && memory_access_type != VMAT_ELEMENTWISE)
10385 5822 : n_adjacent_loads++;
10386 : else
10387 89264 : inside_cost += record_stmt_cost (cost_vec, 1, scalar_load,
10388 : slp_node, 0, vect_body);
10389 95086 : continue;
10390 : }
10391 10079 : unsigned int load_el = group_el;
10392 : /* For elementwise accesses apply a load permutation directly. */
10393 10079 : if (memory_access_type == VMAT_ELEMENTWISE
10394 10079 : && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
10395 2014 : load_el = SLP_TREE_LOAD_PERMUTATION (slp_node)[group_el];
10396 10079 : tree this_off = build_int_cst (TREE_TYPE (alias_off),
10397 10079 : load_el * elsz + cst_offset);
10398 10079 : tree data_ref = build2 (MEM_REF, ltype, running_off, this_off);
10399 10079 : vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
10400 10079 : new_temp = make_ssa_name (ltype);
10401 10079 : new_stmt = gimple_build_assign (new_temp, data_ref);
10402 10079 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
10403 10079 : if (nloads > 1)
10404 8472 : CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_temp);
10405 :
10406 10079 : group_el += lnel;
10407 10079 : if (group_el == group_size)
10408 : {
10409 9732 : n_groups++;
10410 : /* When doing SLP make sure to not load elements from
10411 : the next vector iteration, those will not be accessed
10412 : so just use the last element again. See PR107451. */
10413 9732 : if (known_lt (n_groups, vf))
10414 : {
10415 6282 : tree newoff = copy_ssa_name (running_off);
10416 6282 : gimple *incr
10417 6282 : = gimple_build_assign (newoff, POINTER_PLUS_EXPR,
10418 : running_off, stride_step);
10419 6282 : vect_finish_stmt_generation (vinfo, stmt_info, incr, gsi);
10420 6282 : running_off = newoff;
10421 : }
10422 : group_el = 0;
10423 : }
10424 : }
10425 :
10426 32757 : if (nloads > 1)
10427 : {
10428 24020 : if (costing_p)
10429 20875 : inside_cost += record_stmt_cost (cost_vec, 1, vec_construct,
10430 : slp_node, 0, vect_body);
10431 : else
10432 : {
10433 3145 : tree vec_inv = build_constructor (lvectype, v);
10434 3145 : new_temp = vect_init_vector (vinfo, stmt_info, vec_inv,
10435 : lvectype, gsi);
10436 3145 : new_stmt = SSA_NAME_DEF_STMT (new_temp);
10437 3145 : if (lvectype != vectype)
10438 : {
10439 398 : new_stmt
10440 398 : = gimple_build_assign (make_ssa_name (vectype),
10441 : VIEW_CONVERT_EXPR,
10442 : build1 (VIEW_CONVERT_EXPR,
10443 : vectype, new_temp));
10444 398 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt,
10445 : gsi);
10446 : }
10447 : }
10448 : }
10449 8737 : else if (!costing_p && ltype != vectype)
10450 : {
10451 1588 : new_stmt = gimple_build_assign (make_ssa_name (vectype),
10452 : VIEW_CONVERT_EXPR,
10453 : build1 (VIEW_CONVERT_EXPR,
10454 : vectype, new_temp));
10455 1588 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt,
10456 : gsi);
10457 : }
10458 :
10459 32757 : if (!costing_p)
10460 : {
10461 4752 : if (ls.slp_perm)
10462 1682 : dr_chain.quick_push (gimple_assign_lhs (new_stmt));
10463 : else
10464 3070 : slp_node->push_vec_def (new_stmt);
10465 : }
10466 : }
10467 23519 : if (ls.slp_perm)
10468 : {
10469 2869 : if (costing_p)
10470 : {
10471 2076 : gcc_assert (ls.n_perms != -1U);
10472 2076 : inside_cost += record_stmt_cost (cost_vec, ls.n_perms, vec_perm,
10473 : slp_node, 0, vect_body);
10474 : }
10475 : else
10476 : {
10477 793 : unsigned n_perms2;
10478 793 : vect_transform_slp_perm_load (vinfo, slp_node, dr_chain, gsi, vf,
10479 : false, &n_perms2);
10480 793 : gcc_assert (ls.n_perms == n_perms2);
10481 : }
10482 : }
10483 :
10484 23519 : if (costing_p)
10485 : {
10486 20089 : if (n_adjacent_loads > 0)
10487 2152 : vect_get_load_cost (vinfo, stmt_info, slp_node, n_adjacent_loads,
10488 : alignment_support_scheme, misalignment, false,
10489 : &inside_cost, nullptr, cost_vec, cost_vec,
10490 : true);
10491 20089 : if (dump_enabled_p ())
10492 498 : dump_printf_loc (MSG_NOTE, vect_location,
10493 : "vect_model_load_cost: inside_cost = %u, "
10494 : "prologue_cost = 0 .\n",
10495 : inside_cost);
10496 20089 : SLP_TREE_TYPE (slp_node) = load_vec_info_type;
10497 20089 : slp_node->data = new vect_load_store_data (std::move (ls));
10498 : }
10499 :
10500 23519 : return true;
10501 23519 : }
10502 :
10503 573867 : if (mat_gather_scatter_p (memory_access_type)
10504 573867 : && !ls.ls_type)
10505 : grouped_load = false;
10506 :
10507 570897 : if (grouped_load
10508 573867 : || SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
10509 : {
10510 261625 : if (grouped_load)
10511 : {
10512 261183 : first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
10513 261183 : group_size = DR_GROUP_SIZE (first_stmt_info);
10514 : }
10515 : else
10516 : {
10517 : first_stmt_info = stmt_info;
10518 : group_size = 1;
10519 : }
10520 : /* For SLP vectorization we directly vectorize a subchain
10521 : without permutation. */
10522 261625 : if (! SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
10523 208631 : first_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[0];
10524 : /* For BB vectorization always use the first stmt to base
10525 : the data ref pointer on. */
10526 261625 : if (bb_vinfo)
10527 207027 : first_stmt_info_for_drptr
10528 207027 : = vect_find_first_scalar_stmt_in_slp (slp_node);
10529 :
10530 261625 : first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
10531 261625 : group_gap_adj = 0;
10532 :
10533 : /* VEC_NUM is the number of vect stmts to be created for this group. */
10534 261625 : grouped_load = false;
10535 : /* If an SLP permutation is from N elements to N elements,
10536 : and if one vector holds a whole number of N, we can load
10537 : the inputs to the permutation in the same way as an
10538 : unpermuted sequence. In other cases we need to load the
10539 : whole group, not only the number of vector stmts the
10540 : permutation result fits in. */
10541 261625 : unsigned scalar_lanes = SLP_TREE_LANES (slp_node);
10542 261625 : if (nested_in_vect_loop)
10543 : /* We do not support grouped accesses in a nested loop,
10544 : instead the access is contiguous but it might be
10545 : permuted. No gap adjustment is needed though. */
10546 : ;
10547 261623 : else if (ls.slp_perm
10548 261623 : && (group_size != scalar_lanes
10549 11204 : || !multiple_p (nunits, group_size)))
10550 : {
10551 : /* We don't yet generate such SLP_TREE_LOAD_PERMUTATIONs for
10552 : variable VF; see vect_transform_slp_perm_load. */
10553 42966 : unsigned int const_vf = vf.to_constant ();
10554 42966 : unsigned int const_nunits = nunits.to_constant ();
10555 42966 : vec_num = CEIL (group_size * const_vf, const_nunits);
10556 42966 : group_gap_adj = vf * group_size - nunits * vec_num;
10557 : }
10558 : else
10559 : {
10560 218657 : group_gap_adj = group_size - scalar_lanes;
10561 : }
10562 :
10563 261625 : ref_type = get_group_alias_ptr_type (first_stmt_info);
10564 : }
10565 : else
10566 : {
10567 312242 : first_stmt_info = stmt_info;
10568 312242 : first_dr_info = dr_info;
10569 312242 : group_size = 1;
10570 312242 : group_gap_adj = 0;
10571 312242 : ref_type = reference_alias_ptr_type (DR_REF (first_dr_info->dr));
10572 : }
10573 :
10574 573867 : vec_loop_masks *loop_masks
10575 366840 : = (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
10576 573867 : ? &LOOP_VINFO_MASKS (loop_vinfo)
10577 31 : : NULL);
10578 31 : vec_loop_lens *loop_lens
10579 366840 : = (loop_vinfo && LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)
10580 : ? &LOOP_VINFO_LENS (loop_vinfo)
10581 0 : : NULL);
10582 :
10583 : /* The vect_transform_stmt and vect_analyze_stmt will go here but there
10584 : are some differences here. We cannot enable both the lens and masks
10585 : during transform but it is allowed during analysis.
10586 : Shouldn't go with length-based approach if fully masked. */
10587 573867 : if (cost_vec == NULL)
10588 : /* The cost_vec is NULL during transform. */
10589 162412 : gcc_assert ((!loop_lens || !loop_masks));
10590 :
10591 : /* Targets with store-lane instructions must not require explicit
10592 : realignment. vect_supportable_dr_alignment always returns either
10593 : dr_aligned or dr_unaligned_supported for (non-length) masked
10594 : operations. */
10595 573867 : gcc_assert ((memory_access_type != VMAT_LOAD_STORE_LANES
10596 : && !mask_node
10597 : && !loop_masks)
10598 : || mat_gather_scatter_p (memory_access_type)
10599 : || alignment_support_scheme == dr_aligned
10600 : || alignment_support_scheme == dr_unaligned_supported);
10601 :
10602 : /* In case the vectorization factor (VF) is bigger than the number
10603 : of elements that we can fit in a vectype (nunits), we have to generate
10604 : more than one vector stmt, i.e., we need to "unroll" the
10605 : vector stmt by a factor VF/nunits. In doing so, we record a pointer
10606 : from one copy of the vector stmt to the next, in the field
10607 : STMT_VINFO_RELATED_STMT. This is necessary in order to allow following
10608 : stages to find the correct vector defs to be used when vectorizing
10609 : stmts that use the defs of the current stmt. The example below
10610 : illustrates the vectorization process when VF=16 and nunits=4 (i.e., we
10611 : need to create 4 vectorized stmts):
10612 :
10613 : before vectorization:
10614 : RELATED_STMT VEC_STMT
10615 : S1: x = memref - -
10616 : S2: z = x + 1 - -
10617 :
10618 : step 1: vectorize stmt S1:
10619 : We first create the vector stmt VS1_0, and, as usual, record a
10620 : pointer to it in the STMT_VINFO_VEC_STMT of the scalar stmt S1.
10621 : Next, we create the vector stmt VS1_1, and record a pointer to
10622 : it in the STMT_VINFO_RELATED_STMT of the vector stmt VS1_0.
10623 : Similarly, for VS1_2 and VS1_3. This is the resulting chain of
10624 : stmts and pointers:
10625 : RELATED_STMT VEC_STMT
10626 : VS1_0: vx0 = memref0 VS1_1 -
10627 : VS1_1: vx1 = memref1 VS1_2 -
10628 : VS1_2: vx2 = memref2 VS1_3 -
10629 : VS1_3: vx3 = memref3 - -
10630 : S1: x = load - VS1_0
10631 : S2: z = x + 1 - -
10632 : */
10633 :
10634 : /* If the data reference is aligned (dr_aligned) or potentially unaligned
10635 : on a target that supports unaligned accesses (dr_unaligned_supported)
10636 : we generate the following code:
10637 : p = initial_addr;
10638 : indx = 0;
10639 : loop {
10640 : p = p + indx * vectype_size;
10641 : vec_dest = *(p);
10642 : indx = indx + 1;
10643 : }
10644 :
10645 : Otherwise, the data reference is potentially unaligned on a target that
10646 : does not support unaligned accesses (dr_explicit_realign_optimized) -
10647 : then generate the following code, in which the data in each iteration is
10648 : obtained by two vector loads, one from the previous iteration, and one
10649 : from the current iteration:
10650 : p1 = initial_addr;
10651 : msq_init = *(floor(p1))
10652 : p2 = initial_addr + VS - 1;
10653 : realignment_token = call target_builtin;
10654 : indx = 0;
10655 : loop {
10656 : p2 = p2 + indx * vectype_size
10657 : lsq = *(floor(p2))
10658 : vec_dest = realign_load (msq, lsq, realignment_token)
10659 : indx = indx + 1;
10660 : msq = lsq;
10661 : } */
10662 :
10663 : /* If the misalignment remains the same throughout the execution of the
10664 : loop, we can create the init_addr and permutation mask at the loop
10665 : preheader. Otherwise, it needs to be created inside the loop.
10666 : This can only occur when vectorizing memory accesses in the inner-loop
10667 : nested within an outer-loop that is being vectorized. */
10668 :
10669 573867 : if (nested_in_vect_loop
10670 573867 : && !multiple_p (DR_STEP_ALIGNMENT (dr_info->dr),
10671 1234 : GET_MODE_SIZE (TYPE_MODE (vectype))))
10672 : {
10673 195 : gcc_assert (alignment_support_scheme != dr_explicit_realign_optimized);
10674 : compute_in_loop = true;
10675 : }
10676 :
10677 573867 : bool diff_first_stmt_info
10678 573867 : = first_stmt_info_for_drptr && first_stmt_info != first_stmt_info_for_drptr;
10679 :
10680 573867 : tree offset = NULL_TREE;
10681 573867 : if ((alignment_support_scheme == dr_explicit_realign_optimized
10682 573867 : || alignment_support_scheme == dr_explicit_realign)
10683 0 : && !compute_in_loop)
10684 : {
10685 : /* If we have different first_stmt_info, we can't set up realignment
10686 : here, since we can't guarantee first_stmt_info DR has been
10687 : initialized yet, use first_stmt_info_for_drptr DR by bumping the
10688 : distance from first_stmt_info DR instead as below. */
10689 0 : if (!costing_p)
10690 : {
10691 0 : if (!diff_first_stmt_info)
10692 0 : msq = vect_setup_realignment (vinfo, first_stmt_info, vectype, gsi,
10693 : &realignment_token,
10694 : alignment_support_scheme, NULL_TREE,
10695 : &at_loop);
10696 0 : if (alignment_support_scheme == dr_explicit_realign_optimized)
10697 : {
10698 0 : phi = as_a<gphi *> (SSA_NAME_DEF_STMT (msq));
10699 0 : offset = size_binop (MINUS_EXPR, TYPE_SIZE_UNIT (vectype),
10700 : size_one_node);
10701 0 : gcc_assert (!first_stmt_info_for_drptr);
10702 : }
10703 : }
10704 : }
10705 : else
10706 573867 : at_loop = loop;
10707 :
10708 573867 : if (!known_eq (poffset, 0))
10709 4626 : offset = (offset
10710 4626 : ? size_binop (PLUS_EXPR, offset, size_int (poffset))
10711 4626 : : size_int (poffset));
10712 :
10713 573867 : tree bump;
10714 573867 : tree vec_offset = NULL_TREE;
10715 :
10716 573867 : auto_vec<tree> vec_offsets;
10717 573867 : auto_vec<tree> vec_masks;
10718 573867 : if (mask_node && !costing_p)
10719 636 : vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[mask_index],
10720 : &vec_masks);
10721 :
10722 573867 : tree vec_mask = NULL_TREE;
10723 573867 : tree vec_els = NULL_TREE;
10724 573867 : if (memory_access_type == VMAT_LOAD_STORE_LANES)
10725 : {
10726 0 : const internal_fn lanes_ifn = ls.lanes_ifn;
10727 :
10728 0 : gcc_assert (alignment_support_scheme == dr_aligned
10729 : || alignment_support_scheme == dr_unaligned_supported);
10730 :
10731 0 : aggr_type = build_array_type_nelts (elem_type, group_size * nunits);
10732 0 : if (!costing_p)
10733 0 : bump = vect_get_data_ptr_increment (vinfo, gsi, dr_info, aggr_type,
10734 : memory_access_type, loop_lens);
10735 :
10736 0 : unsigned int inside_cost = 0, prologue_cost = 0;
10737 : /* For costing some adjacent vector loads, we'd like to cost with
10738 : the total number of them once instead of costing each one by one. */
10739 0 : unsigned int n_adjacent_loads = 0;
10740 0 : int ncopies = vec_num / group_size;
10741 0 : for (j = 0; j < ncopies; j++)
10742 : {
10743 0 : if (costing_p)
10744 : {
10745 : /* An IFN_LOAD_LANES will load all its vector results,
10746 : regardless of which ones we actually need. Account
10747 : for the cost of unused results. */
10748 0 : if (first_stmt_info == stmt_info)
10749 : {
10750 0 : unsigned int gaps = DR_GROUP_SIZE (first_stmt_info);
10751 0 : stmt_vec_info next_stmt_info = first_stmt_info;
10752 0 : do
10753 : {
10754 0 : gaps -= 1;
10755 0 : next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
10756 : }
10757 0 : while (next_stmt_info);
10758 0 : if (gaps)
10759 : {
10760 0 : if (dump_enabled_p ())
10761 0 : dump_printf_loc (MSG_NOTE, vect_location,
10762 : "vect_model_load_cost: %d "
10763 : "unused vectors.\n",
10764 : gaps);
10765 0 : vect_get_load_cost (vinfo, stmt_info, slp_node, gaps,
10766 : alignment_support_scheme,
10767 : misalignment, false, &inside_cost,
10768 : &prologue_cost, cost_vec, cost_vec,
10769 : true);
10770 : }
10771 : }
10772 0 : n_adjacent_loads++;
10773 0 : continue;
10774 0 : }
10775 :
10776 : /* 1. Create the vector or array pointer update chain. */
10777 0 : if (j == 0)
10778 0 : dataref_ptr
10779 0 : = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type,
10780 : at_loop, offset, &dummy, gsi,
10781 : &ptr_incr, false, bump);
10782 : else
10783 : {
10784 0 : gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
10785 0 : dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
10786 : stmt_info, bump);
10787 : }
10788 0 : if (mask_node)
10789 0 : vec_mask = vec_masks[j];
10790 :
10791 0 : tree vec_array = create_vector_array (vectype, group_size);
10792 :
10793 0 : tree final_mask = NULL_TREE;
10794 0 : tree final_len = NULL_TREE;
10795 0 : tree bias = NULL_TREE;
10796 0 : if (loop_masks)
10797 0 : final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
10798 : ncopies, vectype, j);
10799 0 : if (vec_mask)
10800 0 : final_mask = prepare_vec_mask (loop_vinfo, mask_vectype, final_mask,
10801 : vec_mask, gsi);
10802 :
10803 0 : if (lanes_ifn == IFN_MASK_LEN_LOAD_LANES)
10804 : {
10805 0 : if (loop_lens)
10806 0 : final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
10807 : ncopies, vectype, j, 1, true);
10808 : else
10809 0 : final_len = size_int (TYPE_VECTOR_SUBPARTS (vectype));
10810 0 : signed char biasval
10811 0 : = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
10812 0 : bias = build_int_cst (intQI_type_node, biasval);
10813 0 : if (!final_mask)
10814 : {
10815 0 : mask_vectype = truth_type_for (vectype);
10816 0 : final_mask = build_minus_one_cst (mask_vectype);
10817 : }
10818 : }
10819 :
10820 0 : if (final_mask)
10821 : {
10822 0 : vec_els = vect_get_mask_load_else (maskload_elsval, vectype);
10823 0 : if (type_mode_padding_p
10824 0 : && maskload_elsval != MASK_LOAD_ELSE_ZERO)
10825 0 : need_zeroing = true;
10826 : }
10827 :
10828 0 : gcall *call;
10829 0 : if (final_len && final_mask)
10830 : {
10831 : /* Emit:
10832 : VEC_ARRAY = MASK_LEN_LOAD_LANES (DATAREF_PTR, ALIAS_PTR,
10833 : VEC_MASK, LEN, BIAS). */
10834 0 : unsigned int align = TYPE_ALIGN (TREE_TYPE (vectype));
10835 0 : tree alias_ptr = build_int_cst (ref_type, align);
10836 0 : call = gimple_build_call_internal (IFN_MASK_LEN_LOAD_LANES, 6,
10837 : dataref_ptr, alias_ptr,
10838 : final_mask, vec_els,
10839 : final_len, bias);
10840 : }
10841 0 : else if (final_mask)
10842 : {
10843 : /* Emit:
10844 : VEC_ARRAY = MASK_LOAD_LANES (DATAREF_PTR, ALIAS_PTR,
10845 : VEC_MASK). */
10846 0 : unsigned int align = TYPE_ALIGN (TREE_TYPE (vectype));
10847 0 : tree alias_ptr = build_int_cst (ref_type, align);
10848 0 : call = gimple_build_call_internal (IFN_MASK_LOAD_LANES, 4,
10849 : dataref_ptr, alias_ptr,
10850 : final_mask, vec_els);
10851 : }
10852 : else
10853 : {
10854 : /* Emit:
10855 : VEC_ARRAY = LOAD_LANES (MEM_REF[...all elements...]). */
10856 0 : data_ref = create_array_ref (aggr_type, dataref_ptr, ref_type);
10857 0 : call = gimple_build_call_internal (IFN_LOAD_LANES, 1, data_ref);
10858 : }
10859 0 : gimple_call_set_lhs (call, vec_array);
10860 0 : gimple_call_set_nothrow (call, true);
10861 0 : vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
10862 :
10863 : /* Extract each vector into an SSA_NAME. */
10864 0 : for (unsigned i = 0; i < group_size; i++)
10865 : {
10866 0 : new_temp = read_vector_array (vinfo, stmt_info, gsi, scalar_dest,
10867 : vec_array, i, need_zeroing,
10868 : final_mask);
10869 0 : slp_node->push_vec_def (new_temp);
10870 : }
10871 :
10872 : /* Record that VEC_ARRAY is now dead. */
10873 0 : vect_clobber_variable (vinfo, stmt_info, gsi, vec_array);
10874 : }
10875 :
10876 0 : if (costing_p)
10877 : {
10878 0 : if (n_adjacent_loads > 0)
10879 0 : vect_get_load_cost (vinfo, stmt_info, slp_node, n_adjacent_loads,
10880 : alignment_support_scheme, misalignment, false,
10881 : &inside_cost, &prologue_cost, cost_vec,
10882 : cost_vec, true);
10883 0 : if (dump_enabled_p ())
10884 0 : dump_printf_loc (MSG_NOTE, vect_location,
10885 : "vect_model_load_cost: inside_cost = %u, "
10886 : "prologue_cost = %u .\n",
10887 : inside_cost, prologue_cost);
10888 0 : SLP_TREE_TYPE (slp_node) = load_vec_info_type;
10889 0 : slp_node->data = new vect_load_store_data (std::move (ls));
10890 : }
10891 :
10892 0 : return true;
10893 : }
10894 :
10895 573867 : if (mat_gather_scatter_p (memory_access_type))
10896 : {
10897 2970 : gcc_assert ((!grouped_load && !ls.slp_perm) || ls.ls_type);
10898 :
10899 2970 : auto_vec<tree> dr_chain (vec_num);
10900 :
10901 : /* If we pun the original vectype the loads as well as costing, length,
10902 : etc. is performed with the new type. After loading we VIEW_CONVERT
10903 : the data to the original vectype. */
10904 2970 : tree original_vectype = vectype;
10905 2970 : if (ls.ls_type)
10906 0 : vectype = ls.ls_type;
10907 :
10908 : /* 1. Create the vector or array pointer update chain. */
10909 2970 : if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
10910 : {
10911 2970 : aggr_type = NULL_TREE;
10912 2970 : bump = NULL_TREE;
10913 2970 : if (!costing_p)
10914 763 : vect_get_gather_scatter_ops (loop, slp_node, &dataref_ptr,
10915 : &vec_offsets);
10916 : }
10917 : else
10918 : {
10919 0 : aggr_type = elem_type;
10920 0 : if (!costing_p)
10921 : {
10922 0 : vect_get_strided_load_store_ops (stmt_info, slp_node, vectype,
10923 : ls.strided_offset_vectype,
10924 : loop_vinfo, gsi,
10925 : &bump, &vec_offset, loop_lens);
10926 0 : dataref_ptr
10927 0 : = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type,
10928 : at_loop, offset, &dummy, gsi,
10929 : &ptr_incr, false, bump);
10930 : }
10931 : }
10932 :
10933 : unsigned int inside_cost = 0, prologue_cost = 0;
10934 :
10935 6713 : gimple *new_stmt = NULL;
10936 6713 : for (i = 0; i < vec_num; i++)
10937 : {
10938 3743 : tree final_mask = NULL_TREE;
10939 3743 : tree final_len = NULL_TREE;
10940 3743 : tree bias = NULL_TREE;
10941 3743 : if (!costing_p)
10942 : {
10943 980 : if (mask_node)
10944 153 : vec_mask = vec_masks[i];
10945 980 : if (loop_masks)
10946 0 : final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
10947 : vec_num, vectype, i);
10948 980 : if (vec_mask)
10949 153 : final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
10950 : final_mask, vec_mask, gsi);
10951 :
10952 980 : if (i > 0 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
10953 0 : dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
10954 : gsi, stmt_info, bump);
10955 : }
10956 :
10957 : /* 2. Create the vector-load in the loop. */
10958 3743 : unsigned align = get_object_alignment (DR_REF (first_dr_info->dr));
10959 3743 : tree alias_align_ptr = build_int_cst (ref_type, align);
10960 3743 : if (memory_access_type == VMAT_GATHER_SCATTER_IFN)
10961 : {
10962 0 : if (costing_p)
10963 : {
10964 0 : if (ls.supported_offset_vectype
10965 0 : && !tree_nop_conversion_p (ls.supported_offset_vectype,
10966 : vec_offset))
10967 0 : inside_cost
10968 0 : += record_stmt_cost (cost_vec, 1, vector_stmt,
10969 : slp_node, 0, vect_body);
10970 0 : if (ls.supported_scale)
10971 0 : inside_cost
10972 0 : += record_stmt_cost (cost_vec, 1, vector_stmt,
10973 : slp_node, 0, vect_body);
10974 :
10975 0 : unsigned int cnunits = vect_nunits_for_cost (vectype);
10976 0 : inside_cost
10977 0 : = record_stmt_cost (cost_vec, cnunits, scalar_load,
10978 : slp_node, 0, vect_body);
10979 3743 : continue;
10980 0 : }
10981 0 : if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
10982 0 : vec_offset = vec_offsets[i];
10983 0 : tree zero = build_zero_cst (vectype);
10984 0 : tree scale = size_int (SLP_TREE_GS_SCALE (slp_node));
10985 0 : bool strided = !VECTOR_TYPE_P (TREE_TYPE (vec_offset));
10986 :
10987 : /* Perform the offset conversion and scaling if necessary. */
10988 0 : if (!strided
10989 0 : && (ls.supported_offset_vectype || ls.supported_scale))
10990 : {
10991 0 : gimple_seq stmts = NULL;
10992 0 : if (ls.supported_offset_vectype)
10993 0 : vec_offset = gimple_convert
10994 0 : (&stmts, ls.supported_offset_vectype, vec_offset);
10995 0 : if (ls.supported_scale)
10996 : {
10997 : /* Only scale the vec_offset if we haven't already. */
10998 0 : if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
10999 0 : || i == 0)
11000 : {
11001 0 : tree mult_cst = build_int_cst
11002 0 : (TREE_TYPE (TREE_TYPE (vec_offset)),
11003 0 : SLP_TREE_GS_SCALE (slp_node) / ls.supported_scale);
11004 0 : tree mult = build_vector_from_val
11005 0 : (TREE_TYPE (vec_offset), mult_cst);
11006 0 : vec_offset = gimple_build
11007 0 : (&stmts, MULT_EXPR, TREE_TYPE (vec_offset),
11008 : vec_offset, mult);
11009 : }
11010 0 : scale = size_int (ls.supported_scale);
11011 : }
11012 0 : gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
11013 : }
11014 :
11015 0 : if (ls.gs.ifn == IFN_MASK_LEN_GATHER_LOAD)
11016 : {
11017 0 : if (loop_lens)
11018 0 : final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
11019 : vec_num, vectype, i, 1, true);
11020 : else
11021 0 : final_len = build_int_cst (sizetype,
11022 0 : TYPE_VECTOR_SUBPARTS (vectype));
11023 0 : signed char biasval
11024 0 : = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
11025 0 : bias = build_int_cst (intQI_type_node, biasval);
11026 0 : if (!final_mask)
11027 : {
11028 0 : mask_vectype = truth_type_for (vectype);
11029 0 : final_mask = build_minus_one_cst (mask_vectype);
11030 : }
11031 : }
11032 :
11033 0 : if (final_mask)
11034 : {
11035 0 : vec_els = vect_get_mask_load_else (maskload_elsval, vectype);
11036 0 : if (type_mode_padding_p
11037 0 : && maskload_elsval != MASK_LOAD_ELSE_ZERO)
11038 0 : need_zeroing = true;
11039 : }
11040 :
11041 0 : gcall *call;
11042 0 : if (final_len && final_mask)
11043 : {
11044 0 : if (VECTOR_TYPE_P (TREE_TYPE (vec_offset)))
11045 0 : call = gimple_build_call_internal (IFN_MASK_LEN_GATHER_LOAD,
11046 : 9, dataref_ptr,
11047 : alias_align_ptr,
11048 : vec_offset, scale, zero,
11049 : final_mask, vec_els,
11050 : final_len, bias);
11051 : else
11052 : /* A non-vector offset indicates that we prefer to use
11053 : MASK_LEN_STRIDED_LOAD, with a direct stride argument,
11054 : instead of MASK_LEN_GATHER_LOAD. */
11055 0 : call = gimple_build_call_internal
11056 0 : (IFN_MASK_LEN_STRIDED_LOAD, 7, dataref_ptr,
11057 : vec_offset, zero, final_mask, vec_els, final_len,
11058 : bias);
11059 : }
11060 0 : else if (final_mask)
11061 0 : call = gimple_build_call_internal (IFN_MASK_GATHER_LOAD,
11062 : 7, dataref_ptr,
11063 : alias_align_ptr,
11064 : vec_offset, scale,
11065 : zero, final_mask, vec_els);
11066 : else
11067 0 : call = gimple_build_call_internal (IFN_GATHER_LOAD, 5,
11068 : dataref_ptr,
11069 : alias_align_ptr,
11070 : vec_offset, scale, zero);
11071 0 : gimple_call_set_nothrow (call, true);
11072 0 : new_stmt = call;
11073 0 : data_ref = NULL_TREE;
11074 : }
11075 3743 : else if (memory_access_type == VMAT_GATHER_SCATTER_LEGACY)
11076 : {
11077 : /* The builtin decls path for gather is legacy, x86 only. */
11078 849 : gcc_assert (!final_len && nunits.is_constant ());
11079 849 : if (costing_p)
11080 : {
11081 566 : unsigned int cnunits = vect_nunits_for_cost (vectype);
11082 566 : inside_cost
11083 566 : = record_stmt_cost (cost_vec, cnunits, scalar_load,
11084 : slp_node, 0, vect_body);
11085 566 : continue;
11086 566 : }
11087 283 : tree offset_vectype = TREE_TYPE (vec_offsets[0]);
11088 283 : poly_uint64 offset_nunits = TYPE_VECTOR_SUBPARTS (offset_vectype);
11089 283 : if (known_eq (nunits, offset_nunits))
11090 : {
11091 134 : new_stmt = vect_build_one_gather_load_call
11092 134 : (vinfo, stmt_info, slp_node, vectype, gsi,
11093 134 : ls.gs.decl, dataref_ptr, vec_offsets[i],
11094 : final_mask);
11095 134 : data_ref = NULL_TREE;
11096 : }
11097 149 : else if (known_eq (nunits, offset_nunits * 2))
11098 : {
11099 : /* We have an offset vector with half the number of
11100 : lanes but the builtins will produce full vectype
11101 : data with just the lower lanes filled. */
11102 63 : new_stmt = vect_build_one_gather_load_call
11103 126 : (vinfo, stmt_info, slp_node, vectype, gsi,
11104 63 : ls.gs.decl, dataref_ptr, vec_offsets[2 * i],
11105 : final_mask);
11106 63 : tree low = make_ssa_name (vectype);
11107 63 : gimple_set_lhs (new_stmt, low);
11108 63 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
11109 :
11110 : /* now put upper half of final_mask in final_mask low. */
11111 63 : if (final_mask
11112 63 : && !SCALAR_INT_MODE_P (TYPE_MODE (TREE_TYPE (final_mask))))
11113 : {
11114 11 : int count = nunits.to_constant ();
11115 11 : vec_perm_builder sel (count, count, 1);
11116 11 : sel.quick_grow (count);
11117 87 : for (int i = 0; i < count; ++i)
11118 76 : sel[i] = i | (count / 2);
11119 11 : vec_perm_indices indices (sel, 2, count);
11120 11 : tree perm_mask = vect_gen_perm_mask_checked
11121 11 : (TREE_TYPE (final_mask), indices);
11122 11 : new_stmt = gimple_build_assign (NULL_TREE, VEC_PERM_EXPR,
11123 : final_mask, final_mask,
11124 : perm_mask);
11125 11 : final_mask = make_ssa_name (TREE_TYPE (final_mask));
11126 11 : gimple_set_lhs (new_stmt, final_mask);
11127 11 : vect_finish_stmt_generation (vinfo, stmt_info,
11128 : new_stmt, gsi);
11129 11 : }
11130 52 : else if (final_mask)
11131 : {
11132 24 : new_stmt = gimple_build_assign (NULL_TREE,
11133 : VEC_UNPACK_HI_EXPR,
11134 : final_mask);
11135 24 : final_mask = make_ssa_name
11136 24 : (truth_type_for (offset_vectype));
11137 24 : gimple_set_lhs (new_stmt, final_mask);
11138 24 : vect_finish_stmt_generation (vinfo, stmt_info,
11139 : new_stmt, gsi);
11140 : }
11141 :
11142 63 : new_stmt = vect_build_one_gather_load_call
11143 126 : (vinfo, stmt_info, slp_node, vectype, gsi,
11144 : ls.gs.decl, dataref_ptr,
11145 63 : vec_offsets[2 * i + 1], final_mask);
11146 63 : tree high = make_ssa_name (vectype);
11147 63 : gimple_set_lhs (new_stmt, high);
11148 63 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
11149 :
11150 : /* compose low + high. */
11151 63 : int count = nunits.to_constant ();
11152 63 : vec_perm_builder sel (count, count, 1);
11153 63 : sel.quick_grow (count);
11154 647 : for (int i = 0; i < count; ++i)
11155 584 : sel[i] = i < count / 2 ? i : i + count / 2;
11156 63 : vec_perm_indices indices (sel, 2, count);
11157 63 : tree perm_mask
11158 63 : = vect_gen_perm_mask_checked (vectype, indices);
11159 63 : new_stmt = gimple_build_assign (NULL_TREE, VEC_PERM_EXPR,
11160 : low, high, perm_mask);
11161 63 : data_ref = NULL_TREE;
11162 63 : }
11163 86 : else if (known_eq (nunits * 2, offset_nunits))
11164 : {
11165 : /* We have an offset vector with double the number of
11166 : lanes. Select the low/high part accordingly. */
11167 86 : vec_offset = vec_offsets[i / 2];
11168 86 : if (i & 1)
11169 : {
11170 43 : int count = offset_nunits.to_constant ();
11171 43 : vec_perm_builder sel (count, count, 1);
11172 43 : sel.quick_grow (count);
11173 463 : for (int i = 0; i < count; ++i)
11174 420 : sel[i] = i | (count / 2);
11175 43 : vec_perm_indices indices (sel, 2, count);
11176 43 : tree perm_mask = vect_gen_perm_mask_checked
11177 43 : (TREE_TYPE (vec_offset), indices);
11178 43 : new_stmt = gimple_build_assign (NULL_TREE, VEC_PERM_EXPR,
11179 : vec_offset, vec_offset,
11180 : perm_mask);
11181 43 : vec_offset = make_ssa_name (TREE_TYPE (vec_offset));
11182 43 : gimple_set_lhs (new_stmt, vec_offset);
11183 43 : vect_finish_stmt_generation (vinfo, stmt_info,
11184 : new_stmt, gsi);
11185 43 : }
11186 86 : new_stmt = vect_build_one_gather_load_call
11187 86 : (vinfo, stmt_info, slp_node, vectype, gsi,
11188 : ls.gs.decl,
11189 : dataref_ptr, vec_offset, final_mask);
11190 86 : data_ref = NULL_TREE;
11191 : }
11192 : else
11193 0 : gcc_unreachable ();
11194 : }
11195 : else
11196 : {
11197 : /* Emulated gather-scatter. */
11198 2894 : gcc_assert (!final_mask);
11199 2894 : unsigned HOST_WIDE_INT const_nunits = nunits.to_constant ();
11200 2894 : if (costing_p)
11201 : {
11202 : /* For emulated gathers, the offset vector element extracts
11203 : (we assume the scaling and ptr + offset add is consumed by the load). */
11204 2197 : inside_cost = record_stmt_cost (cost_vec, 1, vec_deconstruct,
11205 : slp_node, 0, vect_body);
11206 : /* N scalar loads plus gathering them into a
11207 : vector. */
11208 2197 : inside_cost
11209 2197 : = record_stmt_cost (cost_vec, const_nunits, scalar_load,
11210 : slp_node, 0, vect_body);
11211 2197 : inside_cost
11212 2197 : = record_stmt_cost (cost_vec, 1, vec_construct,
11213 : slp_node, 0, vect_body);
11214 2197 : continue;
11215 : }
11216 697 : tree offset_vectype = TREE_TYPE (vec_offsets[0]);
11217 697 : unsigned HOST_WIDE_INT const_offset_nunits
11218 697 : = TYPE_VECTOR_SUBPARTS (offset_vectype).to_constant ();
11219 697 : vec<constructor_elt, va_gc> *ctor_elts;
11220 697 : vec_alloc (ctor_elts, const_nunits);
11221 697 : gimple_seq stmts = NULL;
11222 : /* We support offset vectors with more elements
11223 : than the data vector for now. */
11224 697 : unsigned HOST_WIDE_INT factor
11225 : = const_offset_nunits / const_nunits;
11226 697 : vec_offset = vec_offsets[i / factor];
11227 697 : unsigned elt_offset = (i % factor) * const_nunits;
11228 697 : tree idx_type = TREE_TYPE (TREE_TYPE (vec_offset));
11229 697 : tree scale = size_int (SLP_TREE_GS_SCALE (slp_node));
11230 697 : tree ltype = build_aligned_type (TREE_TYPE (vectype), align);
11231 2825 : for (unsigned k = 0; k < const_nunits; ++k)
11232 : {
11233 2128 : tree boff = size_binop (MULT_EXPR, TYPE_SIZE (idx_type),
11234 : bitsize_int (k + elt_offset));
11235 6384 : tree idx = gimple_build (&stmts, BIT_FIELD_REF, idx_type,
11236 2128 : vec_offset, TYPE_SIZE (idx_type),
11237 : boff);
11238 2128 : idx = gimple_convert (&stmts, sizetype, idx);
11239 2128 : idx = gimple_build (&stmts, MULT_EXPR, sizetype, idx, scale);
11240 2128 : tree ptr = gimple_build (&stmts, PLUS_EXPR,
11241 2128 : TREE_TYPE (dataref_ptr),
11242 : dataref_ptr, idx);
11243 2128 : ptr = gimple_convert (&stmts, ptr_type_node, ptr);
11244 2128 : tree elt = make_ssa_name (TREE_TYPE (vectype));
11245 2128 : tree ref = build2 (MEM_REF, ltype, ptr,
11246 : build_int_cst (ref_type, 0));
11247 2128 : new_stmt = gimple_build_assign (elt, ref);
11248 4256 : gimple_set_vuse (new_stmt, gimple_vuse (gsi_stmt (*gsi)));
11249 2128 : gimple_seq_add_stmt (&stmts, new_stmt);
11250 2128 : CONSTRUCTOR_APPEND_ELT (ctor_elts, NULL_TREE, elt);
11251 : }
11252 697 : gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
11253 697 : new_stmt = gimple_build_assign (NULL_TREE,
11254 : build_constructor (vectype,
11255 : ctor_elts));
11256 697 : data_ref = NULL_TREE;
11257 : }
11258 :
11259 980 : vec_dest = vect_create_destination_var (scalar_dest, vectype);
11260 : /* DATA_REF is null if we've already built the statement. */
11261 980 : if (data_ref)
11262 : {
11263 : vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
11264 : new_stmt = gimple_build_assign (vec_dest, data_ref);
11265 : }
11266 1960 : new_temp = (need_zeroing
11267 980 : ? make_ssa_name (vectype)
11268 980 : : make_ssa_name (vec_dest, new_stmt));
11269 980 : gimple_set_lhs (new_stmt, new_temp);
11270 980 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
11271 :
11272 : /* If we need to explicitly zero inactive elements emit a
11273 : VEC_COND_EXPR that does so. */
11274 980 : if (need_zeroing)
11275 : {
11276 0 : vec_els = vect_get_mask_load_else (MASK_LOAD_ELSE_ZERO,
11277 : vectype);
11278 :
11279 0 : tree new_temp2 = make_ssa_name (vec_dest, new_stmt);
11280 0 : new_stmt = gimple_build_assign (new_temp2, VEC_COND_EXPR,
11281 : final_mask, new_temp, vec_els);
11282 0 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
11283 0 : new_temp = new_temp2;
11284 : }
11285 :
11286 980 : if (ls.ls_type)
11287 : {
11288 0 : new_stmt = gimple_build_assign (make_ssa_name
11289 : (original_vectype),
11290 : VIEW_CONVERT_EXPR,
11291 : build1 (VIEW_CONVERT_EXPR,
11292 : original_vectype,
11293 : new_temp));
11294 0 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
11295 : }
11296 :
11297 : /* Store vector loads in the corresponding SLP_NODE. */
11298 980 : if (!costing_p)
11299 : {
11300 980 : if (ls.slp_perm)
11301 0 : dr_chain.quick_push (gimple_assign_lhs (new_stmt));
11302 : else
11303 980 : slp_node->push_vec_def (new_stmt);
11304 : }
11305 : }
11306 :
11307 2970 : if (ls.slp_perm)
11308 : {
11309 0 : if (costing_p)
11310 : {
11311 0 : gcc_assert (ls.n_perms != -1U);
11312 0 : inside_cost += record_stmt_cost (cost_vec, ls.n_perms, vec_perm,
11313 : slp_node, 0, vect_body);
11314 : }
11315 : else
11316 : {
11317 0 : unsigned n_perms2;
11318 0 : vect_transform_slp_perm_load (vinfo, slp_node, dr_chain, gsi, vf,
11319 : false, &n_perms2);
11320 0 : gcc_assert (ls.n_perms == n_perms2);
11321 : }
11322 : }
11323 :
11324 2970 : if (costing_p)
11325 : {
11326 2207 : if (dump_enabled_p ())
11327 315 : dump_printf_loc (MSG_NOTE, vect_location,
11328 : "vect_model_load_cost: inside_cost = %u, "
11329 : "prologue_cost = %u .\n",
11330 : inside_cost, prologue_cost);
11331 2207 : SLP_TREE_TYPE (slp_node) = load_vec_info_type;
11332 2207 : slp_node->data = new vect_load_store_data (std::move (ls));
11333 : }
11334 2970 : return true;
11335 2970 : }
11336 :
11337 570897 : aggr_type = vectype;
11338 570897 : if (!costing_p)
11339 161649 : bump = vect_get_data_ptr_increment (vinfo, gsi, dr_info, aggr_type,
11340 : memory_access_type, loop_lens);
11341 :
11342 570897 : poly_uint64 group_elt = 0;
11343 570897 : unsigned int inside_cost = 0, prologue_cost = 0;
11344 : /* For costing some adjacent vector loads, we'd like to cost with
11345 : the total number of them once instead of cost each one by one. */
11346 570897 : unsigned int n_adjacent_loads = 0;
11347 :
11348 : /* 1. Create the vector or array pointer update chain. */
11349 570897 : if (!costing_p)
11350 : {
11351 161649 : bool simd_lane_access_p
11352 161649 : = STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) != 0;
11353 161649 : if (simd_lane_access_p
11354 1629 : && TREE_CODE (DR_BASE_ADDRESS (first_dr_info->dr)) == ADDR_EXPR
11355 1629 : && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (first_dr_info->dr), 0))
11356 1629 : && integer_zerop (get_dr_vinfo_offset (vinfo, first_dr_info))
11357 1629 : && integer_zerop (DR_INIT (first_dr_info->dr))
11358 1629 : && alias_sets_conflict_p (get_alias_set (aggr_type),
11359 1629 : get_alias_set (TREE_TYPE (ref_type)))
11360 161649 : && (alignment_support_scheme == dr_aligned
11361 1629 : || alignment_support_scheme == dr_unaligned_supported))
11362 : {
11363 1629 : dataref_ptr = unshare_expr (DR_BASE_ADDRESS (first_dr_info->dr));
11364 1629 : dataref_offset = build_int_cst (ref_type, 0);
11365 : }
11366 160020 : else if (diff_first_stmt_info)
11367 : {
11368 3564 : dataref_ptr
11369 3564 : = vect_create_data_ref_ptr (vinfo, first_stmt_info_for_drptr,
11370 : aggr_type, at_loop, offset, &dummy,
11371 : gsi, &ptr_incr, simd_lane_access_p,
11372 : bump);
11373 : /* Adjust the pointer by the difference to first_stmt. */
11374 3564 : data_reference_p ptrdr
11375 : = STMT_VINFO_DATA_REF (first_stmt_info_for_drptr);
11376 3564 : tree diff = fold_convert (sizetype,
11377 : size_binop (MINUS_EXPR,
11378 : DR_INIT (first_dr_info->dr),
11379 : DR_INIT (ptrdr)));
11380 3564 : dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
11381 : stmt_info, diff);
11382 3564 : if (alignment_support_scheme == dr_explicit_realign)
11383 : {
11384 0 : msq = vect_setup_realignment (vinfo, first_stmt_info_for_drptr,
11385 : vectype, gsi,
11386 : &realignment_token,
11387 : alignment_support_scheme,
11388 : dataref_ptr, &at_loop);
11389 0 : gcc_assert (!compute_in_loop);
11390 : }
11391 : }
11392 : else
11393 156456 : dataref_ptr
11394 156456 : = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type,
11395 : at_loop,
11396 : offset, &dummy, gsi, &ptr_incr,
11397 : simd_lane_access_p, bump);
11398 : }
11399 : else if (!costing_p)
11400 : {
11401 : gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
11402 : if (dataref_offset)
11403 : dataref_offset = int_const_binop (PLUS_EXPR, dataref_offset, bump);
11404 : else
11405 : dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
11406 : stmt_info, bump);
11407 : }
11408 :
11409 570897 : auto_vec<tree> dr_chain;
11410 570897 : if (grouped_load || ls.slp_perm)
11411 52994 : dr_chain.create (vec_num);
11412 :
11413 : gimple *new_stmt = NULL;
11414 1484289 : for (i = 0; i < vec_num; i++)
11415 : {
11416 913392 : tree final_mask = NULL_TREE;
11417 913392 : tree final_len = NULL_TREE;
11418 913392 : tree bias = NULL_TREE;
11419 :
11420 913392 : if (!costing_p)
11421 : {
11422 253908 : if (mask_node)
11423 709 : vec_mask = vec_masks[i];
11424 253908 : if (loop_masks)
11425 48 : final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
11426 : vec_num, vectype, i);
11427 253908 : if (vec_mask)
11428 709 : final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
11429 : final_mask, vec_mask, gsi);
11430 :
11431 253908 : if (i > 0)
11432 92259 : dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
11433 : gsi, stmt_info, bump);
11434 : }
11435 :
11436 : /* 2. Create the vector-load in the loop. */
11437 913392 : switch (alignment_support_scheme)
11438 : {
11439 913392 : case dr_aligned:
11440 913392 : case dr_unaligned_supported:
11441 913392 : {
11442 913392 : if (costing_p)
11443 : break;
11444 :
11445 253908 : unsigned int misalign;
11446 253908 : unsigned HOST_WIDE_INT align;
11447 253908 : align = known_alignment (DR_TARGET_ALIGNMENT (first_dr_info));
11448 253908 : if (alignment_support_scheme == dr_aligned)
11449 : misalign = 0;
11450 162864 : else if (misalignment == DR_MISALIGNMENT_UNKNOWN)
11451 : {
11452 123469 : align = dr_alignment (vect_dr_behavior (vinfo, first_dr_info));
11453 123469 : misalign = 0;
11454 : }
11455 : else
11456 39395 : misalign = misalignment;
11457 253908 : if (dataref_offset == NULL_TREE
11458 251781 : && TREE_CODE (dataref_ptr) == SSA_NAME)
11459 171229 : set_ptr_info_alignment (get_ptr_info (dataref_ptr), align,
11460 : misalign);
11461 253908 : align = least_bit_hwi (misalign | align);
11462 :
11463 : /* Compute IFN when LOOP_LENS or final_mask valid. */
11464 253908 : machine_mode vmode = TYPE_MODE (vectype);
11465 253908 : machine_mode new_vmode = vmode;
11466 253908 : internal_fn partial_ifn = IFN_LAST;
11467 253908 : if (loop_lens)
11468 : {
11469 0 : opt_machine_mode new_ovmode
11470 0 : = get_len_load_store_mode (vmode, true, &partial_ifn);
11471 0 : new_vmode = new_ovmode.require ();
11472 0 : unsigned factor
11473 0 : = (new_ovmode == vmode) ? 1 : GET_MODE_UNIT_SIZE (vmode);
11474 0 : final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
11475 : vec_num, vectype, i, factor, true);
11476 : }
11477 253908 : else if (final_mask)
11478 : {
11479 737 : if (!can_vec_mask_load_store_p (vmode,
11480 737 : TYPE_MODE
11481 : (TREE_TYPE (final_mask)),
11482 : true, &partial_ifn))
11483 0 : gcc_unreachable ();
11484 : }
11485 :
11486 253908 : if (partial_ifn == IFN_MASK_LEN_LOAD)
11487 : {
11488 0 : if (!final_len)
11489 : {
11490 : /* Pass VF value to 'len' argument of
11491 : MASK_LEN_LOAD if LOOP_LENS is invalid. */
11492 0 : final_len = size_int (TYPE_VECTOR_SUBPARTS (vectype));
11493 : }
11494 0 : if (!final_mask)
11495 : {
11496 : /* Pass all ones value to 'mask' argument of
11497 : MASK_LEN_LOAD if final_mask is invalid. */
11498 0 : mask_vectype = truth_type_for (vectype);
11499 0 : final_mask = build_minus_one_cst (mask_vectype);
11500 : }
11501 : }
11502 253908 : if (final_len)
11503 : {
11504 0 : signed char biasval
11505 0 : = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
11506 0 : bias = build_int_cst (intQI_type_node, biasval);
11507 : }
11508 :
11509 253908 : tree vec_els;
11510 :
11511 253908 : if (final_len)
11512 : {
11513 0 : tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
11514 0 : gcall *call;
11515 :
11516 : /* Need conversion if the vectype is punned by VnQI. */
11517 0 : els_vectype = vectype;
11518 0 : if (vmode != new_vmode)
11519 0 : els_vectype
11520 0 : = build_vector_type_for_mode (unsigned_intQI_type_node,
11521 : new_vmode);
11522 0 : vec_els = vect_get_mask_load_else (maskload_elsval,
11523 : els_vectype);
11524 :
11525 0 : if (partial_ifn == IFN_MASK_LEN_LOAD)
11526 : {
11527 0 : if (type_mode_padding_p
11528 0 : && maskload_elsval != MASK_LOAD_ELSE_ZERO)
11529 0 : need_zeroing = true;
11530 0 : call = gimple_build_call_internal (IFN_MASK_LEN_LOAD,
11531 : 6, dataref_ptr, ptr,
11532 : final_mask, vec_els,
11533 : final_len, bias);
11534 : }
11535 : else
11536 0 : call = gimple_build_call_internal (IFN_LEN_LOAD, 5,
11537 : dataref_ptr, ptr,
11538 : vec_els, final_len,
11539 : bias);
11540 0 : gimple_call_set_nothrow (call, true);
11541 0 : new_stmt = call;
11542 0 : data_ref = NULL_TREE;
11543 :
11544 : /* Need conversion if it's wrapped with VnQI. */
11545 0 : if (vmode != new_vmode)
11546 : {
11547 0 : tree new_vtype
11548 0 : = build_vector_type_for_mode (unsigned_intQI_type_node,
11549 : new_vmode);
11550 0 : tree var = vect_get_new_ssa_name (new_vtype,
11551 : vect_simple_var);
11552 0 : gimple_set_lhs (call, var);
11553 0 : vect_finish_stmt_generation (vinfo, stmt_info, call,
11554 : gsi);
11555 0 : tree op = build1 (VIEW_CONVERT_EXPR, vectype, var);
11556 0 : new_stmt = gimple_build_assign (vec_dest,
11557 : VIEW_CONVERT_EXPR, op);
11558 : }
11559 : }
11560 253908 : else if (final_mask)
11561 : {
11562 737 : tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
11563 737 : vec_els = vect_get_mask_load_else (maskload_elsval, vectype);
11564 737 : if (type_mode_padding_p
11565 737 : && maskload_elsval != MASK_LOAD_ELSE_ZERO)
11566 0 : need_zeroing = true;
11567 737 : gcall *call = gimple_build_call_internal (IFN_MASK_LOAD, 4,
11568 : dataref_ptr, ptr,
11569 : final_mask,
11570 : vec_els);
11571 737 : gimple_call_set_nothrow (call, true);
11572 737 : new_stmt = call;
11573 737 : data_ref = NULL_TREE;
11574 : }
11575 : else
11576 : {
11577 253171 : tree ltype = vectype;
11578 253171 : tree new_vtype = NULL_TREE;
11579 253171 : unsigned HOST_WIDE_INT gap = DR_GROUP_GAP (first_stmt_info);
11580 253171 : unsigned HOST_WIDE_INT dr_size
11581 253171 : = vect_get_scalar_dr_size (first_dr_info);
11582 253171 : poly_int64 off = 0;
11583 253171 : if (memory_access_type == VMAT_CONTIGUOUS_REVERSE)
11584 1445 : off = (TYPE_VECTOR_SUBPARTS (vectype) - 1) * -dr_size;
11585 253171 : unsigned int vect_align
11586 253171 : = vect_known_alignment_in_bytes (first_dr_info, vectype,
11587 253171 : off);
11588 : /* Try to use a single smaller load when we are about
11589 : to load excess elements compared to the unrolled
11590 : scalar loop. */
11591 253171 : if (known_gt ((i + 1) * nunits,
11592 : (group_size * vf - gap)))
11593 : {
11594 6949 : poly_uint64 remain = ((group_size * vf - gap) - i * nunits);
11595 6949 : if (known_ge ((i + 1) * nunits - (group_size * vf - gap),
11596 : nunits))
11597 : /* DR will be unused. */
11598 : ltype = NULL_TREE;
11599 2290 : else if (known_ge (vect_align,
11600 : tree_to_poly_uint64
11601 : (TYPE_SIZE_UNIT (vectype))))
11602 : /* Aligned access to excess elements is OK if
11603 : at least one element is accessed in the
11604 : scalar loop. */
11605 : ;
11606 1899 : else if (known_gt (vect_align,
11607 : ((nunits - remain) * dr_size)))
11608 : /* Aligned access to the gap area when there's
11609 : at least one element in it is OK. */
11610 : ;
11611 : else
11612 : {
11613 : /* remain should now be > 0 and < nunits. */
11614 1896 : unsigned num;
11615 1896 : if (known_ne (remain, 0u)
11616 1896 : && constant_multiple_p (nunits, remain, &num))
11617 : {
11618 1433 : tree ptype;
11619 1433 : new_vtype
11620 1433 : = vector_vector_composition_type (vectype, num,
11621 : &ptype);
11622 1433 : if (new_vtype)
11623 1433 : ltype = ptype;
11624 : }
11625 : /* Else use multiple loads or a masked load? */
11626 : /* For loop vectorization we now should have
11627 : an alternate type or LOOP_VINFO_PEELING_FOR_GAPS
11628 : set. */
11629 1896 : if (loop_vinfo)
11630 1645 : gcc_assert (new_vtype
11631 : || LOOP_VINFO_PEELING_FOR_GAPS
11632 : (loop_vinfo));
11633 : /* But still reduce the access size to the next
11634 : required power-of-two so peeling a single
11635 : scalar iteration is sufficient. */
11636 1896 : unsigned HOST_WIDE_INT cremain;
11637 1896 : if (remain.is_constant (&cremain))
11638 : {
11639 1896 : unsigned HOST_WIDE_INT cpart_size
11640 1896 : = 1 << ceil_log2 (cremain);
11641 1896 : if (known_gt (nunits, cpart_size)
11642 1896 : && constant_multiple_p (nunits, cpart_size,
11643 : &num))
11644 : {
11645 1445 : tree ptype;
11646 1445 : new_vtype
11647 2890 : = vector_vector_composition_type (vectype,
11648 1445 : num,
11649 : &ptype);
11650 1445 : if (new_vtype)
11651 1445 : ltype = ptype;
11652 : }
11653 : }
11654 : }
11655 : }
11656 253171 : tree offset = (dataref_offset ? dataref_offset
11657 251044 : : build_int_cst (ref_type, 0));
11658 253171 : if (!ltype)
11659 : ;
11660 248512 : else if (ltype != vectype
11661 248512 : && memory_access_type == VMAT_CONTIGUOUS_REVERSE)
11662 : {
11663 25 : poly_uint64 gap_offset
11664 25 : = (tree_to_poly_uint64 (TYPE_SIZE_UNIT (vectype))
11665 25 : - tree_to_poly_uint64 (TYPE_SIZE_UNIT (ltype)));
11666 25 : tree gapcst = build_int_cstu (ref_type, gap_offset);
11667 25 : offset = size_binop (PLUS_EXPR, offset, gapcst);
11668 : }
11669 253171 : if (ltype)
11670 : {
11671 248512 : data_ref = fold_build2 (MEM_REF, ltype,
11672 : dataref_ptr, offset);
11673 248512 : if (alignment_support_scheme == dr_aligned
11674 248512 : && align >= TYPE_ALIGN_UNIT (ltype))
11675 : ;
11676 : else
11677 161158 : TREE_TYPE (data_ref)
11678 322316 : = build_aligned_type (TREE_TYPE (data_ref),
11679 : align * BITS_PER_UNIT);
11680 : }
11681 253171 : if (!ltype)
11682 4659 : data_ref = build_constructor (vectype, NULL);
11683 248512 : else if (ltype != vectype)
11684 : {
11685 1445 : vect_copy_ref_info (data_ref,
11686 1445 : DR_REF (first_dr_info->dr));
11687 1445 : tree tem = make_ssa_name (ltype);
11688 1445 : new_stmt = gimple_build_assign (tem, data_ref);
11689 1445 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt,
11690 : gsi);
11691 1445 : data_ref = NULL;
11692 1445 : vec<constructor_elt, va_gc> *v;
11693 : /* We've computed 'num' above to statically two
11694 : or via constant_multiple_p. */
11695 1445 : unsigned num
11696 1445 : = (exact_div (tree_to_poly_uint64
11697 1445 : (TYPE_SIZE_UNIT (vectype)),
11698 : tree_to_poly_uint64
11699 1445 : (TYPE_SIZE_UNIT (ltype)))
11700 1445 : .to_constant ());
11701 1445 : vec_alloc (v, num);
11702 1445 : if (memory_access_type == VMAT_CONTIGUOUS_REVERSE)
11703 : {
11704 62 : while (--num)
11705 62 : CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
11706 : build_zero_cst (ltype));
11707 25 : CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem);
11708 : }
11709 : else
11710 : {
11711 1420 : CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem);
11712 1420 : while (--num)
11713 3194 : CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
11714 : build_zero_cst (ltype));
11715 : }
11716 1445 : gcc_assert (new_vtype != NULL_TREE);
11717 1445 : if (new_vtype == vectype)
11718 1413 : new_stmt
11719 1413 : = gimple_build_assign (vec_dest,
11720 : build_constructor (vectype, v));
11721 : else
11722 : {
11723 32 : tree new_vname = make_ssa_name (new_vtype);
11724 32 : new_stmt
11725 32 : = gimple_build_assign (new_vname,
11726 : build_constructor (new_vtype,
11727 : v));
11728 32 : vect_finish_stmt_generation (vinfo, stmt_info,
11729 : new_stmt, gsi);
11730 32 : new_stmt
11731 32 : = gimple_build_assign (vec_dest,
11732 : build1 (VIEW_CONVERT_EXPR,
11733 : vectype, new_vname));
11734 : }
11735 : }
11736 : }
11737 : break;
11738 : }
11739 0 : case dr_explicit_realign:
11740 0 : {
11741 0 : if (costing_p)
11742 : break;
11743 0 : tree ptr, bump;
11744 :
11745 0 : tree vs = size_int (TYPE_VECTOR_SUBPARTS (vectype));
11746 :
11747 0 : if (compute_in_loop)
11748 0 : msq = vect_setup_realignment (vinfo, first_stmt_info, vectype,
11749 : gsi, &realignment_token,
11750 : dr_explicit_realign,
11751 : dataref_ptr, NULL);
11752 :
11753 0 : if (TREE_CODE (dataref_ptr) == SSA_NAME)
11754 0 : ptr = copy_ssa_name (dataref_ptr);
11755 : else
11756 0 : ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
11757 : // For explicit realign the target alignment should be
11758 : // known at compile time.
11759 0 : unsigned HOST_WIDE_INT align
11760 0 : = DR_TARGET_ALIGNMENT (first_dr_info).to_constant ();
11761 0 : new_stmt = gimple_build_assign (ptr, BIT_AND_EXPR, dataref_ptr,
11762 : build_int_cst
11763 0 : (TREE_TYPE (dataref_ptr),
11764 0 : -(HOST_WIDE_INT) align));
11765 0 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
11766 0 : data_ref = build2 (MEM_REF, vectype,
11767 : ptr, build_int_cst (ref_type, 0));
11768 0 : vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
11769 0 : vec_dest = vect_create_destination_var (scalar_dest, vectype);
11770 0 : new_stmt = gimple_build_assign (vec_dest, data_ref);
11771 0 : new_temp = make_ssa_name (vec_dest, new_stmt);
11772 0 : gimple_assign_set_lhs (new_stmt, new_temp);
11773 0 : gimple_move_vops (new_stmt, stmt_info->stmt);
11774 0 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
11775 0 : msq = new_temp;
11776 :
11777 0 : bump = size_binop (MULT_EXPR, vs, TYPE_SIZE_UNIT (elem_type));
11778 0 : bump = size_binop (MINUS_EXPR, bump, size_one_node);
11779 0 : ptr = bump_vector_ptr (vinfo, dataref_ptr, NULL, gsi, stmt_info,
11780 : bump);
11781 0 : new_stmt = gimple_build_assign (NULL_TREE, BIT_AND_EXPR, ptr,
11782 0 : build_int_cst (TREE_TYPE (ptr),
11783 0 : -(HOST_WIDE_INT) align));
11784 0 : if (TREE_CODE (ptr) == SSA_NAME)
11785 0 : ptr = copy_ssa_name (ptr, new_stmt);
11786 : else
11787 0 : ptr = make_ssa_name (TREE_TYPE (ptr), new_stmt);
11788 0 : gimple_assign_set_lhs (new_stmt, ptr);
11789 0 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
11790 0 : data_ref = build2 (MEM_REF, vectype,
11791 : ptr, build_int_cst (ref_type, 0));
11792 0 : break;
11793 : }
11794 0 : case dr_explicit_realign_optimized:
11795 0 : {
11796 0 : if (costing_p)
11797 : break;
11798 0 : if (TREE_CODE (dataref_ptr) == SSA_NAME)
11799 0 : new_temp = copy_ssa_name (dataref_ptr);
11800 : else
11801 0 : new_temp = make_ssa_name (TREE_TYPE (dataref_ptr));
11802 : // We should only be doing this if we know the target
11803 : // alignment at compile time.
11804 0 : unsigned HOST_WIDE_INT align
11805 0 : = DR_TARGET_ALIGNMENT (first_dr_info).to_constant ();
11806 0 : new_stmt = gimple_build_assign (new_temp, BIT_AND_EXPR, dataref_ptr,
11807 0 : build_int_cst (TREE_TYPE (dataref_ptr),
11808 0 : -(HOST_WIDE_INT) align));
11809 0 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
11810 0 : data_ref = build2 (MEM_REF, vectype, new_temp,
11811 : build_int_cst (ref_type, 0));
11812 0 : break;
11813 : }
11814 0 : default:
11815 0 : gcc_unreachable ();
11816 : }
11817 :
11818 : /* One common place to cost the above vect load for different
11819 : alignment support schemes. */
11820 913392 : if (costing_p)
11821 : {
11822 : /* For the prologue cost for realign,
11823 : we only need to count it once for the whole group. */
11824 659484 : bool first_stmt_info_p = first_stmt_info == stmt_info;
11825 659484 : bool add_realign_cost = first_stmt_info_p && i == 0;
11826 659484 : if (memory_access_type == VMAT_CONTIGUOUS
11827 659484 : || memory_access_type == VMAT_CONTIGUOUS_REVERSE)
11828 : {
11829 : /* Leave realign cases alone to keep them simple. */
11830 659484 : if (alignment_support_scheme == dr_explicit_realign_optimized
11831 : || alignment_support_scheme == dr_explicit_realign)
11832 0 : vect_get_load_cost (vinfo, stmt_info, slp_node, 1,
11833 : alignment_support_scheme, misalignment,
11834 : add_realign_cost, &inside_cost,
11835 : &prologue_cost, cost_vec, cost_vec,
11836 : true);
11837 : else
11838 659484 : n_adjacent_loads++;
11839 : }
11840 : }
11841 : else
11842 : {
11843 253908 : vec_dest = vect_create_destination_var (scalar_dest, vectype);
11844 : /* DATA_REF is null if we've already built the statement. */
11845 253908 : if (data_ref)
11846 : {
11847 251726 : vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
11848 251726 : new_stmt = gimple_build_assign (vec_dest, data_ref);
11849 : }
11850 :
11851 507816 : new_temp = (need_zeroing
11852 253908 : ? make_ssa_name (vectype)
11853 253908 : : make_ssa_name (vec_dest, new_stmt));
11854 253908 : gimple_set_lhs (new_stmt, new_temp);
11855 253908 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
11856 :
11857 : /* If we need to explicitly zero inactive elements emit a
11858 : VEC_COND_EXPR that does so. */
11859 253908 : if (need_zeroing)
11860 : {
11861 0 : vec_els = vect_get_mask_load_else (MASK_LOAD_ELSE_ZERO,
11862 : vectype);
11863 :
11864 0 : tree new_temp2 = make_ssa_name (vec_dest, new_stmt);
11865 0 : new_stmt = gimple_build_assign (new_temp2, VEC_COND_EXPR,
11866 : final_mask, new_temp, vec_els);
11867 0 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt,
11868 : gsi);
11869 0 : new_temp = new_temp2;
11870 : }
11871 : }
11872 :
11873 : /* 3. Handle explicit realignment if necessary/supported.
11874 : Create in loop:
11875 : vec_dest = realign_load (msq, lsq, realignment_token) */
11876 913392 : if (!costing_p
11877 253908 : && (alignment_support_scheme == dr_explicit_realign_optimized
11878 : || alignment_support_scheme == dr_explicit_realign))
11879 : {
11880 0 : lsq = gimple_assign_lhs (new_stmt);
11881 0 : if (!realignment_token)
11882 0 : realignment_token = dataref_ptr;
11883 0 : vec_dest = vect_create_destination_var (scalar_dest, vectype);
11884 0 : new_stmt = gimple_build_assign (vec_dest, REALIGN_LOAD_EXPR, msq,
11885 : lsq, realignment_token);
11886 0 : new_temp = make_ssa_name (vec_dest, new_stmt);
11887 0 : gimple_assign_set_lhs (new_stmt, new_temp);
11888 0 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
11889 :
11890 0 : if (alignment_support_scheme == dr_explicit_realign_optimized)
11891 : {
11892 0 : gcc_assert (phi);
11893 0 : if (i == vec_num - 1)
11894 0 : add_phi_arg (phi, lsq, loop_latch_edge (containing_loop),
11895 : UNKNOWN_LOCATION);
11896 : msq = lsq;
11897 : }
11898 : }
11899 :
11900 913392 : if (memory_access_type == VMAT_CONTIGUOUS_REVERSE)
11901 : {
11902 5932 : if (costing_p)
11903 4487 : inside_cost = record_stmt_cost (cost_vec, 1, vec_perm,
11904 : slp_node, 0, vect_body);
11905 : else
11906 : {
11907 1445 : tree perm_mask = perm_mask_for_reverse (vectype);
11908 1445 : new_temp = permute_vec_elements (vinfo, new_temp, new_temp,
11909 : perm_mask, stmt_info, gsi);
11910 1445 : new_stmt = SSA_NAME_DEF_STMT (new_temp);
11911 : }
11912 : }
11913 :
11914 : /* Collect vector loads and later create their permutation in
11915 : vect_transform_slp_perm_load. */
11916 913392 : if (!costing_p && (grouped_load || ls.slp_perm))
11917 73503 : dr_chain.quick_push (new_temp);
11918 :
11919 : /* Store vector loads in the corresponding SLP_NODE. */
11920 253908 : if (!costing_p && !ls.slp_perm)
11921 180405 : slp_node->push_vec_def (new_stmt);
11922 :
11923 : /* With SLP permutation we load the gaps as well, without
11924 : we need to skip the gaps after we manage to fully load
11925 : all elements. group_gap_adj is DR_GROUP_SIZE here. */
11926 913392 : group_elt += nunits;
11927 913392 : if (!costing_p
11928 253908 : && maybe_ne (group_gap_adj, 0U)
11929 46164 : && !ls.slp_perm
11930 934677 : && known_eq (group_elt, group_size - group_gap_adj))
11931 : {
11932 16623 : poly_wide_int bump_val
11933 16623 : = (wi::to_wide (TYPE_SIZE_UNIT (elem_type)) * group_gap_adj);
11934 16623 : if (tree_int_cst_sgn (vect_dr_behavior (vinfo, dr_info)->step) == -1)
11935 0 : bump_val = -bump_val;
11936 16623 : tree bump = wide_int_to_tree (sizetype, bump_val);
11937 16623 : dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
11938 : stmt_info, bump);
11939 16623 : group_elt = 0;
11940 16623 : }
11941 : }
11942 : /* Bump the vector pointer to account for a gap or for excess
11943 : elements loaded for a permuted SLP load. */
11944 570897 : if (!costing_p
11945 161649 : && maybe_ne (group_gap_adj, 0U)
11946 587974 : && ls.slp_perm)
11947 : {
11948 454 : poly_wide_int bump_val
11949 454 : = (wi::to_wide (TYPE_SIZE_UNIT (elem_type)) * group_gap_adj);
11950 454 : if (tree_int_cst_sgn (vect_dr_behavior (vinfo, dr_info)->step) == -1)
11951 9 : bump_val = -bump_val;
11952 454 : tree bump = wide_int_to_tree (sizetype, bump_val);
11953 454 : dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
11954 : stmt_info, bump);
11955 454 : }
11956 :
11957 570897 : if (ls.slp_perm)
11958 : {
11959 : /* For SLP we know we've seen all possible uses of dr_chain so
11960 : direct vect_transform_slp_perm_load to DCE the unused parts.
11961 : ??? This is a hack to prevent compile-time issues as seen
11962 : in PR101120 and friends. */
11963 52994 : if (costing_p)
11964 : {
11965 35920 : gcc_assert (ls.n_perms != -1U && ls.n_loads != -1U);
11966 35920 : if (ls.n_perms != 0)
11967 35413 : inside_cost = record_stmt_cost (cost_vec, ls.n_perms, vec_perm,
11968 : slp_node, 0, vect_body);
11969 35920 : if (n_adjacent_loads > 0)
11970 35920 : n_adjacent_loads = ls.n_loads;
11971 : }
11972 : else
11973 : {
11974 17074 : unsigned n_perms2, n_loads2;
11975 17074 : bool ok = vect_transform_slp_perm_load (vinfo, slp_node, dr_chain,
11976 : gsi, vf, false, &n_perms2,
11977 : &n_loads2, true);
11978 17074 : gcc_assert (ok && ls.n_perms == n_perms2 && ls.n_loads == n_loads2);
11979 : }
11980 : }
11981 :
11982 570897 : if (costing_p)
11983 : {
11984 409248 : gcc_assert (memory_access_type == VMAT_CONTIGUOUS
11985 : || memory_access_type == VMAT_CONTIGUOUS_REVERSE);
11986 409248 : if (n_adjacent_loads > 0)
11987 409248 : vect_get_load_cost (vinfo, stmt_info, slp_node, n_adjacent_loads,
11988 : alignment_support_scheme, misalignment, false,
11989 : &inside_cost, &prologue_cost, cost_vec, cost_vec,
11990 : true);
11991 409248 : if (dump_enabled_p ())
11992 24108 : dump_printf_loc (MSG_NOTE, vect_location,
11993 : "vect_model_load_cost: inside_cost = %u, "
11994 : "prologue_cost = %u .\n",
11995 : inside_cost, prologue_cost);
11996 409248 : SLP_TREE_TYPE (slp_node) = load_vec_info_type;
11997 409248 : slp_node->data = new vect_load_store_data (std::move (ls));
11998 : }
11999 :
12000 570897 : return true;
12001 1852610 : }
12002 :
12003 : /* Function vect_is_simple_cond.
12004 :
12005 : Input:
12006 : COND - Condition that is checked for simple use.
12007 : VINFO, SLP_NODE - forwarded to vect_is_simple_use when checking operands.
12008 : VECTYPE - vector type used to pick *COMP_VECTYPE if no operand provides one.
12009 : Output:
12010 : *COMP_VECTYPE - the vector type for the comparison.
12011 : *DTS - The def types for the arguments of the comparison (DTS[0], and DTS[1] for a comparison's second operand).
12012 :
12013 : Returns whether a COND can be vectorized. Checks whether
12014 : condition operands are supportable using vect_is_simple_use. */
12015 :
12016 : static bool
12017 34899 : vect_is_simple_cond (tree cond, vec_info *vinfo,
12018 : slp_tree slp_node, tree *comp_vectype,
12019 : enum vect_def_type *dts, tree vectype)
12020 : {
12021 34899 : tree lhs, rhs;
12022 34899 : tree vectype1 = NULL_TREE, vectype2 = NULL_TREE;
12023 34899 : slp_tree slp_op;
12024 :
12025 : /* Mask case: COND is a scalar boolean SSA name.  It is accepted only if vect_is_simple_use succeeds and yields a boolean vector type. */
12026 34899 : if (TREE_CODE (cond) == SSA_NAME
12027 34899 : && VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (cond)))
12028 : {
12029 34887 : if (!vect_is_simple_use (vinfo, slp_node, 0, &cond,
12030 : &slp_op, &dts[0], comp_vectype)
12031 34887 : || !*comp_vectype
12032 69763 : || !VECTOR_BOOLEAN_TYPE_P (*comp_vectype))
12033 : return false;
12034 : return true;
12035 : }
12036 : /* Anything that is not a boolean SSA name must be an explicit comparison. */
12037 12 : if (!COMPARISON_CLASS_P (cond))
12038 : return false;
12039 :
12040 0 : lhs = TREE_OPERAND (cond, 0);
12041 0 : rhs = TREE_OPERAND (cond, 1);
12042 : /* Each operand must be an SSA name or an INTEGER_CST, REAL_CST or FIXED_CST. */
12043 0 : if (TREE_CODE (lhs) == SSA_NAME)
12044 : {
12045 0 : if (!vect_is_simple_use (vinfo, slp_node, 0,
12046 : &lhs, &slp_op, &dts[0], &vectype1))
12047 : return false;
12048 : }
12049 0 : else if (TREE_CODE (lhs) == INTEGER_CST || TREE_CODE (lhs) == REAL_CST
12050 0 : || TREE_CODE (lhs) == FIXED_CST)
12051 0 : dts[0] = vect_constant_def;
12052 : else
12053 : return false;
12054 :
12055 0 : if (TREE_CODE (rhs) == SSA_NAME)
12056 : {
12057 0 : if (!vect_is_simple_use (vinfo, slp_node, 1,
12058 : &rhs, &slp_op, &dts[1], &vectype2))
12059 : return false;
12060 : }
12061 0 : else if (TREE_CODE (rhs) == INTEGER_CST || TREE_CODE (rhs) == REAL_CST
12062 0 : || TREE_CODE (rhs) == FIXED_CST)
12063 0 : dts[1] = vect_constant_def;
12064 : else
12065 : return false;
12066 : /* Reject operand vector types whose number of subparts may differ. */
12067 0 : if (vectype1 && vectype2
12068 0 : && maybe_ne (TYPE_VECTOR_SUBPARTS (vectype1),
12069 0 : TYPE_VECTOR_SUBPARTS (vectype2)))
12070 0 : return false;
12071 : /* Prefer the vector type found for the first operand. */
12072 0 : *comp_vectype = vectype1 ? vectype1 : vectype2;
12073 : /* Invariant comparison: neither operand supplied a vector type. */
12074 0 : if (! *comp_vectype)
12075 : {
12076 0 : tree scalar_type = TREE_TYPE (lhs);
12077 0 : if (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type))
12078 0 : *comp_vectype = truth_type_for (vectype);
12079 : else
12080 : {
12081 : /* If we can widen the comparison to match vectype do so. */
12082 0 : if (INTEGRAL_TYPE_P (scalar_type)
12083 0 : && !slp_node
12084 0 : && tree_int_cst_lt (TYPE_SIZE (scalar_type),
12085 0 : TYPE_SIZE (TREE_TYPE (vectype))))
12086 0 : scalar_type = build_nonstandard_integer_type
12087 0 : (vector_element_bits (vectype), TYPE_UNSIGNED (scalar_type));
12088 0 : *comp_vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
12089 : slp_node);
12090 : }
12091 : }
12092 : /* NOTE(review): *COMP_VECTYPE is stored unchecked on this path; presumably callers must still test it for null -- confirm. */
12093 : return true;
12094 : }
12095 :
12096 : /* vectorizable_condition.
12097 :
12098 : Check if STMT_INFO is a conditional modify expression that can be vectorized.
12099 : If COST_VEC is passed, calculate costs but don't change anything,
12100 : otherwise, vectorize STMT_INFO: create a vectorized stmt using
12101 : VEC_COND_EXPR to replace it, and insert it at GSI.
12102 :
12103 : When STMT_INFO is vectorized as a nested cycle, for_reduction is true.
12104 :
12105 : Return true if STMT_INFO is vectorizable in this way. */
12106 :
12107 : static bool
12108 676283 : vectorizable_condition (vec_info *vinfo,
12109 : stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
12110 : slp_tree slp_node, stmt_vector_for_cost *cost_vec)
12111 : {
12112 676283 : tree scalar_dest = NULL_TREE;
12113 676283 : tree vec_dest = NULL_TREE;
12114 676283 : tree cond_expr, cond_expr0 = NULL_TREE, cond_expr1 = NULL_TREE;
12115 676283 : tree then_clause, else_clause;
12116 676283 : tree comp_vectype = NULL_TREE;
12117 676283 : tree vec_cond_lhs = NULL_TREE, vec_cond_rhs = NULL_TREE;
12118 676283 : tree vec_then_clause = NULL_TREE, vec_else_clause = NULL_TREE;
12119 676283 : tree vec_compare;
12120 676283 : tree new_temp;
12121 676283 : loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
12122 676283 : enum vect_def_type dts[4]
12123 : = {vect_unknown_def_type, vect_unknown_def_type,
12124 : vect_unknown_def_type, vect_unknown_def_type};
12125 676283 : enum tree_code code, cond_code, bitop1 = NOP_EXPR, bitop2 = NOP_EXPR;
12126 676283 : int i;
12127 676283 : bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
12128 676283 : vec<tree> vec_oprnds0 = vNULL;
12129 676283 : vec<tree> vec_oprnds1 = vNULL;
12130 676283 : vec<tree> vec_oprnds2 = vNULL;
12131 676283 : vec<tree> vec_oprnds3 = vNULL;
12132 676283 : tree vec_cmp_type;
12133 676283 : bool masked = false;
12134 :
12135 676283 : if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
12136 : return false;
12137 :
12138 : /* Is vectorizable conditional operation? */
12139 1020128 : gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
12140 378715 : if (!stmt)
12141 : return false;
12142 :
12143 378715 : code = gimple_assign_rhs_code (stmt);
12144 378715 : if (code != COND_EXPR)
12145 : return false;
12146 :
12147 34899 : int reduc_index = SLP_TREE_REDUC_IDX (slp_node);
12148 34899 : vect_reduction_type reduction_type = TREE_CODE_REDUCTION;
12149 34899 : bool nested_cycle_p = false;
12150 34899 : bool for_reduction = vect_is_reduction (stmt_info);
12151 34899 : if (for_reduction)
12152 : {
12153 614 : if (SLP_TREE_LANES (slp_node) > 1)
12154 : return false;
12155 : /* ??? With a reduction path we do not get at the reduction info from
12156 : every stmt, use the conservative default setting then. */
12157 694 : if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
12158 : {
12159 596 : vect_reduc_info reduc_info
12160 596 : = info_for_reduction (loop_vinfo, slp_node);
12161 596 : reduction_type = VECT_REDUC_INFO_TYPE (reduc_info);
12162 596 : nested_cycle_p = nested_in_vect_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
12163 : stmt_info);
12164 : }
12165 : }
12166 : else
12167 : {
12168 34285 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
12169 : return false;
12170 : }
12171 :
12172 34899 : tree vectype = SLP_TREE_VECTYPE (slp_node);
12173 34899 : tree vectype1 = NULL_TREE, vectype2 = NULL_TREE;
12174 :
12175 34899 : int vec_num = vect_get_num_copies (vinfo, slp_node);
12176 :
12177 34899 : cond_expr = gimple_assign_rhs1 (stmt);
12178 34899 : gcc_assert (! COMPARISON_CLASS_P (cond_expr));
12179 :
12180 34899 : if (!vect_is_simple_cond (cond_expr, vinfo, slp_node,
12181 : &comp_vectype, &dts[0], vectype)
12182 34899 : || !comp_vectype)
12183 : return false;
12184 :
12185 34876 : unsigned op_adjust = COMPARISON_CLASS_P (cond_expr) ? 1 : 0;
12186 34876 : slp_tree then_slp_node, else_slp_node;
12187 34876 : if (!vect_is_simple_use (vinfo, slp_node, 1 + op_adjust,
12188 : &then_clause, &then_slp_node, &dts[2], &vectype1))
12189 : return false;
12190 34876 : if (!vect_is_simple_use (vinfo, slp_node, 2 + op_adjust,
12191 : &else_clause, &else_slp_node, &dts[3], &vectype2))
12192 : return false;
12193 :
12194 34876 : if (vectype1 && !useless_type_conversion_p (vectype, vectype1))
12195 : return false;
12196 :
12197 34876 : if (vectype2 && !useless_type_conversion_p (vectype, vectype2))
12198 : return false;
12199 :
12200 34876 : masked = !COMPARISON_CLASS_P (cond_expr);
12201 34876 : vec_cmp_type = truth_type_for (comp_vectype);
12202 34876 : if (vec_cmp_type == NULL_TREE
12203 69752 : || maybe_ne (TYPE_VECTOR_SUBPARTS (vectype),
12204 34876 : TYPE_VECTOR_SUBPARTS (vec_cmp_type)))
12205 0 : return false;
12206 :
12207 34876 : cond_code = TREE_CODE (cond_expr);
12208 34876 : if (!masked)
12209 : {
12210 0 : cond_expr0 = TREE_OPERAND (cond_expr, 0);
12211 0 : cond_expr1 = TREE_OPERAND (cond_expr, 1);
12212 : }
12213 :
12214 : /* For conditional reductions, the "then" value needs to be the candidate
12215 : value calculated by this iteration while the "else" value needs to be
12216 : the result carried over from previous iterations. If the COND_EXPR
12217 : is the other way around, we need to swap it. */
12218 34876 : bool must_invert_cmp_result = false;
12219 34876 : if (reduction_type == EXTRACT_LAST_REDUCTION && reduc_index == 1)
12220 : {
12221 0 : if (masked)
12222 0 : must_invert_cmp_result = true;
12223 : else
12224 : {
12225 0 : bool honor_nans = HONOR_NANS (TREE_TYPE (cond_expr0));
12226 0 : tree_code new_code = invert_tree_comparison (cond_code, honor_nans);
12227 0 : if (new_code == ERROR_MARK)
12228 : must_invert_cmp_result = true;
12229 : else
12230 : {
12231 0 : cond_code = new_code;
12232 : /* Make sure we don't accidentally use the old condition. */
12233 0 : cond_expr = NULL_TREE;
12234 : }
12235 : }
12236 : /* ??? The vectorized operand query below doesn't allow swapping
12237 : this way for SLP. */
12238 0 : return false;
12239 : /* std::swap (then_clause, else_clause); */
12240 : }
12241 :
12242 34876 : if (!masked && VECTOR_BOOLEAN_TYPE_P (comp_vectype))
12243 : {
12244 : /* Boolean values may have another representation in vectors
12245 : and therefore we prefer bit operations over comparison for
12246 : them (which also works for scalar masks). We store opcodes
12247 : to use in bitop1 and bitop2. Statement is vectorized as
12248 : BITOP2 (rhs1 BITOP1 rhs2) or rhs1 BITOP2 (BITOP1 rhs2)
12249 : depending on bitop1 and bitop2 arity. */
12250 0 : switch (cond_code)
12251 : {
12252 : case GT_EXPR:
12253 : bitop1 = BIT_NOT_EXPR;
12254 : bitop2 = BIT_AND_EXPR;
12255 : break;
12256 0 : case GE_EXPR:
12257 0 : bitop1 = BIT_NOT_EXPR;
12258 0 : bitop2 = BIT_IOR_EXPR;
12259 0 : break;
12260 0 : case LT_EXPR:
12261 0 : bitop1 = BIT_NOT_EXPR;
12262 0 : bitop2 = BIT_AND_EXPR;
12263 0 : std::swap (cond_expr0, cond_expr1);
12264 0 : break;
12265 0 : case LE_EXPR:
12266 0 : bitop1 = BIT_NOT_EXPR;
12267 0 : bitop2 = BIT_IOR_EXPR;
12268 0 : std::swap (cond_expr0, cond_expr1);
12269 0 : break;
12270 0 : case NE_EXPR:
12271 0 : bitop1 = BIT_XOR_EXPR;
12272 0 : break;
12273 0 : case EQ_EXPR:
12274 0 : bitop1 = BIT_XOR_EXPR;
12275 0 : bitop2 = BIT_NOT_EXPR;
12276 0 : break;
12277 : default:
12278 : return false;
12279 : }
12280 : cond_code = SSA_NAME;
12281 : }
12282 :
12283 34876 : if (TREE_CODE_CLASS (cond_code) == tcc_comparison
12284 0 : && reduction_type == EXTRACT_LAST_REDUCTION
12285 34876 : && !expand_vec_cmp_expr_p (comp_vectype, vec_cmp_type, cond_code))
12286 : {
12287 0 : if (dump_enabled_p ())
12288 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
12289 : "reduction comparison operation not supported.\n");
12290 0 : return false;
12291 : }
12292 :
12293 34876 : if (cost_vec)
12294 : {
12295 26326 : if (bitop1 != NOP_EXPR)
12296 : {
12297 0 : machine_mode mode = TYPE_MODE (comp_vectype);
12298 0 : optab optab;
12299 :
12300 0 : optab = optab_for_tree_code (bitop1, comp_vectype, optab_default);
12301 0 : if (!optab || !can_implement_p (optab, mode))
12302 0 : return false;
12303 :
12304 0 : if (bitop2 != NOP_EXPR)
12305 : {
12306 0 : optab = optab_for_tree_code (bitop2, comp_vectype,
12307 : optab_default);
12308 0 : if (!optab || !can_implement_p (optab, mode))
12309 0 : return false;
12310 : }
12311 : }
12312 :
12313 26326 : vect_cost_for_stmt kind = vector_stmt;
12314 26326 : if (reduction_type == EXTRACT_LAST_REDUCTION)
12315 : /* Count one reduction-like operation per vector. */
12316 : kind = vec_to_scalar;
12317 26326 : else if ((masked && !expand_vec_cond_expr_p (vectype, comp_vectype))
12318 26326 : || (!masked
12319 0 : && (!expand_vec_cmp_expr_p (comp_vectype, vec_cmp_type,
12320 : cond_code)
12321 0 : || !expand_vec_cond_expr_p (vectype, vec_cmp_type))))
12322 6 : return false;
12323 :
12324 26320 : if (!vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
12325 : comp_vectype)
12326 26320 : || (op_adjust == 1
12327 0 : && !vect_maybe_update_slp_op_vectype
12328 0 : (SLP_TREE_CHILDREN (slp_node)[1], comp_vectype))
12329 26320 : || !vect_maybe_update_slp_op_vectype (then_slp_node, vectype)
12330 52640 : || !vect_maybe_update_slp_op_vectype (else_slp_node, vectype))
12331 : {
12332 0 : if (dump_enabled_p ())
12333 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
12334 : "incompatible vector types for invariants\n");
12335 0 : return false;
12336 : }
12337 :
12338 26320 : if (loop_vinfo && for_reduction
12339 447 : && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
12340 : {
12341 68 : if (reduction_type == EXTRACT_LAST_REDUCTION)
12342 : {
12343 0 : if (direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
12344 : vectype, OPTIMIZE_FOR_SPEED))
12345 0 : vect_record_loop_len (loop_vinfo,
12346 : &LOOP_VINFO_LENS (loop_vinfo),
12347 : vec_num, vectype, 1);
12348 : else
12349 0 : vect_record_loop_mask (loop_vinfo,
12350 : &LOOP_VINFO_MASKS (loop_vinfo),
12351 : vec_num, vectype, NULL);
12352 : }
12353 : /* Extra inactive lanes should be safe for vect_nested_cycle. */
12354 68 : else if (!nested_cycle_p)
12355 : {
12356 68 : if (dump_enabled_p ())
12357 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
12358 : "conditional reduction prevents the use"
12359 : " of partial vectors.\n");
12360 68 : LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
12361 : }
12362 : }
12363 :
12364 26320 : SLP_TREE_TYPE (slp_node) = condition_vec_info_type;
12365 26320 : vect_model_simple_cost (vinfo, 1, slp_node, cost_vec, kind);
12366 26320 : return true;
12367 : }
12368 :
12369 : /* Transform. */
12370 :
12371 : /* Handle def. */
12372 8550 : scalar_dest = gimple_assign_lhs (stmt);
12373 8550 : if (reduction_type != EXTRACT_LAST_REDUCTION)
12374 8550 : vec_dest = vect_create_destination_var (scalar_dest, vectype);
12375 :
12376 8550 : bool swap_cond_operands = false;
12377 :
12378 : /* See whether another part of the vectorized code applies a loop
12379 : mask to the condition, or to its inverse. */
12380 :
12381 8550 : vec_loop_masks *masks = NULL;
12382 8550 : vec_loop_lens *lens = NULL;
12383 8550 : if (loop_vinfo && LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
12384 : {
12385 0 : if (reduction_type == EXTRACT_LAST_REDUCTION)
12386 0 : lens = &LOOP_VINFO_LENS (loop_vinfo);
12387 : }
12388 8550 : else if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
12389 : {
12390 3 : if (reduction_type == EXTRACT_LAST_REDUCTION)
12391 0 : masks = &LOOP_VINFO_MASKS (loop_vinfo);
12392 : else
12393 : {
12394 3 : scalar_cond_masked_key cond (cond_expr, 1);
12395 3 : if (loop_vinfo->scalar_cond_masked_set.contains (cond))
12396 0 : masks = &LOOP_VINFO_MASKS (loop_vinfo);
12397 : else
12398 : {
12399 3 : bool honor_nans = HONOR_NANS (TREE_TYPE (cond.op0));
12400 3 : tree_code orig_code = cond.code;
12401 3 : cond.code = invert_tree_comparison (cond.code, honor_nans);
12402 3 : if (!masked && loop_vinfo->scalar_cond_masked_set.contains (cond))
12403 : {
12404 0 : masks = &LOOP_VINFO_MASKS (loop_vinfo);
12405 0 : cond_code = cond.code;
12406 0 : swap_cond_operands = true;
12407 : }
12408 : else
12409 : {
12410 : /* Try the inverse of the current mask. We check if the
12411 : inverse mask is live and if so we generate a negate of
12412 : the current mask such that we still honor NaNs. */
12413 3 : cond.inverted_p = true;
12414 3 : cond.code = orig_code;
12415 3 : if (loop_vinfo->scalar_cond_masked_set.contains (cond))
12416 : {
12417 0 : masks = &LOOP_VINFO_MASKS (loop_vinfo);
12418 0 : cond_code = cond.code;
12419 0 : swap_cond_operands = true;
12420 0 : must_invert_cmp_result = true;
12421 : }
12422 : }
12423 : }
12424 : }
12425 : }
12426 :
12427 : /* Handle cond expr. */
12428 8550 : if (masked)
12429 8550 : vect_get_vec_defs (vinfo, slp_node,
12430 : cond_expr, &vec_oprnds0,
12431 : then_clause, &vec_oprnds2,
12432 : reduction_type != EXTRACT_LAST_REDUCTION
12433 : ? else_clause : NULL, &vec_oprnds3);
12434 : else
12435 0 : vect_get_vec_defs (vinfo, slp_node,
12436 : cond_expr0, &vec_oprnds0,
12437 : cond_expr1, &vec_oprnds1,
12438 : then_clause, &vec_oprnds2,
12439 : reduction_type != EXTRACT_LAST_REDUCTION
12440 : ? else_clause : NULL, &vec_oprnds3);
12441 :
12442 8550 : if (reduction_type == EXTRACT_LAST_REDUCTION)
12443 0 : vec_else_clause = else_clause;
12444 :
12445 : /* Arguments are ready. Create the new vector stmt. */
12446 20098 : FOR_EACH_VEC_ELT (vec_oprnds0, i, vec_cond_lhs)
12447 : {
12448 11548 : vec_then_clause = vec_oprnds2[i];
12449 11548 : if (reduction_type != EXTRACT_LAST_REDUCTION)
12450 11548 : vec_else_clause = vec_oprnds3[i];
12451 :
12452 11548 : if (swap_cond_operands)
12453 0 : std::swap (vec_then_clause, vec_else_clause);
12454 :
12455 11548 : if (masked)
12456 : vec_compare = vec_cond_lhs;
12457 : else
12458 : {
12459 0 : vec_cond_rhs = vec_oprnds1[i];
12460 0 : if (bitop1 == NOP_EXPR)
12461 : {
12462 0 : gimple_seq stmts = NULL;
12463 0 : vec_compare = gimple_build (&stmts, cond_code, vec_cmp_type,
12464 : vec_cond_lhs, vec_cond_rhs);
12465 0 : gsi_insert_before (gsi, stmts, GSI_SAME_STMT);
12466 : }
12467 : else
12468 : {
12469 0 : new_temp = make_ssa_name (vec_cmp_type);
12470 0 : gassign *new_stmt;
12471 0 : if (bitop1 == BIT_NOT_EXPR)
12472 0 : new_stmt = gimple_build_assign (new_temp, bitop1,
12473 : vec_cond_rhs);
12474 : else
12475 0 : new_stmt
12476 0 : = gimple_build_assign (new_temp, bitop1, vec_cond_lhs,
12477 : vec_cond_rhs);
12478 0 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
12479 0 : if (bitop2 == NOP_EXPR)
12480 : vec_compare = new_temp;
12481 0 : else if (bitop2 == BIT_NOT_EXPR
12482 0 : && reduction_type != EXTRACT_LAST_REDUCTION)
12483 : {
12484 : /* Instead of doing ~x ? y : z do x ? z : y. */
12485 : vec_compare = new_temp;
12486 : std::swap (vec_then_clause, vec_else_clause);
12487 : }
12488 : else
12489 : {
12490 0 : vec_compare = make_ssa_name (vec_cmp_type);
12491 0 : if (bitop2 == BIT_NOT_EXPR)
12492 0 : new_stmt
12493 0 : = gimple_build_assign (vec_compare, bitop2, new_temp);
12494 : else
12495 0 : new_stmt
12496 0 : = gimple_build_assign (vec_compare, bitop2,
12497 : vec_cond_lhs, new_temp);
12498 0 : vect_finish_stmt_generation (vinfo, stmt_info,
12499 : new_stmt, gsi);
12500 : }
12501 : }
12502 : }
12503 :
12504 : /* If we decided to apply a loop mask to the result of the vector
12505 : comparison, AND the comparison with the mask now. Later passes
12506 : should then be able to reuse the AND results between multiple
12507 : vector statements.
12508 :
12509 : For example:
12510 : for (int i = 0; i < 100; ++i)
12511 : x[i] = y[i] ? z[i] : 10;
12512 :
12513 : results in following optimized GIMPLE:
12514 :
12515 : mask__35.8_43 = vect__4.7_41 != { 0, ... };
12516 : vec_mask_and_46 = loop_mask_40 & mask__35.8_43;
12517 : _19 = &MEM[base: z_12(D), index: ivtmp_56, step: 4, offset: 0B];
12518 : vect_iftmp.11_47 = .MASK_LOAD (_19, 4B, vec_mask_and_46);
12519 : vect_iftmp.12_52 = VEC_COND_EXPR <vec_mask_and_46,
12520 : vect_iftmp.11_47, { 10, ... }>;
12521 :
12522 : instead of using a masked and unmasked forms of
12523 : vec != { 0, ... } (masked in the MASK_LOAD,
12524 : unmasked in the VEC_COND_EXPR). */
12525 :
12526 : /* Force vec_compare to be an SSA_NAME rather than a comparison,
12527 : in cases where that's necessary. */
12528 :
12529 11548 : tree len = NULL_TREE, bias = NULL_TREE;
12530 11548 : if (masks || lens || reduction_type == EXTRACT_LAST_REDUCTION)
12531 : {
12532 0 : if (!is_gimple_val (vec_compare))
12533 : {
12534 0 : tree vec_compare_name = make_ssa_name (vec_cmp_type);
12535 0 : gassign *new_stmt = gimple_build_assign (vec_compare_name,
12536 : vec_compare);
12537 0 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
12538 0 : vec_compare = vec_compare_name;
12539 : }
12540 :
12541 0 : if (must_invert_cmp_result)
12542 : {
12543 0 : tree vec_compare_name = make_ssa_name (vec_cmp_type);
12544 0 : gassign *new_stmt = gimple_build_assign (vec_compare_name,
12545 : BIT_NOT_EXPR,
12546 : vec_compare);
12547 0 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
12548 0 : vec_compare = vec_compare_name;
12549 : }
12550 :
12551 0 : if (direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
12552 : vectype, OPTIMIZE_FOR_SPEED))
12553 : {
12554 0 : if (lens)
12555 : {
12556 : /* ??? Do we really want the adjusted LEN here? Isn't this
12557 : based on number of elements? */
12558 0 : len = vect_get_loop_len (loop_vinfo, gsi, lens,
12559 : vec_num, vectype, i, 1, true);
12560 0 : signed char biasval
12561 0 : = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
12562 0 : bias = build_int_cst (intQI_type_node, biasval);
12563 : }
12564 : else
12565 : {
12566 0 : len = size_int (TYPE_VECTOR_SUBPARTS (vectype));
12567 0 : bias = build_int_cst (intQI_type_node, 0);
12568 : }
12569 : }
12570 0 : if (masks)
12571 : {
12572 0 : tree loop_mask
12573 0 : = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num,
12574 : vectype, i);
12575 0 : tree tmp2 = make_ssa_name (vec_cmp_type);
12576 0 : gassign *g
12577 0 : = gimple_build_assign (tmp2, BIT_AND_EXPR, vec_compare,
12578 : loop_mask);
12579 0 : vect_finish_stmt_generation (vinfo, stmt_info, g, gsi);
12580 0 : vec_compare = tmp2;
12581 : }
12582 : }
12583 :
12584 0 : gimple *new_stmt;
12585 0 : if (reduction_type == EXTRACT_LAST_REDUCTION)
12586 : {
12587 0 : gimple *old_stmt = vect_orig_stmt (stmt_info)->stmt;
12588 0 : tree lhs = gimple_get_lhs (old_stmt);
12589 0 : if ((unsigned)i != vec_oprnds0.length () - 1)
12590 0 : lhs = copy_ssa_name (lhs);
12591 0 : if (len)
12592 0 : new_stmt = gimple_build_call_internal
12593 0 : (IFN_LEN_FOLD_EXTRACT_LAST, 5, vec_else_clause, vec_compare,
12594 : vec_then_clause, len, bias);
12595 : else
12596 0 : new_stmt = gimple_build_call_internal
12597 0 : (IFN_FOLD_EXTRACT_LAST, 3, vec_else_clause, vec_compare,
12598 : vec_then_clause);
12599 0 : gimple_call_set_lhs (new_stmt, lhs);
12600 0 : SSA_NAME_DEF_STMT (lhs) = new_stmt;
12601 0 : if ((unsigned)i != vec_oprnds0.length () - 1)
12602 : {
12603 0 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
12604 0 : vec_else_clause = lhs;
12605 : }
12606 0 : else if (old_stmt == gsi_stmt (*gsi))
12607 0 : vect_finish_replace_stmt (vinfo, stmt_info, new_stmt);
12608 : else
12609 : {
12610 : /* In this case we're moving the definition to later in the
12611 : block. That doesn't matter because the only uses of the
12612 : lhs are in phi statements. */
12613 0 : gimple_stmt_iterator old_gsi = gsi_for_stmt (old_stmt);
12614 0 : gsi_remove (&old_gsi, true);
12615 0 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
12616 : }
12617 : }
12618 : else
12619 : {
12620 11548 : new_temp = make_ssa_name (vec_dest);
12621 11548 : new_stmt = gimple_build_assign (new_temp, VEC_COND_EXPR, vec_compare,
12622 : vec_then_clause, vec_else_clause);
12623 11548 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
12624 : }
12625 11548 : slp_node->push_vec_def (new_stmt);
12626 : }
12627 :
12628 8550 : vec_oprnds0.release ();
12629 8550 : vec_oprnds1.release ();
12630 8550 : vec_oprnds2.release ();
12631 8550 : vec_oprnds3.release ();
12632 :
12633 8550 : return true;
12634 : }
12635 :
12636 : /* Helper of vectorizable_comparison (also used by vectorizable_early_exit when analyzing).
12637 :
12638 : Check if STMT_INFO is comparison expression CODE that can be vectorized.
12639 : If COST_VEC is passed, calculate costs but don't change anything,
12640 : otherwise, vectorize STMT_INFO: create a vectorized comparison, and insert
12641 : it at GSI.
12642 : VECTYPE is the type of the result and must be a boolean vector type. The compared operands are operands 0 and 1 of SLP_NODE.
12643 : Return true if STMT_INFO is vectorizable in this way. */
12644 :
12645 : static bool
12646 352209 : vectorizable_comparison_1 (vec_info *vinfo, tree vectype,
12647 : stmt_vec_info stmt_info, tree_code code,
12648 : gimple_stmt_iterator *gsi,
12649 : slp_tree slp_node, stmt_vector_for_cost *cost_vec)
12650 : {
12651 352209 : tree lhs, rhs1, rhs2;
12652 352209 : tree vectype1 = NULL_TREE, vectype2 = NULL_TREE;
12653 352209 : tree vec_rhs1 = NULL_TREE, vec_rhs2 = NULL_TREE;
12654 352209 : tree new_temp;
12655 352209 : enum vect_def_type dts[2] = {vect_unknown_def_type, vect_unknown_def_type};
12656 352209 : poly_uint64 nunits;
12657 352209 : enum tree_code bitop1 = NOP_EXPR, bitop2 = NOP_EXPR;
12658 352209 : int i;
12659 352209 : bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
12660 352209 : vec<tree> vec_oprnds0 = vNULL;
12661 352209 : vec<tree> vec_oprnds1 = vNULL;
12662 352209 : tree mask_type;
12663 352209 : tree mask = NULL_TREE;
12664 :
12665 352209 : if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo) /* Irrelevant stmts are only handled for BB vectorization. */
12666 : return false;
12667 :
12668 352209 : if (!vectype || !VECTOR_BOOLEAN_TYPE_P (vectype)) /* The result must be a boolean vector. */
12669 : return false;
12670 :
12671 159826 : mask_type = vectype;
12672 159826 : nunits = TYPE_VECTOR_SUBPARTS (vectype);
12673 :
12674 159826 : if (TREE_CODE_CLASS (code) != tcc_comparison) /* Only comparison codes are handled. */
12675 : return false;
12676 :
12677 158068 : slp_tree slp_rhs1, slp_rhs2;
12678 158068 : if (!vect_is_simple_use (vinfo, slp_node,
12679 : 0, &rhs1, &slp_rhs1, &dts[0], &vectype1))
12680 : return false;
12681 :
12682 158068 : if (!vect_is_simple_use (vinfo, slp_node,
12683 : 1, &rhs2, &slp_rhs2, &dts[1], &vectype2))
12684 : return false;
12685 :
12686 122812 : if (vectype1 && vectype2 /* Both operands must have the same number of elements. */
12687 231035 : && maybe_ne (TYPE_VECTOR_SUBPARTS (vectype1),
12688 72967 : TYPE_VECTOR_SUBPARTS (vectype2)))
12689 16 : return false;
12690 :
12691 158052 : vectype = vectype1 ? vectype1 : vectype2;
12692 :
12693 : /* Invariant comparison: neither operand has a vector type, so derive one from the scalar type of RHS1. */
12694 158052 : if (!vectype)
12695 : {
12696 30407 : vectype = get_vectype_for_scalar_type (vinfo, TREE_TYPE (rhs1), slp_node);
12697 30407 : if (!vectype || maybe_ne (TYPE_VECTOR_SUBPARTS (vectype), nunits))
12698 7 : return false;
12699 : }
12700 127645 : else if (maybe_ne (nunits, TYPE_VECTOR_SUBPARTS (vectype))) /* The operands must have as many elements as the result. */
12701 : return false;
12702 :
12703 : /* Can't compare mask and non-mask types. */
12704 122796 : if (vectype1 && vectype2
12705 376354 : && (VECTOR_BOOLEAN_TYPE_P (vectype1) ^ VECTOR_BOOLEAN_TYPE_P (vectype2)))
12706 : return false;
12707 :
12708 : /* Boolean values may have another representation in vectors
12709 : and therefore we prefer bit operations over comparison for
12710 : them (which also works for scalar masks). We store opcodes
12711 : to use in bitop1 and bitop2. Statement is vectorized as
12712 : BITOP2 (rhs1 BITOP1 rhs2) or
12713 : rhs1 BITOP2 (BITOP1 rhs2)
12714 : depending on bitop1 and bitop2 arity. */
12715 158037 : bool swap_p = false; /* Whether to swap the operands (LT/LE are handled as GT/GE). */
12716 158037 : if (VECTOR_BOOLEAN_TYPE_P (vectype))
12717 : {
12718 654 : if (code == GT_EXPR) /* rhs1 & ~rhs2 */
12719 : {
12720 : bitop1 = BIT_NOT_EXPR;
12721 : bitop2 = BIT_AND_EXPR;
12722 : }
12723 : else if (code == GE_EXPR) /* rhs1 | ~rhs2 */
12724 : {
12725 : bitop1 = BIT_NOT_EXPR;
12726 : bitop2 = BIT_IOR_EXPR;
12727 : }
12728 : else if (code == LT_EXPR) /* rhs2 & ~rhs1 */
12729 : {
12730 : bitop1 = BIT_NOT_EXPR;
12731 : bitop2 = BIT_AND_EXPR;
12732 : swap_p = true;
12733 : }
12734 : else if (code == LE_EXPR) /* rhs2 | ~rhs1 */
12735 : {
12736 : bitop1 = BIT_NOT_EXPR;
12737 : bitop2 = BIT_IOR_EXPR;
12738 : swap_p = true;
12739 : }
12740 : else /* rhs1 ^ rhs2, inverted for EQ_EXPR */
12741 : {
12742 : bitop1 = BIT_XOR_EXPR;
12743 : if (code == EQ_EXPR)
12744 : bitop2 = BIT_NOT_EXPR;
12745 : }
12746 : }
12747 :
12748 158037 : if (cost_vec) /* Analysis only. */
12749 : {
12750 145536 : if (bitop1 == NOP_EXPR) /* Plain comparison; otherwise check the optabs of the bit operations. */
12751 : {
12752 145014 : if (!expand_vec_cmp_expr_p (vectype, mask_type, code))
12753 : return false;
12754 : }
12755 : else
12756 : {
12757 522 : machine_mode mode = TYPE_MODE (vectype);
12758 522 : optab optab;
12759 :
12760 522 : optab = optab_for_tree_code (bitop1, vectype, optab_default);
12761 522 : if (!optab || !can_implement_p (optab, mode))
12762 0 : return false;
12763 :
12764 522 : if (bitop2 != NOP_EXPR)
12765 : {
12766 91 : optab = optab_for_tree_code (bitop2, vectype, optab_default);
12767 91 : if (!optab || !can_implement_p (optab, mode))
12768 0 : return false;
12769 : }
12770 : }
12771 :
12772 : /* Put types on constant and invariant SLP children. */
12773 137571 : if (!vect_maybe_update_slp_op_vectype (slp_rhs1, vectype)
12774 137571 : || !vect_maybe_update_slp_op_vectype (slp_rhs2, vectype))
12775 : {
12776 2 : if (dump_enabled_p ())
12777 2 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
12778 : "incompatible vector types for invariants\n");
12779 2 : return false;
12780 : }
12781 :
12782 137569 : vect_model_simple_cost (vinfo, 1 + (bitop2 != NOP_EXPR),
12783 : slp_node, cost_vec);
12784 137569 : return true;
12785 : }
12786 :
12787 : /* Transform. */
12788 :
12789 : /* Handle def. MASK stays NULL_TREE when the stmt has no lhs. */
12790 12501 : lhs = gimple_get_lhs (STMT_VINFO_STMT (stmt_info));
12791 12501 : if (lhs)
12792 12501 : mask = vect_create_destination_var (lhs, mask_type);
12793 :
12794 12501 : vect_get_vec_defs (vinfo, slp_node, rhs1, &vec_oprnds0, rhs2, &vec_oprnds1);
12795 12501 : if (swap_p)
12796 58 : std::swap (vec_oprnds0, vec_oprnds1);
12797 :
12798 : /* Arguments are ready. Create the new vector stmt. */
12799 31506 : FOR_EACH_VEC_ELT (vec_oprnds0, i, vec_rhs1)
12800 : {
12801 19005 : gimple *new_stmt;
12802 19005 : vec_rhs2 = vec_oprnds1[i];
12803 :
12804 19005 : if (lhs)
12805 19005 : new_temp = make_ssa_name (mask);
12806 : else
12807 0 : new_temp = make_temp_ssa_name (mask_type, NULL, "cmp"); /* No lhs to name the result after. */
12808 19005 : if (bitop1 == NOP_EXPR)
12809 : {
12810 18863 : new_stmt = gimple_build_assign (new_temp, code,
12811 : vec_rhs1, vec_rhs2);
12812 18863 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
12813 : }
12814 : else
12815 : {
12816 142 : if (bitop1 == BIT_NOT_EXPR)
12817 84 : new_stmt = gimple_build_assign (new_temp, bitop1, vec_rhs2);
12818 : else
12819 58 : new_stmt = gimple_build_assign (new_temp, bitop1, vec_rhs1,
12820 : vec_rhs2);
12821 142 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
12822 142 : if (bitop2 != NOP_EXPR)
12823 : {
12824 84 : tree res = make_ssa_name (mask); /* NOTE(review): MASK is only set when LHS is nonnull and, unlike NEW_TEMP above, there is no fallback here -- confirm LHS cannot be null on this path. */
12825 84 : if (bitop2 == BIT_NOT_EXPR)
12826 0 : new_stmt = gimple_build_assign (res, bitop2, new_temp);
12827 : else
12828 84 : new_stmt = gimple_build_assign (res, bitop2, vec_rhs1,
12829 : new_temp);
12830 84 : vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
12831 : }
12832 : }
12833 19005 : slp_node->push_vec_def (new_stmt); /* Record the last stmt generated for this copy. */
12834 : }
12835 :
12836 12501 : vec_oprnds0.release ();
12837 12501 : vec_oprnds1.release ();
12838 :
12839 12501 : return true;
12840 : }
12841 :
12842 : /* vectorizable_comparison.
12843 :
12844 : Check if STMT_INFO is comparison expression that can be vectorized.
12845 : If COST_VEC is passed, calculate costs but don't change anything,
12846 : otherwise, vectorize STMT_INFO: create a vectorized comparison, and insert
12847 : it at GSI.
12848 :
12849 : Return true if STMT_INFO is vectorizable in this way. */
12850 :
12851 : static bool
12852 653914 : vectorizable_comparison (vec_info *vinfo,
12853 : stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
12854 : slp_tree slp_node, stmt_vector_for_cost *cost_vec)
12855 : {
12856 653914 : bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
12857 :
12858 653914 : if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
12859 : return false;
12860 :
12861 653914 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
12862 : return false;
12863 :
12864 856053 : gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
12865 349605 : if (!stmt)
12866 : return false;
12867 :
12868 349605 : enum tree_code code = gimple_assign_rhs_code (stmt);
12869 349605 : tree vectype = SLP_TREE_VECTYPE (slp_node);
12870 349605 : if (!vectorizable_comparison_1 (vinfo, vectype, stmt_info, code, gsi,
12871 : slp_node, cost_vec))
12872 : return false;
12873 :
12874 147466 : if (cost_vec)
12875 134965 : SLP_TREE_TYPE (slp_node) = comparison_vec_info_type;
12876 :
12877 : return true;
12878 : }
12879 :
12880 : /* Check to see if the target supports any of the compare and branch optabs for
12881 : vectors with MODE as these would be required when expanding. */
12882 : static bool
12883 61309 : supports_vector_compare_and_branch (loop_vec_info loop_vinfo, machine_mode mode)
12884 : {
12885 61309 : bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
12886 61309 : bool len_loop_p = LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo);
12887 :
12888 : /* The vectorizer only produces vec_cbranch_any_optab directly. So only
12889 : check for support for that or vec_cbranch_any_optab when masked.
12890 : We can't produce vcond_cbranch_any directly from the vectorizer as we
12891 : want to keep gimple_cond as the GIMPLE representation. But we'll fold
12892 : it in expand. For that reason we require a backend to support the
12893 : unconditional vector cbranch optab if they support the conditional one,
12894 : which is just an optimization on the unconditional one. */
12895 61309 : if (masked_loop_p
12896 61309 : && direct_optab_handler (cond_vec_cbranch_any_optab, mode)
12897 : != CODE_FOR_nothing)
12898 : return true;
12899 61309 : else if (len_loop_p
12900 61309 : && direct_optab_handler (cond_len_vec_cbranch_any_optab, mode)
12901 : != CODE_FOR_nothing)
12902 : return true;
12903 61309 : else if (!masked_loop_p && !len_loop_p
12904 122618 : && direct_optab_handler (vec_cbranch_any_optab, mode)
12905 : != CODE_FOR_nothing)
12906 : return true;
12907 :
12908 : /* The target can implement cbranch to distinguish between boolean vector
12909 : types and data types if they don't have a different mode for both. */
12910 61309 : return direct_optab_handler (cbranch_optab, mode) != CODE_FOR_nothing;
12911 : }
12912 :
12913 : /* Determine the type to use for early break vectorization's scalar IV. If
12914 : no type is possible return false. */
12915 :
12916 : static bool
12917 2604 : vect_compute_type_for_early_break_scalar_iv (loop_vec_info loop_vinfo)
12918 : {
12919 : /* Check if we have a usable scalar IV type for vectorization. */
12920 2604 : tree iters_vf_type = sizetype;
12921 2604 : if (!LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo))
12922 : {
12923 : /* Find the type with the minimum precision we can use
12924 : for the scalar IV. */
12925 2381 : tree cand_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
12926 :
12927 : /* Work out how many bits we need to represent the limit. */
12928 2381 : unsigned int min_ni_width
12929 2381 : = vect_min_prec_for_max_niters (loop_vinfo, 1);
12930 :
12931 : /* Check if we're using PFA, if so we need a signed IV and an
12932 : extra bit for the sign. */
12933 2381 : if (TYPE_UNSIGNED (cand_type)
12934 2381 : && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
12935 3925 : && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
12936 156 : min_ni_width += 1;
12937 :
12938 2381 : if (TYPE_PRECISION (cand_type) >= min_ni_width)
12939 2308 : iters_vf_type = unsigned_type_for (cand_type);
12940 : else
12941 : {
12942 73 : opt_scalar_int_mode cmp_mode_iter;
12943 73 : tree iv_type = NULL_TREE;
12944 357 : FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
12945 : {
12946 357 : auto cmp_mode = cmp_mode_iter.require ();
12947 357 : unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode);
12948 357 : if (cmp_bits >= min_ni_width
12949 357 : && targetm.scalar_mode_supported_p (cmp_mode))
12950 : {
12951 73 : iv_type = build_nonstandard_integer_type (cmp_bits, true);
12952 73 : if (iv_type)
12953 : break;
12954 : }
12955 : }
12956 :
12957 73 : if (!iv_type)
12958 : {
12959 0 : if (dump_enabled_p ())
12960 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
12961 : "can't vectorize early exit because the "
12962 : "target doesn't support a scalar type wide "
12963 : "wide enough to hold niters.\n");
12964 0 : return false;
12965 : }
12966 73 : iters_vf_type = iv_type;
12967 : }
12968 : }
12969 :
12970 2604 : LOOP_VINFO_EARLY_BRK_IV_TYPE (loop_vinfo) = iters_vf_type;
12971 2604 : return true;
12972 : }
12973 :
12974 : /* Check to see if the current early break given in STMT_INFO is valid for
12975 : vectorization. If COST_VEC is passed, only analyze the statement (recording a loop length or mask when partial vectors may be used, and the scalar IV type); otherwise combine the vector defs already present on SLP_NODE into a single value and rewrite the original gcond to compare it against zero. Return true if STMT_INFO can be, or was, vectorized in this way. */
12976 :
12977 : bool
12978 243630 : vectorizable_early_exit (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
12979 : gimple_stmt_iterator *gsi,
12980 : slp_tree slp_node, stmt_vector_for_cost *cost_vec)
12981 : {
12982 243630 : if (!is_a <gcond *> (STMT_VINFO_STMT (stmt_info))) /* Only gcond statements are handled. */
12983 : return false;
12984 :
12985 62879 : if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_condition_def)
12986 : return false;
12987 :
12988 62879 : if (!STMT_VINFO_RELEVANT_P (stmt_info))
12989 : return false;
12990 :
12991 62879 : DUMP_VECT_SCOPE ("vectorizable_early_exit");
12992 :
12993 62879 : auto code = gimple_cond_code (STMT_VINFO_STMT (stmt_info));
12994 :
12995 : /* For SLP we don't want to use the type of the operands of the SLP node, when
12996 : vectorizing using SLP slp_node will be the children of the gcond and we
12997 : want to use the type of the direct children which since the gcond is root
12998 : will be the current node, rather than a child node as vect_is_simple_use
12999 : assumes. */
13000 62879 : tree vectype = SLP_TREE_VECTYPE (slp_node);
13001 62879 : if (!vectype)
13002 : return false;
13003 :
13004 62879 : machine_mode mode = TYPE_MODE (vectype);
13005 62879 : int vec_num = vect_get_num_copies (loop_vinfo, slp_node);
13006 :
13007 62879 : vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
13008 62879 : vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
13009 62879 : bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
13010 62879 : bool len_loop_p = LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo);
13011 :
13012 : /* Now build the new conditional. Pattern gimple_conds get dropped during
13013 : codegen so we must replace the original insn. */
13014 62879 : gimple *orig_stmt = STMT_VINFO_STMT (vect_orig_stmt (stmt_info));
13015 62879 : gcond *cond_stmt = as_a <gcond *>(orig_stmt);
13016 :
13017 62879 : tree vectype_out = vectype;
13018 62879 : auto bb = gimple_bb (cond_stmt);
13019 62879 : edge exit_true_edge = EDGE_SUCC (bb, 0); /* Find the successor edge taken when the condition is true. */
13020 62879 : if (exit_true_edge->flags & EDGE_FALSE_VALUE)
13021 660 : exit_true_edge = EDGE_SUCC (bb, 1);
13022 62879 : gcc_assert (exit_true_edge->flags & EDGE_TRUE_VALUE);
13023 :
13024 : /* When vectorizing we assume that if the branch edge is taken that we're
13025 : exiting the loop. This is not however always the case as the compiler will
13026 : rewrite conditions to always be a comparison against 0. To do this it
13027 : sometimes flips the edges. This is fine for scalar, but for vector we
13028 : then have to negate the result of the test, as we're still assuming that if
13029 : you take the branch edge that we found the exit condition. i.e. we need to
13030 : know whether we are generating a `forall` or an `exist` condition. */
13031 125758 : bool flipped = flow_bb_inside_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
13032 62879 : exit_true_edge->dest);
13033 :
13034 : /* See if we support ADDHN and use that for the reduction. */
13035 62879 : internal_fn ifn = IFN_VEC_TRUNC_ADD_HIGH;
13036 62879 : bool addhn_supported_p
13037 62879 : = direct_internal_fn_supported_p (ifn, vectype, OPTIMIZE_FOR_BOTH);
13038 62879 : tree narrow_type = NULL_TREE;
13039 62879 : if (addhn_supported_p)
13040 : {
13041 : /* Calculate the narrowing type for the result. */
13042 0 : auto halfprec = TYPE_PRECISION (TREE_TYPE (vectype)) / 2;
13043 0 : auto unsignedp = TYPE_UNSIGNED (TREE_TYPE (vectype));
13044 0 : tree itype = build_nonstandard_integer_type (halfprec, unsignedp);
13045 0 : tree tmp_type = build_vector_type (itype, TYPE_VECTOR_SUBPARTS (vectype));
13046 0 : narrow_type = truth_type_for (tmp_type);
13047 :
13048 0 : if (!supports_vector_compare_and_branch (loop_vinfo,
13049 0 : TYPE_MODE (narrow_type)))
13050 : {
13051 0 : if (dump_enabled_p ())
13052 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
13053 : "can't use ADDHN reduction because cbranch for "
13054 : "the narrowed type is not supported by the "
13055 : "target.\n");
13056 : addhn_supported_p = false;
13057 : }
13058 : }
13059 :
13060 : /* Analyze only. */
13061 62879 : if (cost_vec)
13062 : {
13063 61309 : if (!addhn_supported_p
13064 61309 : && !supports_vector_compare_and_branch (loop_vinfo, mode))
13065 : {
13066 58705 : if (dump_enabled_p ())
13067 597 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
13068 : "can't vectorize early exit because the "
13069 : "target doesn't support flag setting vector "
13070 : "comparisons.\n");
13071 58705 : return false;
13072 : }
13073 :
13074 2604 : if (!vectorizable_comparison_1 (loop_vinfo, vectype, stmt_info, code, gsi,
13075 : slp_node, cost_vec))
13076 : return false;
13077 :
13078 2604 : if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)) /* Record the loop length or mask this exit needs. */
13079 : {
13080 1544 : if (direct_internal_fn_supported_p (IFN_VCOND_MASK_LEN, vectype,
13081 : OPTIMIZE_FOR_SPEED))
13082 0 : vect_record_loop_len (loop_vinfo, lens, vec_num, vectype, 1);
13083 : else
13084 1544 : vect_record_loop_mask (loop_vinfo, masks, vec_num, vectype, NULL);
13085 : }
13086 :
13087 2604 : if (!vect_compute_type_for_early_break_scalar_iv (loop_vinfo))
13088 : return false;
13089 :
13090 : return true;
13091 : }
13092 :
13093 : /* Transform. */
13094 :
13095 1570 : tree new_temp = NULL_TREE;
13096 1570 : gimple *new_stmt = NULL;
13097 :
13098 1570 : if (dump_enabled_p ())
13099 405 : dump_printf_loc (MSG_NOTE, vect_location, "transform early-exit.\n");
13100 :
13101 : /* For SLP we don't do codegen of the body starting from the gcond, the gconds are
13102 : roots and so by the time we get to them we have already codegened the SLP tree
13103 : and so we shouldn't try to do so again. The arguments have already been
13104 : vectorized. It's not very clean to do this here, but the masking code below is
13105 : complex and this keeps it all in one place to ease fixes and backports. Once we
13106 : drop the non-SLP loop vect or split vectorizable_* this can be simplified. */
13107 :
13108 1570 : gimple *stmt = STMT_VINFO_STMT (stmt_info);
13109 1570 : basic_block cond_bb = gimple_bb (stmt);
13110 1570 : gimple_stmt_iterator cond_gsi = gsi_last_bb (cond_bb);
13111 :
13112 1570 : auto_vec<tree> stmts;
13113 1570 : stmts.safe_splice (SLP_TREE_VEC_DEFS (slp_node));
13114 :
13115 : /* If we're comparing against a previous forall we need to negate the results
13116 : before we do the final comparison or reduction. */
13117 1570 : if (flipped)
13118 : {
13119 : /* Rewrite the if(all(mask)) into if (!all(mask)) which is the same as
13120 : if (any(~mask)) by negating the masks and flipping the branches.
13121 :
13122 : 1. For unmasked loops we simply reduce the ~mask.
13123 : 2. For masked loops we reduce (~mask & loop_mask) which is the same as
13124 : doing (mask & loop_mask) ^ loop_mask. */
13125 294 : for (unsigned i = 0; i < stmts.length (); i++)
13126 : {
13127 173 : tree inv_lhs = make_temp_ssa_name (vectype, NULL, "vexit_inv");
13128 173 : auto inv_stmt = gimple_build_assign (inv_lhs, BIT_NOT_EXPR, stmts[i]);
13129 173 : vect_finish_stmt_generation (loop_vinfo, stmt_info, inv_stmt,
13130 : &cond_gsi);
13131 173 : stmts[i] = inv_lhs;
13132 : }
13133 :
13134 121 : EDGE_SUCC (bb, 0)->flags ^= (EDGE_TRUE_VALUE|EDGE_FALSE_VALUE); /* Swap the true and false edges. */
13135 121 : EDGE_SUCC (bb, 1)->flags ^= (EDGE_TRUE_VALUE|EDGE_FALSE_VALUE);
13136 : }
13137 :
13138 : /* Determine if we need to reduce the final value. */
13139 1570 : if (stmts.length () > 1)
13140 : {
13141 : /* We build the reductions in a way to maintain as much parallelism as
13142 : possible. */
13143 141 : auto_vec<tree> workset (stmts.length ());
13144 :
13145 : /* Mask the statements as we queue them up. Normally we loop over
13146 : vec_num, but since we inspect the exact results of vectorization
13147 : we don't need to and instead can just use the stmts themselves. */
13148 141 : if (masked_loop_p)
13149 0 : for (unsigned i = 0; i < stmts.length (); i++)
13150 : {
13151 0 : tree stmt_mask
13152 0 : = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num,
13153 : vectype, i);
13154 0 : stmt_mask
13155 0 : = prepare_vec_mask (loop_vinfo, TREE_TYPE (stmt_mask), stmt_mask,
13156 0 : stmts[i], &cond_gsi);
13157 0 : workset.quick_push (stmt_mask);
13158 : }
13159 141 : else if (len_loop_p)
13160 0 : for (unsigned i = 0; i < stmts.length (); i++)
13161 : {
13162 0 : tree len_mask = vect_gen_loop_len_mask (loop_vinfo, gsi, &cond_gsi,
13163 : lens, vec_num,
13164 0 : vectype, stmts[i], i, 1);
13165 :
13166 0 : workset.quick_push (len_mask);
13167 : }
13168 : else
13169 141 : workset.splice (stmts);
13170 :
13171 430 : while (workset.length () > 1) /* Combine pairwise until a single value is left. */
13172 : {
13173 289 : tree arg0 = workset.pop ();
13174 289 : tree arg1 = workset.pop ();
13175 289 : if (addhn_supported_p && workset.length () == 0) /* The last combination may use ADDHN, whose result has NARROW_TYPE. */
13176 : {
13177 0 : new_stmt = gimple_build_call_internal (ifn, 2, arg0, arg1);
13178 0 : vectype_out = narrow_type;
13179 0 : new_temp = make_temp_ssa_name (vectype_out, NULL, "vexit_reduc");
13180 0 : gimple_call_set_lhs (as_a <gcall *> (new_stmt), new_temp);
13181 0 : gimple_call_set_nothrow (as_a <gcall *> (new_stmt), true);
13182 : }
13183 : else
13184 : {
13185 289 : new_temp = make_temp_ssa_name (vectype_out, NULL, "vexit_reduc");
13186 289 : new_stmt
13187 289 : = gimple_build_assign (new_temp, BIT_IOR_EXPR, arg0, arg1);
13188 : }
13189 289 : vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt,
13190 : &cond_gsi);
13191 289 : workset.quick_insert (0, new_temp); /* Queue the partial result at the front. */
13192 : }
13193 141 : }
13194 : else
13195 : {
13196 1429 : new_temp = stmts[0]; /* A single vector: only masking may be needed. */
13197 1429 : if (masked_loop_p)
13198 : {
13199 0 : tree mask
13200 0 : = vect_get_loop_mask (loop_vinfo, gsi, masks, 1, vectype, 0); /* NOTE(review): the mask is requested at GSI while the masking stmts go to COND_GSI -- confirm this is intended. */
13201 0 : new_temp = prepare_vec_mask (loop_vinfo, TREE_TYPE (mask), mask,
13202 : new_temp, &cond_gsi);
13203 : }
13204 1429 : else if (len_loop_p)
13205 0 : new_temp = vect_gen_loop_len_mask (loop_vinfo, gsi, &cond_gsi, lens,
13206 : 1, vectype, new_temp, 0, 1);
13207 : }
13208 :
13209 1570 : gcc_assert (new_temp);
13210 :
13211 1570 : tree cst = build_zero_cst (vectype_out); /* Rewrite the original gcond to test NEW_TEMP != 0. */
13212 1570 : gimple_cond_set_condition (cond_stmt, NE_EXPR, new_temp, cst);
13213 1570 : update_stmt (orig_stmt);
13214 :
13215 : /* ??? Drop the vector defs recorded on SLP_NODE; the reason is not documented here. */
13216 1570 : SLP_TREE_VEC_DEFS (slp_node).truncate (0);
13217 :
13218 1570 : return true;
13219 1570 : }
13220 :
13221 : /* If SLP_NODE is nonnull, return true if vectorizable_live_operation
13222 : can handle all live statements in the node. Otherwise return true
13223 : if STMT_INFO is not live or if vectorizable_live_operation can handle it.
13224 : VEC_STMT_P is as for vectorizable_live_operation. */
13225 :
13226 : static bool
13227 1292265 : can_vectorize_live_stmts (vec_info *vinfo,
13228 : slp_tree slp_node, slp_instance slp_node_instance,
13229 : bool vec_stmt_p,
13230 : stmt_vector_for_cost *cost_vec)
13231 : {
13232 1292265 : stmt_vec_info slp_stmt_info;
13233 1292265 : unsigned int i;
13234 2725559 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, slp_stmt_info)
13235 : {
13236 1433294 : if (slp_stmt_info
13237 1417200 : && STMT_VINFO_LIVE_P (slp_stmt_info)
13238 1568414 : && !vectorizable_live_operation (vinfo, slp_stmt_info, slp_node,
13239 : slp_node_instance, i,
13240 : vec_stmt_p, cost_vec))
13241 : return false;
13242 : }
13243 :
13244 : return true;
13245 : }
13246 :
13247 : /* Make sure the statement is vectorizable. */
13248 :
13249 : opt_result
13250 2663522 : vect_analyze_stmt (vec_info *vinfo,
13251 : slp_tree node, slp_instance node_instance,
13252 : stmt_vector_for_cost *cost_vec)
13253 : {
13254 2663522 : stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
13255 2663522 : bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
13256 2663522 : enum vect_relevant relevance = STMT_VINFO_RELEVANT (stmt_info);
13257 2663522 : bool ok;
13258 :
13259 2663522 : if (dump_enabled_p ())
13260 100596 : dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
13261 : stmt_info->stmt);
13262 :
13263 5041511 : if (gimple_has_volatile_ops (stmt_info->stmt))
13264 : {
13265 : /* ??? This shouldn't really happen, volatile stmts should
13266 : not end up in the SLP graph. */
13267 0 : return opt_result::failure_at (stmt_info->stmt,
13268 : "not vectorized:"
13269 : " stmt has volatile operands: %G\n",
13270 : stmt_info->stmt);
13271 : }
13272 :
13273 : /* Skip stmts that do not need to be vectorized. */
13274 2663522 : if (!STMT_VINFO_RELEVANT_P (stmt_info)
13275 0 : && !STMT_VINFO_LIVE_P (stmt_info))
13276 : {
13277 0 : if (dump_enabled_p ())
13278 0 : dump_printf_loc (MSG_NOTE, vect_location, "irrelevant.\n");
13279 :
13280 : /* ??? This shouldn't really happen, irrelevant stmts should
13281 : not end up in the SLP graph. */
13282 0 : return opt_result::failure_at (stmt_info->stmt,
13283 : "not vectorized:"
13284 : " irrelevant stmt as SLP node %p "
13285 : "representative.\n",
13286 : (void *)node);
13287 : }
13288 :
13289 2663522 : switch (STMT_VINFO_DEF_TYPE (stmt_info))
13290 : {
13291 : case vect_internal_def:
13292 : case vect_condition_def:
13293 : break;
13294 :
13295 84163 : case vect_reduction_def:
13296 84163 : case vect_nested_cycle:
13297 84163 : gcc_assert (!bb_vinfo
13298 : && (relevance == vect_used_in_outer
13299 : || relevance == vect_used_in_outer_by_reduction
13300 : || relevance == vect_used_by_reduction
13301 : || relevance == vect_unused_in_scope
13302 : || relevance == vect_used_only_live));
13303 : break;
13304 :
13305 322 : case vect_double_reduction_def:
13306 322 : gcc_assert (!bb_vinfo && node);
13307 : break;
13308 :
13309 150192 : case vect_induction_def:
13310 150192 : case vect_first_order_recurrence:
13311 150192 : gcc_assert (!bb_vinfo);
13312 : break;
13313 :
13314 0 : case vect_constant_def:
13315 0 : case vect_external_def:
13316 0 : case vect_unknown_def_type:
13317 0 : default:
13318 0 : gcc_unreachable ();
13319 : }
13320 :
13321 2663522 : tree saved_vectype = STMT_VINFO_VECTYPE (stmt_info);
13322 2663522 : STMT_VINFO_VECTYPE (stmt_info) = NULL_TREE;
13323 :
13324 2663522 : if (STMT_VINFO_RELEVANT_P (stmt_info))
13325 : {
13326 2663522 : gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
13327 2663522 : gcc_assert (SLP_TREE_VECTYPE (node)
13328 : || gimple_code (stmt_info->stmt) == GIMPLE_COND
13329 : || (call && gimple_call_lhs (call) == NULL_TREE));
13330 : }
13331 :
13332 2663522 : ok = true;
13333 2663522 : if (bb_vinfo
13334 1473016 : || (STMT_VINFO_RELEVANT_P (stmt_info)
13335 0 : || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def))
13336 : /* Prefer vectorizable_call over vectorizable_simd_clone_call so
13337 : -mveclibabi= takes preference over library functions with
13338 : the simd attribute. */
13339 2663522 : ok = (vectorizable_call (vinfo, stmt_info, NULL, node, cost_vec)
13340 2656638 : || vectorizable_simd_clone_call (vinfo, stmt_info, NULL, node,
13341 : cost_vec)
13342 2656171 : || vectorizable_conversion (vinfo, stmt_info, NULL, node, cost_vec)
13343 2572838 : || vectorizable_operation (vinfo, stmt_info, NULL, node, cost_vec)
13344 2036019 : || vectorizable_assignment (vinfo, stmt_info, NULL, node, cost_vec)
13345 1966846 : || vectorizable_load (vinfo, stmt_info, NULL, node, cost_vec)
13346 1532013 : || vectorizable_store (vinfo, stmt_info, NULL, node, cost_vec)
13347 717488 : || vectorizable_shift (vinfo, stmt_info, NULL, node, cost_vec)
13348 667733 : || vectorizable_condition (vinfo, stmt_info, NULL, node, cost_vec)
13349 641413 : || vectorizable_comparison (vinfo, stmt_info, NULL, node, cost_vec)
13350 506448 : || (bb_vinfo
13351 124242 : && vectorizable_phi (bb_vinfo, stmt_info, node, cost_vec))
13352 3112047 : || (is_a <loop_vec_info> (vinfo)
13353 382206 : && (vectorizable_lane_reducing (as_a <loop_vec_info> (vinfo),
13354 : stmt_info, node, cost_vec)
13355 381490 : || vectorizable_reduction (as_a <loop_vec_info> (vinfo),
13356 : stmt_info,
13357 : node, node_instance, cost_vec)
13358 299868 : || vectorizable_induction (as_a <loop_vec_info> (vinfo),
13359 : stmt_info, node, cost_vec)
13360 181833 : || vectorizable_lc_phi (as_a <loop_vec_info> (vinfo),
13361 : stmt_info, node)
13362 181012 : || vectorizable_recurr (as_a <loop_vec_info> (vinfo),
13363 : stmt_info, node, cost_vec)
13364 180751 : || vectorizable_early_exit (as_a <loop_vec_info> (vinfo),
13365 : stmt_info, NULL, node,
13366 : cost_vec))));
13367 :
13368 2663522 : STMT_VINFO_VECTYPE (stmt_info) = saved_vectype;
13369 :
13370 2416452 : if (!ok)
13371 247070 : return opt_result::failure_at (stmt_info->stmt,
13372 : "not vectorized:"
13373 : " relevant stmt not supported: %G",
13374 : stmt_info->stmt);
13375 :
13376 : /* Stmts that are (also) "live" (i.e. - that are used out of the loop)
13377 : need extra handling, except for vectorizable reductions. */
13378 2416452 : if (!bb_vinfo
13379 1292265 : && (SLP_TREE_TYPE (node) != lc_phi_info_type
13380 821 : || SLP_TREE_DEF_TYPE (node) == vect_internal_def)
13381 1292265 : && (!node->ldst_lanes || SLP_TREE_PERMUTE_P (node))
13382 3708717 : && !can_vectorize_live_stmts (as_a <loop_vec_info> (vinfo),
13383 : node, node_instance,
13384 : false, cost_vec))
13385 0 : return opt_result::failure_at (stmt_info->stmt,
13386 : "not vectorized:"
13387 : " live stmt not supported: %G",
13388 : stmt_info->stmt);
13389 :
13390 2416452 : return opt_result::success ();
13391 : }
13392 :
13393 :
13394 : /* Function vect_transform_stmt.
13395 :
13396 : Create a vectorized stmt to replace STMT_INFO, and insert it at GSI. The transform routine is selected by SLP_TREE_TYPE (SLP_NODE), which must be nonnull. Returns true only when that type is store_vec_info_type. */
13397 :
13398 : bool
13399 973797 : vect_transform_stmt (vec_info *vinfo,
13400 : stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
13401 : slp_tree slp_node, slp_instance slp_node_instance)
13402 : {
13403 973797 : bool is_store = false;
13404 973797 : bool done;
13405 :
13406 973797 : gcc_assert (slp_node);
13407 :
13408 973797 : if (stmt_info)
13409 972960 : STMT_VINFO_VECTYPE (stmt_info) = NULL_TREE;
13410 :
13411 973797 : switch (SLP_TREE_TYPE (slp_node))
13412 : {
13413 22892 : case type_demotion_vec_info_type:
13414 22892 : case type_promotion_vec_info_type:
13415 22892 : case type_conversion_vec_info_type:
13416 22892 : done = vectorizable_conversion (vinfo, stmt_info, gsi, slp_node, NULL);
13417 22892 : gcc_assert (done);
13418 : break;
13419 :
13420 16311 : case induc_vec_info_type:
13421 16311 : done = vectorizable_induction (as_a <loop_vec_info> (vinfo),
13422 : stmt_info, slp_node, NULL);
13423 16311 : gcc_assert (done);
13424 : break;
13425 :
13426 8560 : case shift_vec_info_type:
13427 8560 : done = vectorizable_shift (vinfo, stmt_info, gsi, slp_node, NULL);
13428 8560 : gcc_assert (done);
13429 : break;
13430 :
13431 114588 : case op_vec_info_type:
13432 114588 : done = vectorizable_operation (vinfo, stmt_info, gsi, slp_node, NULL);
13433 114588 : gcc_assert (done);
13434 : break;
13435 :
13436 16024 : case assignment_vec_info_type:
13437 16024 : done = vectorizable_assignment (vinfo, stmt_info, gsi, slp_node, NULL);
13438 16024 : gcc_assert (done);
13439 : break;
13440 :
13441 166677 : case load_vec_info_type:
13442 166677 : done = vectorizable_load (vinfo, stmt_info, gsi, slp_node, NULL);
13443 166677 : gcc_assert (done);
13444 : break;
13445 :
13446 545722 : case store_vec_info_type:
13447 545722 : done = vectorizable_store (vinfo, stmt_info, gsi, slp_node, NULL);
13448 545722 : gcc_assert (done);
13449 : is_store = true;
13450 : break;
13451 :
13452 8550 : case condition_vec_info_type:
13453 8550 : done = vectorizable_condition (vinfo, stmt_info, gsi, slp_node, NULL);
13454 8550 : gcc_assert (done);
13455 : break;
13456 :
13457 12501 : case comparison_vec_info_type:
13458 12501 : done = vectorizable_comparison (vinfo, stmt_info, gsi, slp_node, NULL);
13459 12501 : gcc_assert (done);
13460 : break;
13461 :
13462 4193 : case call_vec_info_type:
13463 4193 : done = vectorizable_call (vinfo, stmt_info, gsi, slp_node, NULL);
13464 4193 : break; /* Unlike most other cases, DONE is not asserted here. */
13465 :
13466 362 : case call_simd_clone_vec_info_type:
13467 362 : done = vectorizable_simd_clone_call (vinfo, stmt_info, gsi,
13468 : slp_node, NULL);
13469 362 : break; /* Unlike most other cases, DONE is not asserted here. */
13470 :
13471 2636 : case reduc_vec_info_type:
13472 2636 : done = vect_transform_reduction (as_a <loop_vec_info> (vinfo), stmt_info,
13473 : gsi, slp_node);
13474 2636 : gcc_assert (done);
13475 : break;
13476 :
13477 23727 : case cycle_phi_info_type:
13478 23727 : done = vect_transform_cycle_phi (as_a <loop_vec_info> (vinfo), stmt_info,
13479 : slp_node, slp_node_instance);
13480 23727 : gcc_assert (done);
13481 : break;
13482 :
13483 530 : case lc_phi_info_type:
13484 530 : done = vect_transform_lc_phi (as_a <loop_vec_info> (vinfo),
13485 : stmt_info, slp_node);
13486 530 : gcc_assert (done);
13487 : break;
13488 :
13489 45 : case recurr_info_type:
13490 45 : done = vectorizable_recurr (as_a <loop_vec_info> (vinfo),
13491 : stmt_info, slp_node, NULL);
13492 45 : gcc_assert (done);
13493 : break;
13494 :
13495 14146 : case phi_info_type:
13496 14146 : done = vectorizable_phi (as_a <bb_vec_info> (vinfo),
13497 : stmt_info, slp_node, NULL);
13498 14146 : gcc_assert (done);
13499 : break;
13500 :
13501 0 : case loop_exit_ctrl_vec_info_type:
13502 0 : done = vectorizable_early_exit (as_a <loop_vec_info> (vinfo),
13503 : stmt_info, gsi, slp_node, NULL);
13504 0 : gcc_assert (done);
13505 : break;
13506 :
13507 16333 : case permute_info_type:
13508 16333 : done = vectorizable_slp_permutation (vinfo, gsi, slp_node, NULL);
13509 16333 : gcc_assert (done);
13510 : break;
13511 :
13512 0 : default: /* NOTE(review): STMT_INFO is used below without the null check applied above -- confirm it cannot be null here. */
13513 0 : if (!STMT_VINFO_LIVE_P (stmt_info))
13514 : {
13515 0 : if (dump_enabled_p ())
13516 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
13517 : "stmt not supported.\n");
13518 0 : gcc_unreachable ();
13519 : }
13520 973797 : done = true;
13521 : }
13522 :
13523 973797 : if (SLP_TREE_TYPE (slp_node) != store_vec_info_type
13524 428075 : && (!slp_node->ldst_lanes || SLP_TREE_PERMUTE_P (slp_node)))
13525 : {
13526 : /* Handle stmts whose DEF is used outside the loop-nest that is
13527 : being vectorized. This is skipped for store nodes and for nodes that have ldst_lanes set but are not permute nodes. */
13528 577937 : for (unsigned lane : SLP_TREE_LIVE_LANES (slp_node))
13529 : {
13530 61744 : stmt_vec_info slp_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[lane];
13531 61744 : done = vectorizable_live_operation (vinfo, slp_stmt_info, slp_node,
13532 : slp_node_instance, lane,
13533 : true, NULL);
13534 61744 : gcc_assert (done);
13535 : }
13536 : }
13537 :
13538 973797 : return is_store; /* Set only in the store_vec_info_type case above. */
13539 : }
13540 :
13541 :
13542 : /* Remove a group of stores (for SLP or interleaving), free their
13543 : stmt_vec_info. The group is walked from FIRST_STMT_INFO via DR_GROUP_NEXT_ELEMENT, and for each element the stmt returned by vect_orig_stmt is removed from VINFO. */
13544 :
13545 : void
13546 0 : vect_remove_stores (vec_info *vinfo, stmt_vec_info first_stmt_info)
13547 : {
13548 0 : stmt_vec_info next_stmt_info = first_stmt_info;
13549 :
13550 0 : while (next_stmt_info)
13551 : {
13552 0 : stmt_vec_info tmp = DR_GROUP_NEXT_ELEMENT (next_stmt_info); /* Fetch the successor before the current element is removed. */
13553 0 : next_stmt_info = vect_orig_stmt (next_stmt_info);
13554 : /* Free the attached stmt_vec_info and remove the stmt. */
13555 0 : vinfo->remove_stmt (next_stmt_info);
13556 0 : next_stmt_info = tmp;
13557 : }
13558 0 : }
13559 :
13560 : /* If NUNITS is nonzero, return a vector type that contains NUNITS
13561 : elements of type SCALAR_TYPE, or null if the target doesn't support
13562 : such a type.
13563 :
13564 : If NUNITS is zero, return a vector type that contains elements of
13565 : type SCALAR_TYPE, choosing whichever vector size the target prefers.
13566 :
13567 : If PREVAILING_MODE is VOIDmode, we have not yet chosen a vector mode
13568 : for this vectorization region and want to "autodetect" the best choice.
13569 : Otherwise, PREVAILING_MODE is a previously-chosen vector TYPE_MODE
13570 : and we want the new type to be interoperable with it. PREVAILING_MODE
13571 : in this case can be a scalar integer mode or a vector mode; when it
13572 : is a vector mode, the function acts like a tree-level version of
13573 : related_vector_mode. */
13574 :
13575 : tree
13576 30868752 : get_related_vectype_for_scalar_type (machine_mode prevailing_mode,
13577 : tree scalar_type, poly_uint64 nunits)
13578 : {
13579 30868752 : tree orig_scalar_type = scalar_type;
13580 30868752 : scalar_mode inner_mode;
13581 30868752 : machine_mode simd_mode;
13582 30868752 : tree vectype;
13583 :
13584 30868752 : if ((!INTEGRAL_TYPE_P (scalar_type)
13585 10449276 : && !POINTER_TYPE_P (scalar_type)
13586 1792826 : && !SCALAR_FLOAT_TYPE_P (scalar_type))
13587 40813796 : || (!is_int_mode (TYPE_MODE (scalar_type), &inner_mode)
13588 1288676 : && !is_float_mode (TYPE_MODE (scalar_type), &inner_mode)))
13589 507446 : return NULL_TREE;
13590 :
13591 30361306 : unsigned int nbytes = GET_MODE_SIZE (inner_mode);
13592 :
13593 : /* Interoperability between modes requires one to be a constant multiple
13594 : of the other, so that the number of vectors required for each operation
13595 : is a compile-time constant. */
13596 30361306 : if (prevailing_mode != VOIDmode
13597 29239369 : && !constant_multiple_p (nunits * nbytes,
13598 29239369 : GET_MODE_SIZE (prevailing_mode))
13599 31840275 : && !constant_multiple_p (GET_MODE_SIZE (prevailing_mode),
13600 1478969 : nunits * nbytes))
13601 : return NULL_TREE;
13602 :
13603 : /* For vector types of elements whose mode precision doesn't
13604 : match their type's precision we use an element type of mode
13605 : precision. The vectorization routines will have to make sure
13606 : they support the proper result truncation/extension.
13607 : We also make sure to build vector types with INTEGER_TYPE
13608 : component type only. */
13609 30361306 : if (INTEGRAL_TYPE_P (scalar_type)
13610 50780700 : && (GET_MODE_BITSIZE (inner_mode) != TYPE_PRECISION (scalar_type)
13611 18940967 : || TREE_CODE (scalar_type) != INTEGER_TYPE))
13612 1687312 : scalar_type = build_nonstandard_integer_type (GET_MODE_BITSIZE (inner_mode),
13613 1687312 : TYPE_UNSIGNED (scalar_type));
13614 :
13615 : /* We shouldn't end up building VECTOR_TYPEs of non-scalar components.
13616 : When the component mode passes the above test simply use a type
13617 : corresponding to that mode. The theory is that any use that
13618 : would cause problems with this will disable vectorization anyway. */
13619 28673994 : else if (!SCALAR_FLOAT_TYPE_P (scalar_type)
13620 : && !INTEGRAL_TYPE_P (scalar_type))
13621 8656450 : scalar_type = lang_hooks.types.type_for_mode (inner_mode, 1);
13622 :
13623 : /* We can't build a vector type of elements with alignment bigger than
13624 : their size. */
13625 20017544 : else if (nbytes < TYPE_ALIGN_UNIT (scalar_type))
13626 380420 : scalar_type = lang_hooks.types.type_for_mode (inner_mode,
13627 190210 : TYPE_UNSIGNED (scalar_type));
13628 :
13629 : /* If we fell back to using the mode, fail if there was
13630 : no scalar type for it. */
13631 30361306 : if (scalar_type == NULL_TREE)
13632 : return NULL_TREE;
13633 :
13634 : /* If no prevailing mode was supplied, use the mode the target prefers.
13635 : Otherwise lookup a vector mode based on the prevailing mode. */
13636 30361306 : if (prevailing_mode == VOIDmode)
13637 : {
13638 1121937 : gcc_assert (known_eq (nunits, 0U));
13639 1121937 : simd_mode = targetm.vectorize.preferred_simd_mode (inner_mode);
13640 1121937 : if (SCALAR_INT_MODE_P (simd_mode))
13641 : {
13642 : /* Traditional behavior is not to take the integer mode
13643 : literally, but simply to use it as a way of determining
13644 : the vector size. It is up to mode_for_vector to decide
13645 : what the TYPE_MODE should be.
13646 :
13647 : Note that nunits == 1 is allowed in order to support single
13648 : element vector types. */
13649 58340 : if (!multiple_p (GET_MODE_SIZE (simd_mode), nbytes, &nunits)
13650 550 : || !mode_for_vector (inner_mode, nunits).exists (&simd_mode))
13651 28620 : return NULL_TREE;
13652 : }
13653 : }
13654 29239369 : else if (SCALAR_INT_MODE_P (prevailing_mode)
13655 29239369 : || !related_vector_mode (prevailing_mode,
13656 27220739 : inner_mode, nunits).exists (&simd_mode))
13657 : {
13658 : /* Fall back to using mode_for_vector, mostly in the hope of being
13659 : able to use an integer mode. */
13660 2018630 : if (known_eq (nunits, 0U)
13661 4716952 : && !multiple_p (GET_MODE_SIZE (prevailing_mode), nbytes, &nunits))
13662 : return NULL_TREE;
13663 :
13664 150886 : if (!mode_for_vector (inner_mode, nunits).exists (&simd_mode))
13665 140885 : return NULL_TREE;
13666 : }
13667 :
13668 28324057 : vectype = build_vector_type_for_mode (scalar_type, simd_mode);
13669 :
13670 : /* In cases where the mode was chosen by mode_for_vector, check that
13671 : the target actually supports the chosen mode, or that it at least
13672 : allows the vector mode to be replaced by a like-sized integer. */
13673 56648114 : if (!VECTOR_MODE_P (TYPE_MODE (vectype))
13674 28334315 : && !INTEGRAL_MODE_P (TYPE_MODE (vectype)))
13675 : return NULL_TREE;
13676 :
13677 : /* Re-attach the address-space qualifier if we canonicalized the scalar
13678 : type. */
13679 28315978 : if (TYPE_ADDR_SPACE (orig_scalar_type) != TYPE_ADDR_SPACE (vectype))
13680 5 : return build_qualified_type
13681 5 : (vectype, KEEP_QUAL_ADDR_SPACE (TYPE_QUALS (orig_scalar_type)));
13682 :
13683 : return vectype;
13684 : }
13685 :
13686 : /* Function get_vectype_for_scalar_type.
13687 :
13688 : Returns the vector type corresponding to SCALAR_TYPE as supported
13689 : by the target. If GROUP_SIZE is nonzero and we're performing BB
13690 : vectorization, make sure that the number of elements in the vector
13691 : is no bigger than GROUP_SIZE. As a side effect, sets VINFO->vector_mode if it was still VOIDmode and records the mode of the natural vector type in VINFO->used_vector_modes. */
13692 :
13693 : tree
13694 26394302 : get_vectype_for_scalar_type (vec_info *vinfo, tree scalar_type,
13695 : unsigned int group_size)
13696 : {
13697 : /* For BB vectorization, we should always have a group size once we've
13698 : constructed the SLP tree; the only valid uses of zero GROUP_SIZEs
13699 : are tentative requests during things like early data reference
13700 : analysis and pattern recognition. For anything other than BB vectorization GROUP_SIZE is ignored (reset to zero). */
13701 26394302 : if (is_a <bb_vec_info> (vinfo))
13702 23498109 : gcc_assert (vinfo->slp_instances.is_empty () || group_size != 0);
13703 : else
13704 : group_size = 0;
13705 :
13706 26394302 : tree vectype = get_related_vectype_for_scalar_type (vinfo->vector_mode,
13707 : scalar_type);
13708 26394302 : if (vectype && vinfo->vector_mode == VOIDmode)
13709 1046687 : vinfo->vector_mode = TYPE_MODE (vectype);
13710 :
13711 : /* Register the natural choice of vector type, before the group size
13712 : has been applied. */
13713 0 : if (vectype)
13714 24003857 : vinfo->used_vector_modes.add (TYPE_MODE (vectype));
13715 :
13716 : /* If the natural choice of vector type doesn't satisfy GROUP_SIZE,
13717 : try again with an explicit number of elements. */
13718 24003857 : if (vectype
13719 24003857 : && group_size
13720 26394302 : && maybe_ge (TYPE_VECTOR_SUBPARTS (vectype), group_size))
13721 : {
13722 : /* Start with the biggest number of units that fits within
13723 : GROUP_SIZE and halve it until we find a valid vector type.
13724 : Usually either the first attempt will succeed or all will
13725 : fail (in the latter case because GROUP_SIZE is too small
13726 : for the target), but it's possible that a target could have
13727 : a hole between supported vector types.
13728 :
13729 : If GROUP_SIZE is not a power of 2, this has the effect of
13730 : trying the largest power of 2 that fits within the group,
13731 : even though the group is not a multiple of that vector size.
13732 : The BB vectorizer will then try to carve up the group into
13733 : smaller pieces. */
13734 3043572 : unsigned int nunits = 1 << floor_log2 (group_size);
13735 3043572 : do
13736 : {
13737 3043572 : vectype = get_related_vectype_for_scalar_type (vinfo->vector_mode,
13738 3043572 : scalar_type, nunits);
13739 3043572 : nunits /= 2;
13740 : }
13741 3043572 : while (nunits > 1 && !vectype);
13742 : }
13743 :
13744 26394302 : return vectype;
13745 : }
13746 :
13747 : /* Return the vector type corresponding to SCALAR_TYPE as supported
13748 : by the target. NODE, if nonnull, is the SLP tree node that will
13749 : use the returned vector type. The group size passed on is SLP_TREE_LANES (NODE), or zero when NODE is null. */
13750 :
13751 : tree
13752 162378 : get_vectype_for_scalar_type (vec_info *vinfo, tree scalar_type, slp_tree node)
13753 : {
13754 162378 : unsigned int group_size = 0;
13755 162378 : if (node)
13756 162378 : group_size = SLP_TREE_LANES (node);
13757 162378 : return get_vectype_for_scalar_type (vinfo, scalar_type, group_size);
13758 : }
13759 :
13760 : /* Function get_mask_type_for_scalar_type.
13761 :
13762 : Returns the mask type corresponding to a result of comparison
13763 : of vectors of specified SCALAR_TYPE as supported by target.
13764 : If GROUP_SIZE is nonzero and we're performing BB vectorization,
13765 : make sure that the number of elements in the vector is no bigger
13766 : than GROUP_SIZE. Returns NULL if get_vectype_for_scalar_type finds no vector type; otherwise the result is truth_type_for that vector type. */
13767 :
13768 : tree
13769 1101682 : get_mask_type_for_scalar_type (vec_info *vinfo, tree scalar_type,
13770 : unsigned int group_size)
13771 : {
13772 1101682 : tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type, group_size);
13773 :
13774 1101682 : if (!vectype)
13775 : return NULL;
13776 :
13777 1082282 : return truth_type_for (vectype);
13778 : }
13779 :
13780 : /* Function get_mask_type_for_scalar_type.
13781 :
13782 : Returns the mask type corresponding to a result of comparison
13783 : of vectors of specified SCALAR_TYPE as supported by target.
13784 : NODE, if nonnull, is the SLP tree node that will use the returned
13785 : vector type. Returns NULL if get_vectype_for_scalar_type finds no vector type; otherwise the result is truth_type_for that vector type. */
13786 :
13787 : tree
13788 19 : get_mask_type_for_scalar_type (vec_info *vinfo, tree scalar_type,
13789 : slp_tree node)
13790 : {
13791 19 : tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type, node);
13792 :
13793 19 : if (!vectype)
13794 : return NULL;
13795 :
13796 19 : return truth_type_for (vectype);
13797 : }
13798 :
13799 : /* Function get_same_sized_vectype
13800 :
13801 : Returns a vector type with elements of SCALAR_TYPE and the same size
13802 : as VECTOR_TYPE if supported by the target. For a boolean SCALAR_TYPE (VECT_SCALAR_BOOLEAN_TYPE_P) this is truth_type_for (VECTOR_TYPE). Returns NULL_TREE when the mode size of VECTOR_TYPE is not a multiple of the mode size of SCALAR_TYPE. */
13803 :
13804 : tree
13805 157942 : get_same_sized_vectype (tree scalar_type, tree vector_type)
13806 : {
13807 157942 : if (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type))
13808 0 : return truth_type_for (vector_type);
13809 :
13810 157942 : poly_uint64 nunits;
13811 315884 : if (!multiple_p (GET_MODE_SIZE (TYPE_MODE (vector_type)),
13812 315884 : GET_MODE_SIZE (TYPE_MODE (scalar_type)), &nunits))
13813 : return NULL_TREE;
13814 :
13815 157942 : return get_related_vectype_for_scalar_type (TYPE_MODE (vector_type),
13816 157942 : scalar_type, nunits);
13817 : }
13818 :
13819 : /* Return true if replacing VINFO->vector_mode with VECTOR_MODE
13820 : would not change the chosen vector modes. Every mode in VINFO->used_vector_modes must be a vector mode that related_vector_mode maps back to itself from VECTOR_MODE. */
13821 :
13822 : bool
13823 1577802 : vect_chooses_same_modes_p (vec_info *vinfo, machine_mode vector_mode)
13824 : {
13825 1577802 : for (vec_info::mode_set::iterator i = vinfo->used_vector_modes.begin ();
13826 3598150 : i != vinfo->used_vector_modes.end (); ++i)
13827 1848220 : if (!VECTOR_MODE_P (*i)
13828 5544660 : || related_vector_mode (vector_mode, GET_MODE_INNER (*i), 0) != *i)
13829 838046 : return false;
13830 739756 : return true;
13831 : }
13832 :
13833 : /* Return true if replacing VECTOR_MODE with ALT_VECTOR_MODE would not
13834 : change the chosen vector modes for analysis of a loop. This requires both to be vector modes and each to be the related_vector_mode of the other for the other's inner mode. */
13835 :
13836 : bool
13837 383842 : vect_chooses_same_modes_p (machine_mode vector_mode,
13838 : machine_mode alt_vector_mode)
13839 : {
13840 63380 : return (VECTOR_MODE_P (vector_mode)
13841 383842 : && VECTOR_MODE_P (alt_vector_mode)
13842 767684 : && (related_vector_mode (vector_mode,
13843 : GET_MODE_INNER (alt_vector_mode))
13844 383842 : == alt_vector_mode)
13845 410172 : && (related_vector_mode (alt_vector_mode,
13846 : GET_MODE_INNER (vector_mode))
13847 13165 : == vector_mode));
13848 : }
13849 :
13850 : /* Function vect_is_simple_use.
13851 :
13852 : Input:
13853 : VINFO - the vect info of the loop or basic block that is being vectorized.
13854 : OPERAND - operand in the loop or bb.
13855 : Output:
13856 : DEF_STMT_INFO_OUT (optional) - information about the defining stmt in
13857 : case OPERAND is an SSA_NAME that is defined in the vectorizable region
13858 : DEF_STMT_OUT (optional) - the defining stmt in case OPERAND is an SSA_NAME;
13859 : the definition could be anywhere in the function
13860 : DT - the type of definition
13861 :
13862 : Returns whether a stmt with OPERAND can be vectorized.
13863 : For loops, supportable operands are constants, loop invariants, and operands
13864 : that are defined by the current iteration of the loop. Unsupportable
13865 : operands are those that are defined by a previous iteration of the loop (as
13866 : is the case in reduction/induction computations).
13867 : For basic blocks, supportable operands are constants and bb invariants.
13868 : For now, operands defined outside the basic block are not supported.
13869 : NOTE(review): the code below returns false only when *DT ends up as vect_unknown_def_type; the restrictions described above are presumably enforced by callers -- confirm. */
13870 : bool
13871 41810148 : vect_is_simple_use (tree operand, vec_info *vinfo, enum vect_def_type *dt,
13872 : stmt_vec_info *def_stmt_info_out, gimple **def_stmt_out)
13873 : {
13874 41810148 : if (def_stmt_info_out)
13875 39610715 : *def_stmt_info_out = NULL;
13876 41810148 : if (def_stmt_out)
13877 9790853 : *def_stmt_out = NULL;
13878 41810148 : *dt = vect_unknown_def_type;
13879 :
13880 41810148 : if (dump_enabled_p ())
13881 : {
13882 767862 : dump_printf_loc (MSG_NOTE, vect_location,
13883 : "vect_is_simple_use: operand ");
13884 767862 : if (TREE_CODE (operand) == SSA_NAME
13885 767862 : && !SSA_NAME_IS_DEFAULT_DEF (operand))
13886 704772 : dump_gimple_expr (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (operand), 0);
13887 : else
13888 63090 : dump_generic_expr (MSG_NOTE, TDF_SLIM, operand);
13889 : }
13890 :
13891 41810148 : if (CONSTANT_CLASS_P (operand))
13892 2799388 : *dt = vect_constant_def;
13893 39010760 : else if (is_gimple_min_invariant (operand))
13894 333419 : *dt = vect_external_def;
13895 38677341 : else if (TREE_CODE (operand) != SSA_NAME)
13896 976 : *dt = vect_unknown_def_type;
13897 38676365 : else if (SSA_NAME_IS_DEFAULT_DEF (operand))
13898 505774 : *dt = vect_external_def;
13899 : else
13900 : {
13901 38170591 : gimple *def_stmt = SSA_NAME_DEF_STMT (operand);
13902 38170591 : stmt_vec_info stmt_vinfo = vinfo->lookup_def (operand);
13903 38170591 : if (!stmt_vinfo)
13904 838187 : *dt = vect_external_def;
13905 : else
13906 : {
13907 37332404 : stmt_vinfo = vect_stmt_to_vectorize (stmt_vinfo);
13908 37332404 : def_stmt = stmt_vinfo->stmt;
13909 37332404 : *dt = STMT_VINFO_DEF_TYPE (stmt_vinfo);
13910 37332404 : if (def_stmt_info_out)
13911 35142017 : *def_stmt_info_out = stmt_vinfo;
13912 : }
13913 38170591 : if (def_stmt_out)
13914 9583578 : *def_stmt_out = def_stmt;
13915 : }
13916 :
13917 41810148 : if (dump_enabled_p ())
13918 : {
13919 767862 : dump_printf (MSG_NOTE, ", type of def: ");
13920 767862 : switch (*dt)
13921 : {
13922 0 : case vect_uninitialized_def:
13923 0 : dump_printf (MSG_NOTE, "uninitialized\n");
13924 0 : break;
13925 52362 : case vect_constant_def:
13926 52362 : dump_printf (MSG_NOTE, "constant\n");
13927 52362 : break;
13928 26314 : case vect_external_def:
13929 26314 : dump_printf (MSG_NOTE, "external\n");
13930 26314 : break;
13931 549381 : case vect_internal_def:
13932 549381 : dump_printf (MSG_NOTE, "internal\n");
13933 549381 : break;
13934 108263 : case vect_induction_def:
13935 108263 : dump_printf (MSG_NOTE, "induction\n");
13936 108263 : break;
13937 28177 : case vect_reduction_def:
13938 28177 : dump_printf (MSG_NOTE, "reduction\n");
13939 28177 : break;
13940 482 : case vect_double_reduction_def:
13941 482 : dump_printf (MSG_NOTE, "double reduction\n");
13942 482 : break;
13943 2173 : case vect_nested_cycle:
13944 2173 : dump_printf (MSG_NOTE, "nested cycle\n");
13945 2173 : break;
13946 276 : case vect_first_order_recurrence:
13947 276 : dump_printf (MSG_NOTE, "first order recurrence\n");
13948 276 : break;
13949 0 : case vect_condition_def:
13950 0 : dump_printf (MSG_NOTE, "control flow\n");
13951 0 : break;
13952 434 : case vect_unknown_def_type:
13953 434 : dump_printf (MSG_NOTE, "unknown\n");
13954 434 : break;
13955 : }
13956 : }
13957 :
13958 41810148 : if (*dt == vect_unknown_def_type)
13959 : {
13960 57297 : if (dump_enabled_p ())
13961 434 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
13962 : "Unsupported pattern.\n");
13963 57297 : return false;
13964 : }
13965 :
13966 : return true;
13967 : }
13968 :
13969 : /* Function vect_is_simple_use.
13970 :
13971 : Same as vect_is_simple_use but determines the operand by operand
13972 : position OPERAND among the children of SLP_NODE, filling in *OP,
13973 : *SLP_DEF and *VECTYPE. SLP_NODE must not be NULL. */
13974 :
13975 : bool
13976 3844176 : vect_is_simple_use (vec_info *vinfo, slp_tree slp_node,
13977 : unsigned operand, tree *op, slp_tree *slp_def,
13978 : enum vect_def_type *dt,
13979 : tree *vectype, stmt_vec_info *def_stmt_info_out)
13980 : {
13981 3844176 : slp_tree child = SLP_TREE_CHILDREN (slp_node)[operand];
13982 3844176 : *slp_def = child;
13983 3844176 : *vectype = SLP_TREE_VECTYPE (child);
13984 3844176 : if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
13985 : {
13986 : /* ??? VEC_PERM nodes might be intermediate and their lane value
13987 : have no representative (nor do we build a VEC_PERM stmt for
13988 : the actual operation). Note for two-operator nodes we set
13989 : a representative but leave scalar stmts empty as we'd only
13990 : have one for a subset of lanes. Ideally no caller would
13991 : require *op for internal defs. */
13992 2132590 : if (SLP_TREE_REPRESENTATIVE (child))
13993 : {
13994 2131793 : *op = gimple_get_lhs (SLP_TREE_REPRESENTATIVE (child)->stmt);
13995 2131793 : return vect_is_simple_use (*op, vinfo, dt, def_stmt_info_out);
13996 : }
13997 : else
13998 : {
13999 797 : gcc_assert (SLP_TREE_PERMUTE_P (child));
14000 797 : *op = error_mark_node;
14001 797 : *dt = vect_internal_def;
14002 797 : if (def_stmt_info_out)
14003 0 : *def_stmt_info_out = NULL;
14004 797 : return true;
14005 : }
14006 : }
14007 : else
14008 : {
14009 1711586 : if (def_stmt_info_out)
14010 55502 : *def_stmt_info_out = NULL;
14011 1711586 : *op = SLP_TREE_SCALAR_OPS (child)[0];
14012 1711586 : *dt = SLP_TREE_DEF_TYPE (child);
14013 1711586 : return true;
14014 : }
14015 : }
14016 :
14017 : /* If OP is not NULL and is external or constant update its vector
14018 : type with VECTYPE. Returns true if successful or false if not,
14019 : for example when conflicting vector types are present. A NULL OP or an OP that is a vect_internal_def is left untouched and reported as success. */
14020 :
14021 : bool
14022 3495782 : vect_maybe_update_slp_op_vectype (slp_tree op, tree vectype)
14023 : {
14024 3495782 : if (!op || SLP_TREE_DEF_TYPE (op) == vect_internal_def)
14025 : return true;
14026 1150283 : if (SLP_TREE_VECTYPE (op))
14027 103392 : return types_compatible_p (SLP_TREE_VECTYPE (op), vectype);
14028 : /* For external defs refuse to produce VECTOR_BOOLEAN_TYPE_P, those
14029 : should be handled by patterns. Allow vect_constant_def for now
14030 : as well as the trivial single-lane uniform vect_external_def case
14031 : both of which we code-generate reasonably. */
14032 1046891 : if (VECTOR_BOOLEAN_TYPE_P (vectype)
14033 1534 : && SLP_TREE_DEF_TYPE (op) == vect_external_def
14034 1048037 : && SLP_TREE_LANES (op) > 1)
14035 : return false;
14036 1046732 : SLP_TREE_VECTYPE (op) = vectype;
14037 1046732 : return true;
14038 : }
14039 :
14040 : /* Function supportable_widening_operation
14041 :
14042 : Check whether an operation represented by the code CODE is a
14043 : widening operation that is supported by the target platform in
14044 : vector form (i.e., when operating on arguments of type VECTYPE_IN
14045 : producing a result of type VECTYPE_OUT).
14046 :
14047 : Widening operations we currently support are NOP (CONVERT), FLOAT,
14048 : FIX_TRUNC and WIDEN_MULT. This function checks if these operations
14049 : are supported by the target platform either directly (via vector
14050 : tree-codes), or via target builtins.
14051 :
14052 : When EVENODD_OK then also lane-swizzling operations are considered.
14053 :
14054 : Output:
14055 : - CODE1 and CODE2 are codes of vector operations to be used when
14056 : vectorizing the operation, if available.
14057 : - MULTI_STEP_CVT determines the number of required intermediate steps in
14058 : case of multi-step conversion (like char->short->int - in that case
14059 : MULTI_STEP_CVT will be 1).
14060 : - INTERM_TYPES contains the intermediate type required to perform the
14061 : widening operation (short in the above example). */
14062 :
14063 : bool
14064 485688 : supportable_widening_operation (code_helper code,
14065 : tree vectype_out, tree vectype_in,
14066 : bool evenodd_ok,
14067 : code_helper *code1,
14068 : code_helper *code2,
14069 : int *multi_step_cvt,
14070 : vec<tree> *interm_types)
14071 : {
        /* Overview: map CODE to a pair of vector operations (tree codes C1/C2
           or a pair of internal functions), require target handlers for both
           in the mode of VECTYPE_IN, and return true if they produce the mode
           of VECTYPE_OUT directly.  Otherwise, for conversions only, try up to
           MAX_INTERM_CVT_STEPS intermediate types, recording each one in
           INTERM_TYPES and counting them in *MULTI_STEP_CVT.  */
14072 485688 : machine_mode vec_mode;
14073 485688 : enum insn_code icode1, icode2;
14074 485688 : optab optab1 = unknown_optab, optab2 = unknown_optab;
14075 485688 : tree vectype = vectype_in;
14076 485688 : tree wide_vectype = vectype_out;
14077 485688 : tree_code c1 = MAX_TREE_CODES, c2 = MAX_TREE_CODES;
14078 485688 : int i;
14079 485688 : tree prev_type, intermediate_type;
14080 485688 : machine_mode intermediate_mode, prev_mode;
14081 485688 : optab optab3, optab4;
14082 :
14083 485688 : *multi_step_cvt = 0;
14084 :
        /* Pick the vector tree codes C1/C2 that implement the two halves of
           the widened result.  A CODE that is not a tree code yields
           MAX_TREE_CODES here and leaves C1/C2 untouched.  */
14085 485688 : switch (code.safe_as_tree_code ())
14086 : {
14087 : case MAX_TREE_CODES:
14088 : /* Don't set c1 and c2 if code is not a tree_code. */
14089 : break;
14090 :
14091 186703 : case WIDEN_MULT_EXPR:
14092 : /* The result of a vectorized widening operation usually requires
14093 : two vectors (because the widened results do not fit into one vector).
14094 : The generated vector results would normally be expected to be
14095 : generated in the same order as in the original scalar computation,
14096 : i.e. if 8 results are generated in each vector iteration, they are
14097 : to be organized as follows:
14098 : vect1: [res1,res2,res3,res4],
14099 : vect2: [res5,res6,res7,res8].
14100 :
14101 : However, in the special case that the result of the widening
14102 : operation is used in a reduction computation only, the order doesn't
14103 : matter (because when vectorizing a reduction we change the order of
14104 : the computation). Some targets can take advantage of this and
14105 : generate more efficient code. For example, targets like Altivec,
14106 : that support widen_mult using a sequence of {mult_even,mult_odd}
14107 : generate the following vectors:
14108 : vect1: [res1,res3,res5,res7],
14109 : vect2: [res2,res4,res6,res8].
14110 :
14111 : When vectorizing outer-loops, we execute the inner-loop sequentially
14112 : (each vectorized inner-loop iteration contributes to VF outer-loop
14113 : iterations in parallel). We therefore don't allow to change the
14114 : order of the computation in the inner-loop during outer-loop
14115 : vectorization. */
14116 : /* TODO: Another case in which order doesn't *really* matter is when we
14117 : widen and then contract again, e.g. (short)((int)x * y >> 8).
14118 : Normally, pack_trunc performs an even/odd permute, whereas the
14119 : repack from an even/odd expansion would be an interleave, which
14120 : would be significantly simpler for e.g. AVX2. */
14121 : /* In any case, in order to avoid duplicating the code below, recurse
14122 : on VEC_WIDEN_MULT_EVEN_EXPR. If it succeeds, all the return values
14123 : are properly set up for the caller. If we fail, we'll continue with
14124 : a VEC_WIDEN_MULT_LO/HI_EXPR check. */
14125 186703 : if (evenodd_ok
14126 186703 : && supportable_widening_operation (VEC_WIDEN_MULT_EVEN_EXPR,
14127 : vectype_out, vectype_in,
14128 : evenodd_ok, code1,
14129 : code2, multi_step_cvt,
14130 : interm_types))
14131 97993 : return true;
14132 : c1 = VEC_WIDEN_MULT_LO_EXPR;
14133 : c2 = VEC_WIDEN_MULT_HI_EXPR;
14134 : break;
14135 :
14136 : case DOT_PROD_EXPR:
14137 387695 : c1 = DOT_PROD_EXPR;
14138 387695 : c2 = DOT_PROD_EXPR;
14139 : break;
14140 :
14141 0 : case SAD_EXPR:
14142 0 : c1 = SAD_EXPR;
14143 0 : c2 = SAD_EXPR;
14144 0 : break;
14145 :
14146 184759 : case VEC_WIDEN_MULT_EVEN_EXPR:
14147 : /* Support the recursion induced just above. */
14148 184759 : c1 = VEC_WIDEN_MULT_EVEN_EXPR;
14149 184759 : c2 = VEC_WIDEN_MULT_ODD_EXPR;
14150 184759 : break;
14151 :
14152 9408 : case WIDEN_LSHIFT_EXPR:
14153 9408 : c1 = VEC_WIDEN_LSHIFT_LO_EXPR;
14154 9408 : c2 = VEC_WIDEN_LSHIFT_HI_EXPR;
14155 9408 : break;
14156 :
14157 40985 : CASE_CONVERT:
14158 40985 : c1 = VEC_UNPACK_LO_EXPR;
14159 40985 : c2 = VEC_UNPACK_HI_EXPR;
14160 40985 : break;
14161 :
14162 9195 : case FLOAT_EXPR:
14163 9195 : c1 = VEC_UNPACK_FLOAT_LO_EXPR;
14164 9195 : c2 = VEC_UNPACK_FLOAT_HI_EXPR;
14165 9195 : break;
14166 :
14167 119 : case FIX_TRUNC_EXPR:
14168 119 : c1 = VEC_UNPACK_FIX_TRUNC_LO_EXPR;
14169 119 : c2 = VEC_UNPACK_FIX_TRUNC_HI_EXPR;
14170 119 : break;
14171 :
14172 0 : default:
14173 0 : gcc_unreachable ();
14174 : }
14175 :
        /* On big-endian targets the LO/HI roles are exchanged; the EVEN/ODD
           pair is left as is.  */
14176 387695 : if (BYTES_BIG_ENDIAN && c1 != VEC_WIDEN_MULT_EVEN_EXPR)
14177 : std::swap (c1, c2);
14178 :
        /* NOTE(review): the optab selection in the next if/else-if is repeated
           verbatim inside the code.is_tree_code () branch further down, which
           assigns OPTAB1/OPTAB2 on every path for tree codes; for non-tree
           codes neither condition here can hold.  The assignments made here
           therefore look redundant -- confirm before relying on them.  */
14179 387695 : if (code == FIX_TRUNC_EXPR)
14180 : {
14181 : /* The signedness is determined from output operand. */
14182 119 : optab1 = optab_for_tree_code (c1, vectype_out, optab_default);
14183 119 : optab2 = optab_for_tree_code (c2, vectype_out, optab_default);
14184 : }
14185 688034 : else if (CONVERT_EXPR_CODE_P (code.safe_as_tree_code ())
14186 40985 : && VECTOR_BOOLEAN_TYPE_P (wide_vectype)
14187 8016 : && VECTOR_BOOLEAN_TYPE_P (vectype)
14188 8016 : && TYPE_MODE (wide_vectype) == TYPE_MODE (vectype)
14189 333521 : && SCALAR_INT_MODE_P (TYPE_MODE (vectype)))
14190 : {
14191 : /* If the input and result modes are the same, a different optab
14192 : is needed where we pass in the number of units in vectype. */
14193 : optab1 = vec_unpacks_sbool_lo_optab;
14194 : optab2 = vec_unpacks_sbool_hi_optab;
14195 : }
14196 :
14197 387695 : vec_mode = TYPE_MODE (vectype);
14198 387695 : if (widening_fn_p (code))
14199 : {
14200 : /* If this is an internal fn then we must check whether the target
14201 : supports either a low-high split or an even-odd split. */
14202 54519 : internal_fn ifn = as_internal_fn ((combined_fn) code);
14203 :
14204 54519 : internal_fn lo, hi, even, odd;
14205 54519 : lookup_hilo_internal_fn (ifn, &lo, &hi);
14206 54519 : if (BYTES_BIG_ENDIAN)
14207 : std::swap (lo, hi);
14208 54519 : *code1 = as_combined_fn (lo);
14209 54519 : *code2 = as_combined_fn (hi);
14210 54519 : optab1 = direct_internal_fn_optab (lo, {vectype, vectype});
14211 54519 : optab2 = direct_internal_fn_optab (hi, {vectype, vectype});
14212 :
14213 : /* If we don't support low-high, then check for even-odd. */
14214 54519 : if (!optab1
14215 54519 : || (icode1 = optab_handler (optab1, vec_mode)) == CODE_FOR_nothing
14216 0 : || !optab2
14217 54519 : || (icode2 = optab_handler (optab2, vec_mode)) == CODE_FOR_nothing)
14218 : {
14219 54519 : lookup_evenodd_internal_fn (ifn, &even, &odd);
14220 54519 : *code1 = as_combined_fn (even);
14221 54519 : *code2 = as_combined_fn (odd);
14222 54519 : optab1 = direct_internal_fn_optab (even, {vectype, vectype});
14223 54519 : optab2 = direct_internal_fn_optab (odd, {vectype, vectype});
14224 : }
14225 : }
14226 333176 : else if (code.is_tree_code ())
14227 : {
14228 333176 : if (code == FIX_TRUNC_EXPR)
14229 : {
14230 : /* The signedness is determined from output operand. */
14231 119 : optab1 = optab_for_tree_code (c1, vectype_out, optab_default);
14232 119 : optab2 = optab_for_tree_code (c2, vectype_out, optab_default);
14233 : }
14234 333057 : else if (CONVERT_EXPR_CODE_P ((tree_code) code.safe_as_tree_code ())
14235 40985 : && VECTOR_BOOLEAN_TYPE_P (wide_vectype)
14236 8016 : && VECTOR_BOOLEAN_TYPE_P (vectype)
14237 8016 : && TYPE_MODE (wide_vectype) == TYPE_MODE (vectype)
14238 333521 : && SCALAR_INT_MODE_P (TYPE_MODE (vectype)))
14239 : {
14240 : /* If the input and result modes are the same, a different optab
14241 : is needed where we pass in the number of units in vectype. */
14242 : optab1 = vec_unpacks_sbool_lo_optab;
14243 : optab2 = vec_unpacks_sbool_hi_optab;
14244 : }
14245 : else
14246 : {
14247 332593 : optab1 = optab_for_tree_code (c1, vectype, optab_default);
14248 332593 : optab2 = optab_for_tree_code (c2, vectype, optab_default);
14249 : }
14250 333176 : *code1 = c1;
14251 333176 : *code2 = c2;
14252 : }
14253 :
        /* Without an optab for both halves the operation is unsupported.  */
14254 387695 : if (!optab1 || !optab2)
14255 : return false;
14256 :
        /* Both halves need a handler in the input vector mode.  */
14257 387695 : if ((icode1 = optab_handler (optab1, vec_mode)) == CODE_FOR_nothing
14258 387695 : || (icode2 = optab_handler (optab2, vec_mode)) == CODE_FOR_nothing)
14259 230587 : return false;
14260 :
14261 :
        /* Single-step case: both instructions already produce the mode of
           WIDE_VECTYPE.  */
14262 157108 : if (insn_data[icode1].operand[0].mode == TYPE_MODE (wide_vectype)
14263 157108 : && insn_data[icode2].operand[0].mode == TYPE_MODE (wide_vectype))
14264 : {
14265 145633 : if (!VECTOR_BOOLEAN_TYPE_P (vectype))
14266 : return true;
14267 : /* For scalar masks we may have different boolean
14268 : vector types having the same QImode. Thus we
14269 : add additional check for elements number. */
14270 4229 : if (known_eq (TYPE_VECTOR_SUBPARTS (vectype),
14271 : TYPE_VECTOR_SUBPARTS (wide_vectype) * 2))
14272 : return true;
14273 : }
14274 :
14275 : /* Check if it's a multi-step conversion that can be done using intermediate
14276 : types. */
14277 :
14278 11680 : prev_type = vectype;
14279 11680 : prev_mode = vec_mode;
14280 :
        /* Multi-step widening is only attempted for conversion codes.  */
14281 242552 : if (!CONVERT_EXPR_CODE_P (code.safe_as_tree_code ()))
14282 : return false;
14283 :
14284 : /* We assume here that there will not be more than MAX_INTERM_CVT_STEPS
14285 : intermediate steps in promotion sequence. We try
14286 : MAX_INTERM_CVT_STEPS to get to WIDE_VECTYPE, and fail if we do
14287 : not. */
14288 11628 : interm_types->create (MAX_INTERM_CVT_STEPS);
14289 13018 : for (i = 0; i < MAX_INTERM_CVT_STEPS; i++)
14290 : {
        /* The intermediate type is derived from the result mode of the
           instruction found for the previous step.  */
14291 13018 : intermediate_mode = insn_data[icode1].operand[0].mode;
14292 13018 : if (VECTOR_BOOLEAN_TYPE_P (prev_type))
14293 4795 : intermediate_type
14294 4795 : = vect_halve_mask_nunits (prev_type, intermediate_mode);
14295 8223 : else if (VECTOR_MODE_P (intermediate_mode))
14296 : {
14297 8223 : tree intermediate_element_type
14298 8223 : = lang_hooks.types.type_for_mode (GET_MODE_INNER (intermediate_mode),
14299 8223 : TYPE_UNSIGNED (prev_type));
14300 8223 : intermediate_type
14301 8223 : = build_vector_type_for_mode (intermediate_element_type,
14302 : intermediate_mode);
14303 8223 : }
14304 : else
14305 0 : intermediate_type
14306 0 : = lang_hooks.types.type_for_mode (intermediate_mode,
14307 0 : TYPE_UNSIGNED (prev_type));
14308 :
14309 13018 : if (VECTOR_BOOLEAN_TYPE_P (intermediate_type)
14310 4795 : && VECTOR_BOOLEAN_TYPE_P (wide_vectype)
14311 4795 : && intermediate_mode == TYPE_MODE (wide_vectype)
14312 13287 : && SCALAR_INT_MODE_P (intermediate_mode))
14313 : {
14314 : /* If the input and result modes are the same, a different optab
14315 : is needed where we pass in the number of units in vectype. */
14316 : optab3 = vec_unpacks_sbool_lo_optab;
14317 : optab4 = vec_unpacks_sbool_hi_optab;
14318 : }
14319 : else
14320 : {
14321 12749 : optab3 = optab_for_tree_code (c1, intermediate_type, optab_default);
14322 12749 : optab4 = optab_for_tree_code (c2, intermediate_type, optab_default);
14323 : }
14324 :
        /* The step is usable only if OPTAB1/OPTAB2 map PREV_MODE to
           INTERMEDIATE_MODE and OPTAB3/OPTAB4 have handlers in
           INTERMEDIATE_MODE.  The tests are order dependent: on success
           ICODE1/ICODE2 hold the OPTAB3/OPTAB4 handlers.
           NOTE(review): OPTAB1/OPTAB2 are not replaced by OPTAB3/OPTAB4 when
           advancing to the next step, whereas supportable_narrowing_operation
           does update its optab -- confirm this is intended.  */
14325 13018 : if (!optab3 || !optab4
14326 13018 : || (icode1 = optab_handler (optab1, prev_mode)) == CODE_FOR_nothing
14327 12986 : || insn_data[icode1].operand[0].mode != intermediate_mode
14328 12986 : || (icode2 = optab_handler (optab2, prev_mode)) == CODE_FOR_nothing
14329 12986 : || insn_data[icode2].operand[0].mode != intermediate_mode
14330 12986 : || ((icode1 = optab_handler (optab3, intermediate_mode))
14331 : == CODE_FOR_nothing)
14332 25751 : || ((icode2 = optab_handler (optab4, intermediate_mode))
14333 : == CODE_FOR_nothing))
14334 : break;
14335 :
        /* quick_push relies on the space reserved by create () above.  */
14336 12733 : interm_types->quick_push (intermediate_type);
14337 12733 : (*multi_step_cvt)++;
14338 :
        /* Done once the latest step produces the mode of WIDE_VECTYPE (with
           the extra element-count check for boolean vectors).  */
14339 12733 : if (insn_data[icode1].operand[0].mode == TYPE_MODE (wide_vectype)
14340 12733 : && insn_data[icode2].operand[0].mode == TYPE_MODE (wide_vectype))
14341 : {
14342 11407 : if (!VECTOR_BOOLEAN_TYPE_P (vectype))
14343 : return true;
14344 3785 : if (known_eq (TYPE_VECTOR_SUBPARTS (intermediate_type),
14345 : TYPE_VECTOR_SUBPARTS (wide_vectype) * 2))
14346 : return true;
14347 : }
14348 :
14349 1390 : prev_type = intermediate_type;
14350 1390 : prev_mode = intermediate_mode;
14351 : }
14352 :
        /* No supported sequence was found; drop the partially built list.  */
14353 285 : interm_types->release ();
14354 285 : return false;
14355 : }
14356 :
14357 :
14358 : /* Function supportable_narrowing_operation
14359 :
14360 : Check whether an operation represented by the code CODE is a
14361 : narrowing operation that is supported by the target platform in
14362 : vector form (i.e., when operating on arguments of type VECTYPE_IN
14363 : and producing a result of type VECTYPE_OUT).
14364 :
14365 : Narrowing operations we currently support are NOP (CONVERT), FIX_TRUNC
14366 : and FLOAT. This function checks if these operations are supported by
14367 : the target platform directly via vector tree-codes.
14368 :
14369 : Output:
14370 : - CODE1 is the code of a vector operation to be used when
14371 : vectorizing the operation, if available.
14372 : - MULTI_STEP_CVT determines the number of required intermediate steps in
14373 : case of multi-step conversion (like int->short->char - in that case
14374 : MULTI_STEP_CVT will be 1).
14375 : - INTERM_TYPES contains the intermediate type required to perform the
14376 : narrowing operation (short in the above example). */
14377 :
14378 : bool
14379 42095 : supportable_narrowing_operation (code_helper code,
14380 : tree vectype_out, tree vectype_in,
14381 : code_helper *code1, int *multi_step_cvt,
14382 : vec<tree> *interm_types)
14383 : {
14384 42095 : machine_mode vec_mode;
14385 42095 : enum insn_code icode1;
14386 42095 : optab optab1, interm_optab;
14387 42095 : tree vectype = vectype_in;
14388 42095 : tree narrow_vectype = vectype_out;
14389 42095 : enum tree_code c1;
14390 42095 : tree intermediate_type, prev_type;
14391 42095 : machine_mode intermediate_mode, prev_mode;
14392 42095 : int i;
14393 42095 : unsigned HOST_WIDE_INT n_elts;
14394 42095 : bool uns;
14395 :
        /* Only tree codes are handled; any other kind of CODE is rejected.  */
14396 42095 : if (!code.is_tree_code ())
14397 : return false;
14398 :
14399 42095 : *multi_step_cvt = 0;
        /* Select the pack tree code C1 and the optab implementing it.  */
14400 42095 : switch ((tree_code) code)
14401 : {
14402 41251 : CASE_CONVERT:
14403 41251 : c1 = VEC_PACK_TRUNC_EXPR;
        /* Boolean vectors held in a scalar integer mode with a constant
           element count below BITS_PER_UNIT use the dedicated sbool optab.  */
14404 41251 : if (VECTOR_BOOLEAN_TYPE_P (narrow_vectype)
14405 11684 : && VECTOR_BOOLEAN_TYPE_P (vectype)
14406 11684 : && SCALAR_INT_MODE_P (TYPE_MODE (vectype))
14407 5262 : && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&n_elts)
14408 46513 : && n_elts < BITS_PER_UNIT)
14409 : optab1 = vec_pack_sbool_trunc_optab;
14410 : else
14411 38766 : optab1 = optab_for_tree_code (c1, vectype, optab_default);
14412 : break;
14413 :
14414 561 : case FIX_TRUNC_EXPR:
14415 561 : c1 = VEC_PACK_FIX_TRUNC_EXPR;
14416 : /* The signedness is determined from output operand. */
14417 561 : optab1 = optab_for_tree_code (c1, vectype_out, optab_default);
14418 561 : break;
14419 :
14420 283 : case FLOAT_EXPR:
14421 283 : c1 = VEC_PACK_FLOAT_EXPR;
14422 283 : optab1 = optab_for_tree_code (c1, vectype, optab_default);
14423 283 : break;
14424 :
14425 0 : default:
14426 0 : gcc_unreachable ();
14427 : }
14428 :
14429 42095 : if (!optab1)
14430 : return false;
14431 :
14432 42095 : vec_mode = TYPE_MODE (vectype);
14433 42095 : if ((icode1 = optab_handler (optab1, vec_mode)) == CODE_FOR_nothing)
14434 : return false;
14435 :
        /* *CODE1 is set as soon as the first step has a handler, even if the
           function later returns false.  */
14436 37834 : *code1 = c1;
14437 :
        /* Single-step case: the instruction already produces the mode of
           NARROW_VECTYPE.  */
14438 37834 : if (insn_data[icode1].operand[0].mode == TYPE_MODE (narrow_vectype))
14439 : {
14440 23588 : if (!VECTOR_BOOLEAN_TYPE_P (vectype))
14441 : return true;
14442 : /* For scalar masks we may have different boolean
14443 : vector types having the same QImode. Thus we
14444 : add additional check for elements number. */
14445 5821 : if (known_eq (TYPE_VECTOR_SUBPARTS (vectype) * 2,
14446 : TYPE_VECTOR_SUBPARTS (narrow_vectype)))
14447 : return true;
14448 : }
14449 :
        /* No multi-step sequence is attempted for FLOAT_EXPR.  */
14450 14397 : if (code == FLOAT_EXPR)
14451 : return false;
14452 :
14453 : /* Check if it's a multi-step conversion that can be done using intermediate
14454 : types. */
14455 14397 : prev_mode = vec_mode;
14456 14397 : prev_type = vectype;
14457 14397 : if (code == FIX_TRUNC_EXPR)
14458 94 : uns = TYPE_UNSIGNED (vectype_out);
14459 : else
14460 14303 : uns = TYPE_UNSIGNED (vectype);
14461 :
14462 : /* For multi-step FIX_TRUNC_EXPR prefer signed floating to integer
14463 : conversion over unsigned, as unsigned FIX_TRUNC_EXPR is often more
14464 : costly than signed. */
14465 14397 : if (code == FIX_TRUNC_EXPR && uns)
14466 : {
14467 28 : enum insn_code icode2;
14468 :
14469 28 : intermediate_type
14470 28 : = lang_hooks.types.type_for_mode (TYPE_MODE (vectype_out), 0);
14471 28 : interm_optab
14472 28 : = optab_for_tree_code (c1, intermediate_type, optab_default);
        /* NOTE(review): ICODE2 is looked up with OPTAB1, the same optab and
           mode that produced ICODE1 above, so the mode comparison below is
           trivially true whenever INTERM_OPTAB exists.  Presumably
           INTERM_OPTAB was meant to be queried here -- confirm.  */
14473 28 : if (interm_optab != unknown_optab
14474 28 : && (icode2 = optab_handler (optab1, vec_mode)) != CODE_FOR_nothing
14475 28 : && insn_data[icode1].operand[0].mode
14476 28 : == insn_data[icode2].operand[0].mode)
14477 : {
14478 : uns = false;
14479 : optab1 = interm_optab;
14480 : icode1 = icode2;
14481 : }
14482 : }
14483 :
14484 : /* We assume here that there will not be more than MAX_INTERM_CVT_STEPS
14485 : intermediate steps in demotion sequence. We try
14486 : MAX_INTERM_CVT_STEPS to get to NARROW_VECTYPE, and fail if we do not. */
14487 14397 : interm_types->create (MAX_INTERM_CVT_STEPS);
14488 30940 : for (i = 0; i < MAX_INTERM_CVT_STEPS; i++)
14489 : {
        /* The intermediate type is derived from the result mode of the
           instruction found for the previous step.  */
14490 16543 : intermediate_mode = insn_data[icode1].operand[0].mode;
14491 16543 : if (VECTOR_BOOLEAN_TYPE_P (prev_type))
14492 7208 : intermediate_type
14493 7208 : = vect_double_mask_nunits (prev_type, intermediate_mode);
14494 : else
14495 9335 : intermediate_type
14496 9335 : = lang_hooks.types.type_for_mode (intermediate_mode, uns);
14497 16543 : if (VECTOR_BOOLEAN_TYPE_P (intermediate_type)
14498 7208 : && VECTOR_BOOLEAN_TYPE_P (prev_type)
14499 7208 : && SCALAR_INT_MODE_P (prev_mode)
14500 3134 : && TYPE_VECTOR_SUBPARTS (intermediate_type).is_constant (&n_elts)
14501 19677 : && n_elts < BITS_PER_UNIT)
14502 : interm_optab = vec_pack_sbool_trunc_optab;
14503 : else
14504 16189 : interm_optab
14505 16189 : = optab_for_tree_code (VEC_PACK_TRUNC_EXPR, intermediate_type,
14506 : optab_default);
        /* The step is usable only if OPTAB1 maps PREV_MODE to
           INTERMEDIATE_MODE and INTERM_OPTAB has a handler in
           INTERMEDIATE_MODE.  The tests are order dependent: on success
           ICODE1 holds the INTERM_OPTAB handler.  */
14507 354 : if (!interm_optab
14508 16543 : || ((icode1 = optab_handler (optab1, prev_mode)) == CODE_FOR_nothing)
14509 16543 : || insn_data[icode1].operand[0].mode != intermediate_mode
14510 32732 : || ((icode1 = optab_handler (interm_optab, intermediate_mode))
14511 : == CODE_FOR_nothing))
14512 : break;
14513 :
        /* quick_push relies on the space reserved by create () above.  */
14514 15628 : interm_types->quick_push (intermediate_type);
14515 15628 : (*multi_step_cvt)++;
14516 :
14517 15628 : if (insn_data[icode1].operand[0].mode == TYPE_MODE (narrow_vectype))
14518 : {
14519 13482 : if (!VECTOR_BOOLEAN_TYPE_P (vectype))
14520 : return true;
14521 5030 : if (known_eq (TYPE_VECTOR_SUBPARTS (intermediate_type) * 2,
14522 : TYPE_VECTOR_SUBPARTS (narrow_vectype)))
14523 : return true;
14524 : }
14525 :
        /* Advance: the next iteration starts from the intermediate type and
           uses the optab that was just validated.  */
14526 2146 : prev_mode = intermediate_mode;
14527 2146 : prev_type = intermediate_type;
14528 2146 : optab1 = interm_optab;
14529 : }
14530 :
        /* No supported sequence was found; drop the partially built list.  */
14531 915 : interm_types->release ();
14532 915 : return false;
14533 : }
14534 :
14535 : /* Function supportable_indirect_convert_operation
14536 :
14537 : Check whether an operation represented by the code CODE is single or multi
14538 : operations that are supported by the target platform in
14539 : vector form (i.e., when operating on arguments of type VECTYPE_IN
14540 : producing a result of type VECTYPE_OUT).
14541 :
14542 : Convert operations we currently support directly are FIX_TRUNC and FLOAT.
14543 : This function checks if these operations are supported
14544 : by the target platform directly (via vector tree-codes).
14545 :
14546 : Output:
14547 : - converts contains some pairs to perform the convert operation,
14548 : the pair's first is the intermediate type, and its second is the code of
14549 : a vector operation to be used when converting the operation from the
14550 : previous type to the intermediate type. */
14551 : bool
14552 85839 : supportable_indirect_convert_operation (code_helper code,
14553 : tree vectype_out,
14554 : tree vectype_in,
14555 : vec<std::pair<tree, tree_code> > &converts,
14556 : tree op0, slp_tree slp_op0)
14557 : {
14558 85839 : bool found_mode = false;
14559 85839 : scalar_mode lhs_mode = GET_MODE_INNER (TYPE_MODE (vectype_out));
14560 85839 : scalar_mode rhs_mode = GET_MODE_INNER (TYPE_MODE (vectype_in));
14561 85839 : tree_code tc1, tc2, code1, code2;
14562 :
14563 85839 : tree cvt_type = NULL_TREE;
14564 85839 : poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype_in);
14565 :
        /* If the conversion is supported in one step, record that single
           step and stop.  */
14566 85839 : if (supportable_convert_operation ((tree_code) code,
14567 : vectype_out,
14568 : vectype_in,
14569 : &tc1))
14570 : {
14571 19182 : converts.safe_push (std::make_pair (vectype_out, tc1));
14572 19182 : return true;
14573 : }
14574 :
14575 : /* For conversions between float and integer types try whether
14576 : we can use intermediate signed integer types to support the
14577 : conversion. */
14578 133314 : if (GET_MODE_SIZE (lhs_mode) != GET_MODE_SIZE (rhs_mode)
14579 66657 : && (code == FLOAT_EXPR
14580 3164 : || (code == FIX_TRUNC_EXPR && !flag_trapping_math)))
14581 : {
14582 472 : bool demotion = GET_MODE_SIZE (rhs_mode) > GET_MODE_SIZE (lhs_mode);
14583 236 : bool float_expr_p = code == FLOAT_EXPR;
14584 236 : unsigned short target_size;
14585 236 : scalar_mode intermediate_mode;
        /* INTERMEDIATE_MODE is the mode the search below starts widening
           from, and TARGET_SIZE is the largest size it may reach: the
           element size of the wider of the two vector types.  */
14586 236 : if (demotion)
14587 : {
14588 84 : intermediate_mode = lhs_mode;
14589 84 : target_size = GET_MODE_SIZE (rhs_mode);
14590 : }
14591 : else
14592 : {
14593 152 : target_size = GET_MODE_SIZE (lhs_mode);
14594 152 : if (!int_mode_for_size
14595 152 : (GET_MODE_BITSIZE (rhs_mode), 0).exists (&intermediate_mode))
14596 132 : return false;
14597 : }
        /* CODE2 converts VECTYPE_IN to the intermediate type and CODE1
           converts the intermediate type to VECTYPE_OUT.  Exactly one of
           them is CODE itself; the other is a plain NOP_EXPR conversion.  */
14598 236 : code1 = float_expr_p ? (tree_code) code : NOP_EXPR;
14599 : code2 = float_expr_p ? NOP_EXPR : (tree_code) code;
14600 236 : opt_scalar_mode mode_iter;
14601 411 : FOR_EACH_2XWIDER_MODE (mode_iter, intermediate_mode)
14602 : {
14603 411 : intermediate_mode = mode_iter.require ();
14604 :
14605 822 : if (GET_MODE_SIZE (intermediate_mode) > target_size)
14606 : break;
14607 :
14608 345 : scalar_mode cvt_mode;
14609 345 : if (!int_mode_for_size
14610 345 : (GET_MODE_BITSIZE (intermediate_mode), 0).exists (&cvt_mode))
14611 : break;
14612 :
        /* Candidate scalar intermediate type: an integer type with the
           bit size of CVT_MODE, requested with unsignedp == 0.  */
14613 315 : cvt_type = build_nonstandard_integer_type
14614 315 : (GET_MODE_BITSIZE (cvt_mode), 0);
14615 :
14616 : /* Check if the intermediate type can hold OP0's range.
14617 : When converting from float to integer this is not necessary
14618 : because values that do not fit the (smaller) target type are
14619 : unspecified anyway. */
14620 315 : if (demotion && float_expr_p)
14621 : {
14622 8 : wide_int op_min_value, op_max_value;
14623 : /* For vector form, it looks like op0 doesn't have RANGE_INFO.
14624 : In the future, if it is supported, changes may need to be made
14625 : to this part, such as checking the RANGE of each element
14626 : in the vector. */
14627 8 : if (slp_op0)
14628 : {
14629 4 : tree def;
14630 : /* ??? Merge ranges in case of more than one lane. */
14631 4 : if (SLP_TREE_LANES (slp_op0) != 1
14632 0 : || !(def = vect_get_slp_scalar_def (slp_op0, 0))
14633 4 : || !vect_get_range_info (def,
14634 : &op_min_value, &op_max_value))
14635 : break;
14636 : }
14637 4 : else if (!op0
14638 0 : || TREE_CODE (op0) != SSA_NAME
14639 0 : || !SSA_NAME_RANGE_INFO (op0)
14640 4 : || !vect_get_range_info (op0, &op_min_value,
14641 : &op_max_value))
14642 : break;
14643 :
        /* Try the next wider mode if either range bound needs more signed
           bits than CVT_TYPE provides.  */
14644 0 : if (cvt_type == NULL_TREE
14645 0 : || (wi::min_precision (op_max_value, SIGNED)
14646 0 : > TYPE_PRECISION (cvt_type))
14647 0 : || (wi::min_precision (op_min_value, SIGNED)
14648 0 : > TYPE_PRECISION (cvt_type)))
14649 0 : continue;
14650 8 : }
14651 :
        /* From here on CVT_TYPE is the vector type built from the scalar
           candidate, with NELTS elements requested.  */
14652 307 : cvt_type = get_related_vectype_for_scalar_type (TYPE_MODE (vectype_in),
14653 : cvt_type,
14654 : nelts);
14655 : /* This should only happen for SLP as long as loop vectorizer
14656 : only supports same-sized vector. */
14657 482 : if (cvt_type == NULL_TREE
14658 439 : || maybe_ne (TYPE_VECTOR_SUBPARTS (cvt_type), nelts)
14659 307 : || !supportable_convert_operation ((tree_code) code1,
14660 : vectype_out,
14661 : cvt_type, &tc1)
14662 515 : || !supportable_convert_operation ((tree_code) code2,
14663 : cvt_type,
14664 : vectype_in, &tc2))
14665 175 : continue;
14666 :
14667 : found_mode = true;
14668 : break;
14669 : }
14670 :
14671 236 : if (found_mode)
14672 : {
        /* Record VECTYPE_IN -> CVT_TYPE first; the CVT_TYPE -> VECTYPE_OUT
           step is only recorded when the two types differ in mode.  */
14673 132 : converts.safe_push (std::make_pair (cvt_type, tc2));
14674 132 : if (TYPE_MODE (cvt_type) != TYPE_MODE (vectype_out))
14675 132 : converts.safe_push (std::make_pair (vectype_out, tc1));
14676 132 : return true;
14677 : }
14678 : }
14679 : return false;
14680 : }
14681 :
14682 : /* Generate and return a vector mask of MASK_TYPE such that
14683 : mask[I] is true iff J + START_INDEX < END_INDEX for all J <= I.
14684 : Add the statements to SEQ. If NAME is nonnull it is passed to
        make_temp_ssa_name when creating the result; otherwise the result
        comes from make_ssa_name. */
14685 :
14686 : tree
14687 0 : vect_gen_while (gimple_seq *seq, tree mask_type, tree start_index,
14688 : tree end_index, const char *name)
14689 : {
14690 0 : tree cmp_type = TREE_TYPE (start_index);
        /* Checked only in checking builds: IFN_WHILE_ULT must be supported
           for this comparison type and mask type.  */
14691 0 : gcc_checking_assert (direct_internal_fn_supported_p (IFN_WHILE_ULT,
14692 : cmp_type, mask_type,
14693 : OPTIMIZE_FOR_SPEED));
        /* The third argument is a zero constant of MASK_TYPE.
           NOTE(review): presumably it only conveys the mask type to the
           internal function -- confirm against the IFN_WHILE_ULT
           definition.  */
14694 0 : gcall *call = gimple_build_call_internal (IFN_WHILE_ULT, 3,
14695 : start_index, end_index,
14696 : build_zero_cst (mask_type));
14697 0 : tree tmp;
14698 0 : if (name)
14699 0 : tmp = make_temp_ssa_name (mask_type, NULL, name);
14700 : else
14701 0 : tmp = make_ssa_name (mask_type);
14702 0 : gimple_call_set_lhs (call, tmp);
14703 0 : gimple_seq_add_stmt (seq, call);
14704 0 : return tmp;
14705 : }
14706 :
14707 : /* Generate a vector mask of type MASK_TYPE for which index I is false iff
14708 : J + START_INDEX < END_INDEX for all J <= I. Add the statements to SEQ.
        Return the SSA value holding the inverted mask. */
14709 :
14710 : tree
14711 0 : vect_gen_while_not (gimple_seq *seq, tree mask_type, tree start_index,
14712 : tree end_index)
14713 : {
        /* Build the WHILE mask, then invert it with BIT_NOT_EXPR.
           NOTE(review): vect_gen_while is called without its NAME argument;
           presumably the declaration (not visible here) supplies a
           default.  */
14714 0 : tree tmp = vect_gen_while (seq, mask_type, start_index, end_index);
14715 0 : return gimple_build (seq, BIT_NOT_EXPR, mask_type, tmp);
14716 : }
14717 :
14718 : /* Try to compute the vector types required to vectorize STMT_INFO,
14719 : returning true on success and false if vectorization isn't possible.
14720 : If GROUP_SIZE is nonzero and we're performing BB vectorization,
14721 : make sure that the number of elements in the vectors is no bigger
14722 : than GROUP_SIZE.
14723 :
14724 : On success:
14725 :
14726 : - Set *STMT_VECTYPE_OUT to:
14727 : - NULL_TREE if the statement doesn't need to be vectorized;
14728 : - the equivalent of STMT_VINFO_VECTYPE otherwise.
14729 :
14730 : - Set *NUNITS_VECTYPE_OUT to the vector type that contains the maximum
14731 : number of units needed to vectorize STMT_INFO, or NULL_TREE if the
14732 : statement does not help to determine the overall number of units. */
14733 :
14734 : opt_result
14735 5766336 : vect_get_vector_types_for_stmt (vec_info *vinfo, stmt_vec_info stmt_info,
14736 : tree *stmt_vectype_out,
14737 : tree *nunits_vectype_out,
14738 : unsigned int group_size)
14739 : {
14740 5766336 : gimple *stmt = stmt_info->stmt;
14741 :
14742 : /* For BB vectorization, we should always have a group size once we've
14743 : constructed the SLP tree; the only valid uses of zero GROUP_SIZEs
14744 : are tentative requests during things like early data reference
14745 : analysis and pattern recognition. */
14746 5766336 : if (is_a <bb_vec_info> (vinfo))
14747 4518568 : gcc_assert (vinfo->slp_instances.is_empty () || group_size != 0);
14748 : else
14749 : group_size = 0;
14750 :
        /* Both outputs start as NULL_TREE and stay that way on the early
           success return below.  */
14751 5766336 : *stmt_vectype_out = NULL_TREE;
14752 5766336 : *nunits_vectype_out = NULL_TREE;
14753 :
14754 5766336 : if (gimple_get_lhs (stmt) == NULL_TREE
14755 : /* Allow vector conditionals through here. */
14756 2762 : && !is_a <gcond *> (stmt)
14757 : /* MASK_STORE and friends have no lhs, but are ok. */
14758 5771840 : && !(is_gimple_call (stmt)
14759 2762 : && gimple_call_internal_p (stmt)
14760 2742 : && internal_store_fn_p (gimple_call_internal_fn (stmt))))
14761 : {
14762 20 : if (is_a <gcall *> (stmt))
14763 : {
14764 : /* Ignore calls with no lhs. These must be calls to
14765 : #pragma omp simd functions, and what vectorization factor
14766 : it really needs can't be determined until
14767 : vectorizable_simd_clone_call. */
14768 20 : if (dump_enabled_p ())
14769 18 : dump_printf_loc (MSG_NOTE, vect_location,
14770 : "defer to SIMD clone analysis.\n");
14771 20 : return opt_result::success ();
14772 : }
14773 :
14774 0 : return opt_result::failure_at (stmt,
14775 : "not vectorized: irregular stmt: %G", stmt);
14776 : }
14777 :
        /* Determine VECTYPE in one of three ways: reuse a precomputed
           STMT_VINFO_VECTYPE (only when GROUP_SIZE is zero), build a mask
           type from the statement mask precision, or derive it from the
           scalar type of the data reference or of the lhs.  */
14778 5766316 : tree vectype;
14779 5766316 : tree scalar_type = NULL_TREE;
14780 5766316 : if (group_size == 0 && STMT_VINFO_VECTYPE (stmt_info))
14781 : {
14782 1578986 : vectype = STMT_VINFO_VECTYPE (stmt_info);
14783 1578986 : if (dump_enabled_p ())
14784 79463 : dump_printf_loc (MSG_NOTE, vect_location,
14785 : "precomputed vectype: %T\n", vectype);
14786 : }
14787 4187330 : else if (vect_use_mask_type_p (stmt_info))
14788 : {
14789 194502 : unsigned int precision = stmt_info->mask_precision;
14790 194502 : scalar_type = build_nonstandard_integer_type (precision, 1);
14791 194502 : vectype = get_mask_type_for_scalar_type (vinfo, scalar_type, group_size);
14792 194502 : if (!vectype)
14793 0 : return opt_result::failure_at (stmt, "not vectorized: unsupported"
14794 : " data-type %T\n", scalar_type);
14795 194502 : if (dump_enabled_p ())
14796 4737 : dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n", vectype);
14797 : }
14798 : else
14799 : {
14800 : /* If we got here with a gcond it means that the target had no available vector
14801 : mode for the scalar type. We can't vectorize so abort. */
        /* NOTE(review): SCALAR_TYPE is still NULL_TREE at this point, so the
           %T in the message below does not name a type -- confirm whether a
           real type should be reported.  */
14802 3992828 : if (is_a <gcond *> (stmt))
14803 0 : return opt_result::failure_at (stmt,
14804 : "not vectorized:"
14805 : " unsupported data-type for gcond %T\n",
14806 : scalar_type);
14807 :
14808 3992828 : if (data_reference *dr = STMT_VINFO_DATA_REF (stmt_info))
14809 1456736 : scalar_type = TREE_TYPE (DR_REF (dr));
14810 : else
14811 2536092 : scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
14812 :
14813 3992828 : if (dump_enabled_p ())
14814 : {
14815 62318 : if (group_size)
14816 7748 : dump_printf_loc (MSG_NOTE, vect_location,
14817 : "get vectype for scalar type (group size %d):"
14818 : " %T\n", group_size, scalar_type);
14819 : else
14820 54570 : dump_printf_loc (MSG_NOTE, vect_location,
14821 : "get vectype for scalar type: %T\n", scalar_type);
14822 : }
14823 3992828 : vectype = get_vectype_for_scalar_type (vinfo, scalar_type, group_size);
14824 3992828 : if (!vectype)
14825 207005 : return opt_result::failure_at (stmt,
14826 : "not vectorized:"
14827 : " unsupported data-type %T\n",
14828 : scalar_type);
14829 :
14830 3785823 : if (dump_enabled_p ())
14831 62119 : dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n", vectype);
14832 : }
14833 :
        /* SCALAR_TYPE is only set on the second and third paths above; a
           scalar type that itself has a vector mode is rejected.  */
14834 4059788 : if (scalar_type && VECTOR_MODE_P (TYPE_MODE (scalar_type)))
14835 0 : return opt_result::failure_at (stmt,
14836 : "not vectorized: vector stmt in loop:%G",
14837 : stmt);
14838 :
        /* *STMT_VECTYPE_OUT is written here and is not reset if one of the
           later checks fails.  */
14839 5559311 : *stmt_vectype_out = vectype;
14840 :
14841 : /* Don't try to compute scalar types if the stmt produces a boolean
14842 : vector; use the existing vector type instead. */
14843 5559311 : tree nunits_vectype = vectype;
14844 5559311 : if (!VECTOR_BOOLEAN_TYPE_P (vectype))
14845 : {
14846 : /* The number of units is set according to the smallest scalar
14847 : type (or the largest vector size, but we only support one
14848 : vector size per vectorization). */
14849 5044044 : scalar_type = vect_get_smallest_scalar_type (stmt_info,
14850 5044044 : TREE_TYPE (vectype));
14851 5044044 : if (!types_compatible_p (scalar_type, TREE_TYPE (vectype)))
14852 : {
14853 982161 : if (dump_enabled_p ())
14854 8345 : dump_printf_loc (MSG_NOTE, vect_location,
14855 : "get vectype for smallest scalar type: %T\n",
14856 : scalar_type);
14857 982161 : nunits_vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
14858 : group_size);
14859 982161 : if (!nunits_vectype)
14860 10 : return opt_result::failure_at
14861 10 : (stmt, "not vectorized: unsupported data-type %T\n",
14862 : scalar_type);
14863 982151 : if (dump_enabled_p ())
14864 8345 : dump_printf_loc (MSG_NOTE, vect_location, "nunits vectype: %T\n",
14865 : nunits_vectype);
14866 : }
14867 : }
14868 :
        /* Fail unless multiple_p holds for the element count of
           NUNITS_VECTYPE with respect to that of the statement vector
           type.  */
14869 5559301 : if (!multiple_p (TYPE_VECTOR_SUBPARTS (nunits_vectype),
14870 5559301 : TYPE_VECTOR_SUBPARTS (*stmt_vectype_out)))
14871 0 : return opt_result::failure_at (stmt,
14872 : "Not vectorized: Incompatible number "
14873 : "of vector subparts between %T and %T\n",
14874 : nunits_vectype, *stmt_vectype_out);
14875 :
14876 5559301 : if (dump_enabled_p ())
14877 : {
14878 146319 : dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
14879 146319 : dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (nunits_vectype));
14880 146319 : dump_printf (MSG_NOTE, "\n");
14881 : }
14882 :
14883 5559301 : *nunits_vectype_out = nunits_vectype;
14884 5559301 : return opt_result::success ();
14885 : }
14886 :
14887 : /* Generate and return statement sequence that sets vector length LEN that is:
14888 :
14889 : min_of_start_and_end = min (START_INDEX, END_INDEX);
14890 : left_len = END_INDEX - min_of_start_and_end;
14891 : rhs = min (left_len, LEN_LIMIT);
14892 : LEN = rhs;
14893 :
14894 : Note: the cost of the code generated by this function is modeled
14895 : by vect_estimate_min_profitable_iters, so changes here may need
14896 : corresponding changes there. */
14897 :
14898 : gimple_seq
14899 0 : vect_gen_len (tree len, tree start_index, tree end_index, tree len_limit)
14900 : {
14901 0 : gimple_seq stmts = NULL;
14902 0 : tree len_type = TREE_TYPE (len);
        /* START_INDEX must have exactly the type of LEN; all arithmetic
           below is built in that type.  */
14903 0 : gcc_assert (TREE_TYPE (start_index) == len_type);
14904 :
        /* Clamping START_INDEX to END_INDEX first means the subtraction
           yields zero when START_INDEX is beyond END_INDEX.
           NOTE(review): presumably this guards against wrap-around in an
           unsigned LEN type -- confirm.  */
14905 0 : tree min = gimple_build (&stmts, MIN_EXPR, len_type, start_index, end_index);
14906 0 : tree left_len = gimple_build (&stmts, MINUS_EXPR, len_type, end_index, min);
14907 0 : tree rhs = gimple_build (&stmts, MIN_EXPR, len_type, left_len, len_limit);
        /* The final assignment to LEN is appended to the same sequence.  */
14908 0 : gimple* stmt = gimple_build_assign (len, rhs);
14909 0 : gimple_seq_add_stmt (&stmts, stmt);
14910 :
14911 0 : return stmts;
14912 : }
14913 :
|