Line data Source code
1 : /* SLP - Basic Block Vectorization
2 : Copyright (C) 2007-2026 Free Software Foundation, Inc.
3 : Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 : and Ira Rosen <irar@il.ibm.com>
5 :
6 : This file is part of GCC.
7 :
8 : GCC is free software; you can redistribute it and/or modify it under
9 : the terms of the GNU General Public License as published by the Free
10 : Software Foundation; either version 3, or (at your option) any later
11 : version.
12 :
13 : GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 : WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 : FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 : for more details.
17 :
18 : You should have received a copy of the GNU General Public License
19 : along with GCC; see the file COPYING3. If not see
20 : <http://www.gnu.org/licenses/>. */
21 :
22 : #include "config.h"
23 : #define INCLUDE_ALGORITHM
24 : #include "system.h"
25 : #include "coretypes.h"
26 : #include "backend.h"
27 : #include "target.h"
28 : #include "rtl.h"
29 : #include "tree.h"
30 : #include "gimple.h"
31 : #include "tree-pass.h"
32 : #include "ssa.h"
33 : #include "optabs-tree.h"
34 : #include "insn-config.h"
35 : #include "recog.h" /* FIXME: for insn_data */
36 : #include "fold-const.h"
37 : #include "stor-layout.h"
38 : #include "gimple-iterator.h"
39 : #include "cfgloop.h"
40 : #include "tree-vectorizer.h"
41 : #include "langhooks.h"
42 : #include "gimple-walk.h"
43 : #include "dbgcnt.h"
44 : #include "tree-vector-builder.h"
45 : #include "vec-perm-indices.h"
46 : #include "gimple-fold.h"
47 : #include "internal-fn.h"
48 : #include "dump-context.h"
49 : #include "cfganal.h"
50 : #include "tree-eh.h"
51 : #include "tree-cfg.h"
52 : #include "alloc-pool.h"
53 : #include "sreal.h"
54 : #include "predict.h"
55 :
/* Access the first scalar stmt of the reduction group containing S.
   Checks that S has no data reference; presumably the underlying field
   is shared with the DR group encoding — the assert guards misuse.  */
#define REDUC_GROUP_FIRST_ELEMENT(S) \
  (gcc_checking_assert (!(S)->dr_aux.dr), (S)->first_element)

/* Forward declarations for routines defined later in this file.  */
static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree,
					    load_permutation_t &,
					    const vec<tree> &,
					    gimple_stmt_iterator *,
					    poly_uint64, bool, bool,
					    unsigned *,
					    unsigned * = nullptr,
					    bool = false);
static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
					   slp_tree, lane_permutation_t &,
					   vec<slp_tree> &, bool);
static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
static bool vect_slp_can_convert_to_external (const vec<stmt_vec_info> &);

/* Allocation pool all SLP nodes are carved from; created by
   vect_slp_init and destroyed by vect_slp_fini.  */
static object_allocator<_slp_tree> *slp_tree_pool;
/* Head of the doubly-linked list threading all live SLP nodes,
   maintained by the _slp_tree constructor and destructor.  */
static slp_tree slp_first_node;
75 :
/* Set up the allocation pool used for all SLP nodes.  Must run before
   any _slp_tree is created (operator new draws from the pool); paired
   with vect_slp_fini.  */

void
vect_slp_init (void)
{
  slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
}
81 :
/* Delete all SLP nodes that are still live and destroy the node pool.
   Deleting the list head unlinks it and advances slp_first_node (see
   ~_slp_tree), so the loop terminates once every node is freed.  */

void
vect_slp_fini (void)
{
  while (slp_first_node)
    delete slp_first_node;
  delete slp_tree_pool;
  slp_tree_pool = NULL;
}
90 :
/* Allocate storage for one _slp_tree from the global node pool.
   N is supplied by the compiler and must match the class size.  */

void *
_slp_tree::operator new (size_t n)
{
  gcc_assert (n == sizeof (_slp_tree));
  return slp_tree_pool->allocate_raw ();
}
97 :
/* Return NODE's storage to the global node pool.  */

void
_slp_tree::operator delete (void *node, size_t n)
{
  gcc_assert (n == sizeof (_slp_tree));
  slp_tree_pool->remove_raw (node);
}
104 :
105 :
/* Initialize a SLP node: push it onto the global list of live nodes
   and give every field a neutral default; callers fill in the fields
   that are relevant for the node they are building.  */

_slp_tree::_slp_tree ()
{
  /* Link this node at the head of the live-node list.  */
  this->prev_node = NULL;
  if (slp_first_node)
    slp_first_node->prev_node = this;
  this->next_node = slp_first_node;
  slp_first_node = this;
  SLP_TREE_SCALAR_STMTS (this) = vNULL;
  SLP_TREE_SCALAR_OPS (this) = vNULL;
  SLP_TREE_LIVE_LANES (this) = vNULL;
  SLP_TREE_VEC_DEFS (this) = vNULL;
  SLP_TREE_CHILDREN (this) = vNULL;
  SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
  SLP_TREE_LANE_PERMUTATION (this) = vNULL;
  SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
  SLP_TREE_CODE (this) = ERROR_MARK;
  SLP_TREE_GS_SCALE (this) = 0;
  SLP_TREE_GS_BASE (this) = NULL_TREE;
  this->ldst_lanes = false;
  this->avoid_stlf_fail = false;
  SLP_TREE_VECTYPE (this) = NULL_TREE;
  SLP_TREE_REPRESENTATIVE (this) = NULL;
  /* -1 marks "no cycle"/"no reduction operand" respectively.  */
  this->cycle_info.id = -1;
  this->cycle_info.reduc_idx = -1;
  /* A new node starts with a single reference owned by the creator.  */
  SLP_TREE_REF_COUNT (this) = 1;
  this->failed = NULL;
  this->max_nunits = 1;
  this->lanes = 0;
  SLP_TREE_TYPE (this) = undef_vec_info_type;
  this->data = NULL;
}
139 :
140 : /* Tear down a SLP node. */
141 :
142 7657062 : _slp_tree::~_slp_tree ()
143 : {
144 7657062 : if (this->prev_node)
145 4627808 : this->prev_node->next_node = this->next_node;
146 : else
147 3029254 : slp_first_node = this->next_node;
148 7657062 : if (this->next_node)
149 5779109 : this->next_node->prev_node = this->prev_node;
150 7657062 : SLP_TREE_CHILDREN (this).release ();
151 7657062 : SLP_TREE_SCALAR_STMTS (this).release ();
152 7657062 : SLP_TREE_SCALAR_OPS (this).release ();
153 7657062 : SLP_TREE_LIVE_LANES (this).release ();
154 7657062 : SLP_TREE_VEC_DEFS (this).release ();
155 7657062 : SLP_TREE_LOAD_PERMUTATION (this).release ();
156 7657062 : SLP_TREE_LANE_PERMUTATION (this).release ();
157 7657062 : if (this->failed)
158 1982177 : free (failed);
159 7657062 : if (this->data)
160 1234415 : delete this->data;
161 7657062 : }
162 :
/* Push the single SSA definition in DEF to the vector of vector defs.  */

void
_slp_tree::push_vec_def (gimple *def)
{
  if (gphi *phi = dyn_cast <gphi *> (def))
    /* A PHI has exactly one result; use it directly.  */
    vec_defs.quick_push (gimple_phi_result (phi));
  else
    {
      /* Otherwise DEF must define exactly one SSA name.  */
      def_operand_p defop = single_ssa_def_operand (def, SSA_OP_ALL_DEFS);
      vec_defs.quick_push (get_def_from_ptr (defop));
    }
}
176 :
177 : /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
178 :
179 : void
180 14557464 : vect_free_slp_tree (slp_tree node)
181 : {
182 14557464 : int i;
183 14557464 : slp_tree child;
184 :
185 14557464 : if (--SLP_TREE_REF_COUNT (node) != 0)
186 14557464 : return;
187 :
188 10884059 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
189 3887617 : if (child)
190 3534081 : vect_free_slp_tree (child);
191 :
192 6996442 : delete node;
193 : }
194 :
195 : /* Return a location suitable for dumpings related to the SLP instance. */
196 :
197 : dump_user_location_t
198 3376322 : _slp_instance::location () const
199 : {
200 3376322 : if (!root_stmts.is_empty ())
201 316656 : return root_stmts[0]->stmt;
202 : else
203 3059666 : return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
204 : }
205 :
206 :
/* Free the memory allocated for the SLP instance.  */

void
vect_free_slp_instance (slp_instance instance)
{
  /* Drop the instance's reference to its SLP tree; the tree is torn
     down once the last reference goes away.  */
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
  SLP_INSTANCE_LOADS (instance).release ();
  SLP_INSTANCE_ROOT_STMTS (instance).release ();
  SLP_INSTANCE_REMAIN_DEFS (instance).release ();
  instance->subgraph_entries.release ();
  instance->cost_vec.release ();
  free (instance);
}
220 :
221 :
/* Create a SLP node with NOPS children with CODE, either VEC_PERM_EXPR
   for a permute node or else ERROR_MARK.  */

slp_tree
vect_create_new_slp_node (unsigned nops, tree_code code)
{
  gcc_assert (code == ERROR_MARK || code == VEC_PERM_EXPR);
  slp_tree node = new _slp_tree;
  SLP_TREE_SCALAR_STMTS (node) = vNULL;
  /* Reserve room for the children; they are pushed by the caller.  */
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;
  SLP_TREE_CODE (node) = code;
  return node;
}
236 :
/* Create a SLP node inplace at NODE for SCALAR_STMTS and NOPS children.
   NODE takes over SCALAR_STMTS; the first stmt becomes the node's
   representative and the stmt count its number of lanes.  */

static slp_tree
vect_create_new_slp_node (slp_tree node,
			  vec<stmt_vec_info> scalar_stmts, unsigned nops)
{
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;
  SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
  SLP_TREE_LANES (node) = scalar_stmts.length ();
  return node;
}
250 :
/* Create an SLP node for SCALAR_STMTS and NOPS children.  Convenience
   wrapper that allocates the node before initializing it inplace.  */

static slp_tree
vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
{
  return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
}
258 :
/* Create a vect_external_def SLP node inplace at NODE for scalar
   operands OPS.  NODE takes over OPS; each operand forms one lane.  */

static slp_tree
vect_create_new_slp_node (slp_tree node, vec<tree> ops)
{
  SLP_TREE_SCALAR_OPS (node) = ops;
  SLP_TREE_DEF_TYPE (node) = vect_external_def;
  SLP_TREE_LANES (node) = ops.length ();
  return node;
}
270 :
/* Create a vect_external_def SLP node for scalar operands OPS.
   Convenience wrapper that allocates the node first.  */

static slp_tree
vect_create_new_slp_node (vec<tree> ops)
{
  return vect_create_new_slp_node (new _slp_tree, ops);
}
278 :
279 :
/* This structure is used in creation of an SLP tree.  Each instance
   corresponds to the same operand in a group of scalar stmts in an SLP
   node.  */
typedef struct _slp_oprnd_info
{
  /* Def-stmts for the operands.  */
  vec<stmt_vec_info> def_stmts;
  /* Operands.  */
  vec<tree> ops;
  /* Information about the first statement, its vector def-type, type, the
     operand itself in case it's constant, and an indication if it's a pattern
     stmt and gather/scatter info.  */
  /* Scalar type of the first stmt's operand; later stmts must match.  */
  tree first_op_type;
  /* Vector def-type seen for the first stmt's operand.  */
  enum vect_def_type first_dt;
  /* Whether any def-stmt is a pattern stmt (other than the main
     pattern stmt of its original stmt).  */
  bool any_pattern;
  /* Whether the first stmt's operand is a gather/scatter offset, and
     the gather/scatter info derived for it in that case.  */
  bool first_gs_p;
  gather_scatter_info first_gs_info;
} *slp_oprnd_info;
298 :
299 :
/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  Every entry starts in the "uninitialized" state; the caller
   is responsible for releasing the result with vect_free_oprnd_info.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      /* Pre-size the per-operand vectors for one def per group member.  */
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->ops.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->any_pattern = false;
      oprnd_info->first_gs_p = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}
324 :
325 :
326 : /* Free operands info. */
327 :
328 : static void
329 3313099 : vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
330 : {
331 3313099 : int i;
332 3313099 : slp_oprnd_info oprnd_info;
333 :
334 8574085 : FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
335 : {
336 5260986 : oprnd_info->def_stmts.release ();
337 5260986 : oprnd_info->ops.release ();
338 5260986 : XDELETE (oprnd_info);
339 : }
340 :
341 3313099 : oprnds_info.release ();
342 3313099 : }
343 :
/* Return the execution frequency of NODE (so that a higher value indicates
   a "more important" node when optimizing for speed).  The weight is the
   representative stmt's block count scaled by the function entry count.  */

static sreal
vect_slp_node_weight (slp_tree node)
{
  stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
  basic_block bb = gimple_bb (stmt_info->stmt);
  return bb->count.to_sreal_scale (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count);
}
354 :
355 : /* Return true if STMTS contains a pattern statement. */
356 :
357 : static bool
358 22190 : vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
359 : {
360 22190 : stmt_vec_info stmt_info;
361 22190 : unsigned int i;
362 71916 : FOR_EACH_VEC_ELT (stmts, i, stmt_info)
363 51903 : if (stmt_info && is_pattern_stmt_p (stmt_info))
364 : return true;
365 : return false;
366 : }
367 :
368 : /* Return true when all lanes in the external or constant NODE have
369 : the same value. */
370 :
371 : static bool
372 589312 : vect_slp_tree_uniform_p (slp_tree node)
373 : {
374 589312 : gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
375 : || SLP_TREE_DEF_TYPE (node) == vect_external_def);
376 :
377 : /* Pre-exsting vectors. */
378 1037324 : if (SLP_TREE_SCALAR_OPS (node).is_empty ())
379 : return false;
380 :
381 : unsigned i;
382 : tree op, first = NULL_TREE;
383 1349759 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
384 1208459 : if (!first)
385 : first = op;
386 619147 : else if (!operand_equal_p (first, op, 0))
387 : return false;
388 :
389 : return true;
390 : }
391 :
/* Find the place of the data-ref in STMT_INFO in the interleaving chain
   that starts from FIRST_STMT_INFO.  Return -1 if the data-ref is not a part
   of the chain.  The returned place accounts for the gaps recorded on
   each chain element.  */

int
vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
				      stmt_vec_info first_stmt_info)
{
  stmt_vec_info next_stmt_info = first_stmt_info;
  int result = 0;

  /* STMT_INFO can only belong to the chain anchored at its own DR
     group leader.  */
  if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
    return -1;

  do
    {
      if (next_stmt_info == stmt_info)
	return result;
      next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
      if (next_stmt_info)
	/* Each element's gap precedes it in the chain.  */
	result += DR_GROUP_GAP (next_stmt_info);
    }
  while (next_stmt_info);

  return -1;
}
418 :
/* Check whether it is possible to load COUNT elements of type ELT_TYPE
   using the method implemented by duplicate_and_interleave.  Return true
   if so, returning the number of intermediate vectors in *NVECTORS_OUT
   (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
   (if nonnull).  If PERMUTES is nonnull, also store the two permute
   masks needed to interleave the intermediate vectors.  */

bool
can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
				tree elt_type, unsigned int *nvectors_out,
				tree *vector_type_out,
				tree *permutes)
{
  tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
  if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
    return false;

  machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
  poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
  unsigned int nvectors = 1;
  /* Iterate with progressively smaller fused-element sizes (and thus
     more intermediate vectors) until a workable combination is found
     or no further halving is possible.  */
  for (;;)
    {
      scalar_int_mode int_mode;
      poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
      if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
	{
	  /* Get the natural vector type for this SLP group size.  */
	  tree int_type = build_nonstandard_integer_type
	    (GET_MODE_BITSIZE (int_mode), 1);
	  tree vector_type
	    = get_vectype_for_scalar_type (vinfo, int_type, count);
	  poly_int64 half_nelts;
	  if (vector_type
	      && VECTOR_MODE_P (TYPE_MODE (vector_type))
	      && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
			   GET_MODE_SIZE (base_vector_mode))
	      && multiple_p (GET_MODE_NUNITS (TYPE_MODE (vector_type)),
			     2, &half_nelts))
	    {
	      /* Try fusing consecutive sequences of COUNT / NVECTORS elements
		 together into elements of type INT_TYPE and using the result
		 to build NVECTORS vectors.  */
	      poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
	      vec_perm_builder sel1 (nelts, 2, 3);
	      vec_perm_builder sel2 (nelts, 2, 3);

	      for (unsigned int i = 0; i < 3; ++i)
		{
		  sel1.quick_push (i);
		  sel1.quick_push (i + nelts);
		  sel2.quick_push (half_nelts + i);
		  sel2.quick_push (half_nelts + i + nelts);
		}
	      vec_perm_indices indices1 (sel1, 2, nelts);
	      vec_perm_indices indices2 (sel2, 2, nelts);
	      machine_mode vmode = TYPE_MODE (vector_type);
	      /* Only usable if the target supports both interleaving
		 permutations as constants.  */
	      if (can_vec_perm_const_p (vmode, vmode, indices1)
		  && can_vec_perm_const_p (vmode, vmode, indices2))
		{
		  if (nvectors_out)
		    *nvectors_out = nvectors;
		  if (vector_type_out)
		    *vector_type_out = vector_type;
		  if (permutes)
		    {
		      permutes[0] = vect_gen_perm_mask_checked (vector_type,
								indices1);
		      permutes[1] = vect_gen_perm_mask_checked (vector_type,
								indices2);
		    }
		  return true;
		}
	    }
	}
      if (!multiple_p (elt_bytes, 2, &elt_bytes))
	return false;
      nvectors *= 2;
      /* We need to be able to fuse COUNT / NVECTORS elements together.  */
      if (!multiple_p (count, nvectors))
	return false;
    }
}
500 :
501 : /* Return true if DTA and DTB match. */
502 :
503 : static bool
504 16983229 : vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
505 : {
506 16983229 : return (dta == dtb
507 347959 : || ((dta == vect_external_def || dta == vect_constant_def)
508 215860 : && (dtb == vect_external_def || dtb == vect_constant_def)));
509 : }
510 :
/* Marker used in operand maps for the gather/scatter offset operand.  */
#define GATHER_SCATTER_OFFSET (-3)

/* For most SLP statements, there is a one-to-one mapping between
   gimple arguments and child nodes.  If that is not true for STMT,
   return an array that contains:

   - the number of child nodes, followed by
   - for each child node, the index of the argument associated with that node.
     The special index -1 is the first operand of an embedded comparison and
     the special index -2 is the second operand of an embedded comparison.
     The special index -3 is the offset of a gather as analyzed by
     vect_check_gather_scatter.

   SWAP is as for vect_get_and_check_slp_defs.  */

static const int *
vect_get_operand_map (const gimple *stmt, bool gather_scatter_p,
		      unsigned char swap)
{
  /* Each map is { number-of-children, arg-index... }.  */
  static const int no_arg_map[] = { 0 };
  static const int arg0_map[] = { 1, 0 };
  static const int arg2_map[] = { 1, 2 };
  static const int arg2_arg3_map[] = { 2, 2, 3 };
  static const int arg2_arg4_map[] = { 2, 2, 4 };
  static const int arg2_arg5_arg6_map[] = { 3, 2, 5, 6 };
  static const int arg2_arg4_arg5_map[] = { 3, 2, 4, 5 };
  static const int arg3_arg2_map[] = { 2, 3, 2 };
  static const int op00_map[] = { 1, -1 };
  static const int op1_op0_map[] = { 2, 1, 0 };
  static const int off_map[] = { 1, GATHER_SCATTER_OFFSET };
  static const int off_op0_map[] = { 2, GATHER_SCATTER_OFFSET, 0 };
  static const int off_arg2_arg3_map[] = { 3, GATHER_SCATTER_OFFSET, 2, 3 };
  static const int off_arg3_arg2_map[] = { 3, GATHER_SCATTER_OFFSET, 3, 2 };
  /* For IFN_MASK_CALL with 2..7 call arguments, indexed by nargs - 2;
     argument 0 is the called function and is skipped.  */
  static const int mask_call_maps[6][7] = {
    { 1, 1, },
    { 2, 1, 2, },
    { 3, 1, 2, 3, },
    { 4, 1, 2, 3, 4, },
    { 5, 1, 2, 3, 4, 5, },
    { 6, 1, 2, 3, 4, 5, 6 },
  };

  /* Swapping is only meaningful for comparisons and commutative ops.  */
  gcc_checking_assert (!swap
		       || !is_gimple_assign (stmt)
		       || TREE_CODE_CLASS
			    (gimple_assign_rhs_code (stmt)) == tcc_comparison
		       || commutative_tree_code
			    (gimple_assign_rhs_code (stmt)));

  if (auto assign = dyn_cast<const gassign *> (stmt))
    {
      tree_code code = gimple_assign_rhs_code (assign);
      if (code == COND_EXPR
	  && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
	gcc_unreachable ();
      else if ((TREE_CODE_CLASS (code) == tcc_comparison
		|| commutative_tree_code (code))
	       && swap)
	return op1_op0_map;
      else if (code == VIEW_CONVERT_EXPR)
	return op00_map;
      else if (gather_scatter_p)
	/* A non-SSA lhs means a scatter (store), which also maps the
	   stored value; a gather (load) maps only the offset.  */
	return (TREE_CODE (gimple_assign_lhs (assign)) != SSA_NAME
		? off_op0_map : off_map);
    }
  else if (auto call = dyn_cast<const gcall *> (stmt))
    {
      if (gimple_call_internal_p (call))
	switch (gimple_call_internal_fn (call))
	  {
	  case IFN_MASK_LOAD:
	    return gather_scatter_p ? off_arg2_arg3_map : arg2_arg3_map;

	  case IFN_GATHER_LOAD:
	    return arg2_map;

	  case IFN_MASK_GATHER_LOAD:
	  case IFN_MASK_LEN_GATHER_LOAD:
	    return arg2_arg5_arg6_map;

	  case IFN_SCATTER_STORE:
	    return arg2_arg4_map;

	  case IFN_MASK_SCATTER_STORE:
	  case IFN_MASK_LEN_SCATTER_STORE:
	    return arg2_arg4_arg5_map;

	  case IFN_MASK_STORE:
	    return gather_scatter_p ? off_arg3_arg2_map : arg3_arg2_map;

	  case IFN_MASK_CALL:
	    {
	      unsigned nargs = gimple_call_num_args (call);
	      if (nargs >= 2 && nargs <= 7)
		return mask_call_maps[nargs-2];
	      else
		return nullptr;
	    }

	  case IFN_CLZ:
	  case IFN_CTZ:
	    return arg0_map;

	  case IFN_GOMP_SIMD_LANE:
	    return no_arg_map;

	  default:
	    break;
	  }
    }
  /* Null means the default one-to-one argument mapping applies.  */
  return nullptr;
}
623 :
/* Overload taking a stmt_vec_info; extracts the gimple stmt and the
   gather/scatter flag from STMT.  SWAP defaults to "no swap".  */

static const int *
vect_get_operand_map (const stmt_vec_info stmt, unsigned char swap = 0)
{
  return vect_get_operand_map (stmt->stmt, STMT_VINFO_GATHER_SCATTER_P (stmt),
			       swap);
}
630 :
631 : /* Return the SLP node child index for operand OP of STMT. */
632 :
633 : int
634 1365551 : vect_slp_child_index_for_operand (const stmt_vec_info stmt, int op)
635 : {
636 1365551 : const int *opmap = vect_get_operand_map (stmt);
637 1365551 : if (!opmap)
638 : return op;
639 21863 : for (int i = 1; i < 1 + opmap[0]; ++i)
640 21863 : if (opmap[i] == op)
641 12246 : return i - 1;
642 0 : gcc_unreachable ();
643 : }
644 :
/* Helper class for mapping of GIMPLE operands to SLP children.  */
/* ??? Add vect_slp_child_index_for_operand here and amend opmaps
   with the full reverse mapping and indicating the position of the
   first commutative operand index, eliding the swap_p argument from
   vect_get_operand_map.  Adjust all consumers.  */

struct slp_oprnds {
  slp_oprnds (stmt_vec_info);
  tree get_op_for_slp_child (stmt_vec_info, unsigned);
  /* Operand map for the stmt, or null for the identity mapping.  */
  const int *opmap;
  /* Number of SLP children the stmt's operands map to.  */
  const unsigned int num_slp_children;
};
657 :
/* Set up the operand mapping for STMT_INFO.  Note num_slp_children's
   initializer reads opmap, which is initialized first (declaration
   order in the class).  */
slp_oprnds::slp_oprnds (stmt_vec_info stmt_info)
  : opmap (vect_get_operand_map (stmt_info)),
    num_slp_children (opmap ? opmap[0] : gimple_num_args (stmt_info->stmt))
{
}
663 :
/* For SLP child number N get the corresponding tree operand from GIMPLE
   statement described by STMT_INFO.  */

tree
slp_oprnds::get_op_for_slp_child (stmt_vec_info stmt_info, unsigned n)
{
  gcc_assert (n < num_slp_children);
  /* Without a map, child N is simply argument N.  */
  int opno = opmap ? opmap[n + 1] : (int) n;
  if (opno == GATHER_SCATTER_OFFSET)
    gcc_unreachable (); // TODO
  else if (opno < 0)
    /* Negative indexes select an operand of an embedded comparison;
       -1 - opno maps -1/-2 to operand 0/1 of the first argument.  */
    return TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
  else
    return gimple_arg (stmt_info->stmt, opno);
}
679 :
680 : /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
681 : they are of a valid type and that they match the defs of the first stmt of
682 : the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
683 : by swapping operands of STMTS[STMT_NUM] when possible. Non-zero SWAP
684 : indicates swap is required for cond_expr stmts. Specifically, SWAP
685 : is 1 if STMT is cond and operands of comparison need to be swapped;
686 : SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
687 :
688 : If there was a fatal error return -1; if the error could be corrected by
689 : swapping operands of father node of this one, return 1; if everything is
690 : ok return 0. */
691 : static int
692 12672511 : vect_get_and_check_slp_defs (vec_info *vinfo, tree vectype, unsigned char swap,
693 : bool *skip_args,
694 : vec<stmt_vec_info> stmts, unsigned stmt_num,
695 : vec<slp_oprnd_info> *oprnds_info)
696 : {
697 12672511 : stmt_vec_info stmt_info = stmts[stmt_num];
698 12672511 : tree oprnd;
699 12672511 : unsigned int i, number_of_oprnds;
700 12672511 : enum vect_def_type dt = vect_uninitialized_def;
701 12672511 : slp_oprnd_info oprnd_info;
702 12672511 : gather_scatter_info gs_info;
703 12672511 : unsigned int gs_op = -1u;
704 12672511 : unsigned int commutative_op = -1U;
705 12672511 : bool first = stmt_num == 0;
706 :
707 12672511 : if (!stmt_info)
708 : {
709 0 : for (auto oi : *oprnds_info)
710 : {
711 0 : oi->def_stmts.quick_push (NULL);
712 0 : oi->ops.quick_push (NULL_TREE);
713 : }
714 : return 0;
715 : }
716 :
717 12672511 : if (!is_a<gcall *> (stmt_info->stmt)
718 : && !is_a<gassign *> (stmt_info->stmt)
719 : && !is_a<gphi *> (stmt_info->stmt))
720 : return -1;
721 :
722 12672511 : number_of_oprnds = gimple_num_args (stmt_info->stmt);
723 12672511 : const int *map = vect_get_operand_map (stmt_info, swap);
724 12672511 : if (map)
725 75842 : number_of_oprnds = *map++;
726 12672511 : if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
727 : {
728 49213 : if (gimple_call_internal_p (stmt))
729 : {
730 32558 : internal_fn ifn = gimple_call_internal_fn (stmt);
731 32558 : commutative_op = first_commutative_argument (ifn);
732 32558 : if (internal_gather_scatter_fn_p (ifn))
733 : {
734 0 : vect_describe_gather_scatter_call
735 0 : (stmt_info,
736 0 : first ? &(*oprnds_info)[0]->first_gs_info : &gs_info);
737 0 : if (first)
738 0 : (*oprnds_info)[0]->first_gs_p = true;
739 : gs_op = 0;
740 : }
741 : }
742 : }
743 12623298 : else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
744 : {
745 14725049 : if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
746 8365245 : commutative_op = 0;
747 : }
748 :
749 12672511 : bool swapped = (swap != 0);
750 12672511 : bool backedge = false;
751 12672511 : enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
752 35068579 : for (i = 0; i < number_of_oprnds; i++)
753 : {
754 22397247 : oprnd_info = (*oprnds_info)[i];
755 22397247 : int opno = map ? map[i] : int (i);
756 22397247 : if (opno == GATHER_SCATTER_OFFSET)
757 : {
758 22734 : gcc_assert (STMT_VINFO_GATHER_SCATTER_P (stmt_info));
759 22734 : if (!is_a <loop_vec_info> (vinfo)
760 22734 : || !vect_check_gather_scatter (stmt_info, vectype,
761 : as_a <loop_vec_info> (vinfo),
762 : first ? &oprnd_info->first_gs_info
763 : : &gs_info))
764 1179 : return -1;
765 :
766 22734 : if (first)
767 : {
768 22483 : oprnd_info->first_gs_p = true;
769 22483 : oprnd = oprnd_info->first_gs_info.offset;
770 : }
771 : else
772 : {
773 251 : gs_op = i;
774 251 : oprnd = gs_info.offset;
775 : }
776 : }
777 22374513 : else if (opno < 0)
778 2842 : oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
779 : else
780 : {
781 22371671 : oprnd = gimple_arg (stmt_info->stmt, opno);
782 22371671 : if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
783 : {
784 1206443 : edge e = gimple_phi_arg_edge (stmt, opno);
785 2412886 : backedge = (is_a <bb_vec_info> (vinfo)
786 1863146 : ? e->flags & EDGE_DFS_BACK
787 656703 : : dominated_by_p (CDI_DOMINATORS, e->src,
788 656703 : gimple_bb (stmt_info->stmt)));
789 : }
790 : }
791 :
792 22397247 : stmt_vec_info def_stmt_info;
793 22397247 : if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
794 : {
795 994 : if (dump_enabled_p ())
796 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
797 : "Build SLP failed: can't analyze def for %T\n",
798 : oprnd);
799 :
800 994 : return -1;
801 : }
802 :
803 22396253 : if (skip_args[i])
804 : {
805 522657 : oprnd_info->def_stmts.quick_push (NULL);
806 522657 : oprnd_info->ops.quick_push (NULL_TREE);
807 522657 : oprnd_info->first_dt = vect_uninitialized_def;
808 522657 : continue;
809 : }
810 :
811 21873596 : oprnd_info->def_stmts.quick_push (def_stmt_info);
812 21873596 : oprnd_info->ops.quick_push (oprnd);
813 :
814 21873596 : if (def_stmt_info
815 21873596 : && is_pattern_stmt_p (def_stmt_info))
816 : {
817 393810 : if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
818 : != def_stmt_info)
819 278193 : oprnd_info->any_pattern = true;
820 : else
821 : /* If we promote this to external use the original stmt def. */
822 115617 : oprnd_info->ops.last ()
823 231234 : = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
824 : }
825 :
826 : /* If there's a extern def on a backedge make sure we can
827 : code-generate at the region start.
828 : ??? This is another case that could be fixed by adjusting
829 : how we split the function but at the moment we'd have conflicting
830 : goals there. */
831 21873596 : if (backedge
832 166850 : && dts[i] == vect_external_def
833 206 : && is_a <bb_vec_info> (vinfo)
834 206 : && TREE_CODE (oprnd) == SSA_NAME
835 185 : && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
836 21873781 : && !dominated_by_p (CDI_DOMINATORS, vinfo->bbs[0],
837 185 : gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
838 : {
839 185 : if (dump_enabled_p ())
840 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
841 : "Build SLP failed: extern def %T only defined "
842 : "on backedge\n", oprnd);
843 185 : return -1;
844 : }
845 :
846 21873411 : if (first)
847 : {
848 4776118 : tree type = TREE_TYPE (oprnd);
849 4776118 : dt = dts[i];
850 :
851 : /* For the swapping logic below force vect_reduction_def
852 : for the reduction op in a SLP reduction group. */
853 4776118 : if (!STMT_VINFO_DATA_REF (stmt_info)
854 3615225 : && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
855 5210 : && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
856 4778695 : && def_stmt_info)
857 2577 : dts[i] = dt = vect_reduction_def;
858 :
859 : /* Check the types of the definition. */
860 4776118 : switch (dt)
861 : {
862 4776118 : case vect_external_def:
863 4776118 : case vect_constant_def:
864 4776118 : case vect_internal_def:
865 4776118 : case vect_reduction_def:
866 4776118 : case vect_double_reduction_def:
867 4776118 : case vect_induction_def:
868 4776118 : case vect_nested_cycle:
869 4776118 : case vect_first_order_recurrence:
870 4776118 : break;
871 :
872 0 : default:
873 : /* FORNOW: Not supported. */
874 0 : if (dump_enabled_p ())
875 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
876 : "Build SLP failed: illegal type of def %T\n",
877 : oprnd);
878 0 : return -1;
879 : }
880 :
881 4776118 : oprnd_info->first_dt = dt;
882 4776118 : oprnd_info->first_op_type = type;
883 : }
884 : }
885 12671332 : if (first)
886 : return 0;
887 :
888 : /* Now match the operand definition types to that of the first stmt. */
889 26202422 : for (i = 0; i < number_of_oprnds;)
890 : {
891 17109451 : if (skip_args[i])
892 : {
893 43200 : ++i;
894 43200 : continue;
895 : }
896 :
897 17066251 : oprnd_info = (*oprnds_info)[i];
898 17066251 : dt = dts[i];
899 17066251 : stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
900 17066251 : oprnd = oprnd_info->ops[stmt_num];
901 17066251 : tree type = TREE_TYPE (oprnd);
902 :
903 17066251 : if (!types_compatible_p (oprnd_info->first_op_type, type))
904 : {
905 88818 : if (dump_enabled_p ())
906 109 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
907 : "Build SLP failed: different operand types\n");
908 88818 : return 1;
909 : }
910 :
911 16977433 : if ((gs_op == i) != oprnd_info->first_gs_p)
912 : {
913 0 : if (dump_enabled_p ())
914 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
915 : "Build SLP failed: mixed gather and non-gather\n");
916 0 : return 1;
917 : }
918 16977433 : else if (gs_op == i)
919 : {
920 221 : if (!operand_equal_p (oprnd_info->first_gs_info.base,
921 221 : gs_info.base))
922 : {
923 16 : if (dump_enabled_p ())
924 6 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
925 : "Build SLP failed: different gather base\n");
926 16 : return 1;
927 : }
928 205 : if (oprnd_info->first_gs_info.scale != gs_info.scale)
929 : {
930 8 : if (dump_enabled_p ())
931 2 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
932 : "Build SLP failed: different gather scale\n");
933 8 : return 1;
934 : }
935 : }
936 :
937 : /* Not first stmt of the group, check that the def-stmt/s match
938 : the def-stmt/s of the first stmt. Allow different definition
939 : types for reduction chains: the first stmt must be a
940 : vect_reduction_def (a phi node), and the rest
941 : end in the reduction chain. */
942 16977409 : if ((!vect_def_types_match (oprnd_info->first_dt, dt)
943 291820 : && !(oprnd_info->first_dt == vect_reduction_def
944 4535 : && !STMT_VINFO_DATA_REF (stmt_info)
945 4535 : && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
946 4509 : && def_stmt_info
947 4509 : && !STMT_VINFO_DATA_REF (def_stmt_info)
948 4509 : && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
949 : == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
950 16690098 : || (!STMT_VINFO_DATA_REF (stmt_info)
951 15389718 : && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
952 9386 : && ((!def_stmt_info
953 9217 : || STMT_VINFO_DATA_REF (def_stmt_info)
954 16906 : || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
955 : != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
956 9386 : != (oprnd_info->first_dt != vect_reduction_def))))
957 : {
958 : /* Try swapping operands if we got a mismatch. For BB
959 : vectorization only in case it will clearly improve things. */
960 289730 : if (i == commutative_op && !swapped
961 287311 : && (!is_a <bb_vec_info> (vinfo)
962 4576 : || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
963 4576 : dts[i+1])
964 1094 : && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
965 : || vect_def_types_match
966 150 : ((*oprnds_info)[i+1]->first_dt, dts[i])))))
967 : {
968 2419 : if (dump_enabled_p ())
969 144 : dump_printf_loc (MSG_NOTE, vect_location,
970 : "trying swapped operands\n");
971 2419 : std::swap (dts[i], dts[i+1]);
972 2419 : std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
973 2419 : (*oprnds_info)[i+1]->def_stmts[stmt_num]);
974 2419 : std::swap ((*oprnds_info)[i]->ops[stmt_num],
975 2419 : (*oprnds_info)[i+1]->ops[stmt_num]);
976 : /* After swapping some operands we lost track whether an
977 : operand has any pattern defs so be conservative here. */
978 2419 : if ((*oprnds_info)[i]->any_pattern
979 2419 : || (*oprnds_info)[i+1]->any_pattern)
980 36 : (*oprnds_info)[i]->any_pattern
981 18 : = (*oprnds_info)[i+1]->any_pattern = true;
982 2419 : swapped = true;
983 2419 : continue;
984 : }
985 :
986 284892 : if (is_a <bb_vec_info> (vinfo)
987 269494 : && !oprnd_info->any_pattern
988 554148 : && number_of_oprnds > 1)
989 : {
990 : /* Now for commutative ops we should see whether we can
991 : make the other operand matching. */
992 103439 : if (dump_enabled_p ())
993 203 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
994 : "treating operand as external\n");
995 103439 : oprnd_info->first_dt = dt = vect_external_def;
996 : }
997 : else
998 : {
999 181453 : if (dump_enabled_p ())
1000 407 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1001 : "Build SLP failed: different types\n");
1002 181453 : return 1;
1003 : }
1004 : }
1005 :
1006 : /* Make sure to demote the overall operand to external. */
1007 16793537 : if (dt == vect_external_def)
1008 331942 : oprnd_info->first_dt = vect_external_def;
1009 : /* For a SLP reduction chain we want to duplicate the reduction to
1010 : each of the chain members. That gets us a sane SLP graph (still
1011 : the stmts are not 100% correct wrt the initial values). */
1012 16461595 : else if ((dt == vect_internal_def
1013 16461595 : || dt == vect_reduction_def)
1014 15541210 : && oprnd_info->first_dt == vect_reduction_def
1015 100842 : && !STMT_VINFO_DATA_REF (stmt_info)
1016 100842 : && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
1017 4509 : && !STMT_VINFO_DATA_REF (def_stmt_info)
1018 16466104 : && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
1019 : == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
1020 : {
1021 4509 : oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
1022 4509 : oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
1023 : }
1024 :
1025 16793537 : ++i;
1026 : }
1027 :
1028 : /* Swap operands. */
1029 9092971 : if (swapped)
1030 : {
1031 40748 : if (dump_enabled_p ())
1032 430 : dump_printf_loc (MSG_NOTE, vect_location,
1033 : "swapped operands to match def types in %G",
1034 : stmt_info->stmt);
1035 : }
1036 :
1037 : return 0;
1038 : }
1039 :
1040 : /* Return true if call statements CALL1 and CALL2 are similar enough
1041 : to be combined into the same SLP group. */
1042 :
1043 : bool
1044 21106 : compatible_calls_p (gcall *call1, gcall *call2, bool allow_two_operators)
1045 : {
1046 21106 : unsigned int nargs = gimple_call_num_args (call1);
1047 21106 : if (nargs != gimple_call_num_args (call2))
1048 : return false;
1049 :
1050 19170 : auto cfn1 = gimple_call_combined_fn (call1);
1051 19170 : auto cfn2 = gimple_call_combined_fn (call2);
1052 19170 : if (cfn1 != cfn2
1053 2 : && (!allow_two_operators
1054 2 : || !((cfn1 == CFN_FMA || cfn1 == CFN_FMS)
1055 2 : && (cfn2 == CFN_FMA || cfn2 == CFN_FMS))))
1056 : return false;
1057 :
1058 19170 : if (gimple_call_internal_p (call1))
1059 : {
1060 6997 : if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
1061 6997 : TREE_TYPE (gimple_call_lhs (call2))))
1062 : return false;
1063 14372 : for (unsigned int i = 0; i < nargs; ++i)
1064 7375 : if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
1065 7375 : TREE_TYPE (gimple_call_arg (call2, i))))
1066 : return false;
1067 : }
1068 : else
1069 : {
1070 12173 : if (!operand_equal_p (gimple_call_fn (call1),
1071 12173 : gimple_call_fn (call2), 0))
1072 : return false;
1073 :
1074 26787 : if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
1075 : return false;
1076 : }
1077 :
1078 : /* Check that any unvectorized arguments are equal. */
1079 15926 : if (const int *map = vect_get_operand_map (call1, false, false))
1080 : {
1081 15 : unsigned int nkept = *map++;
1082 15 : unsigned int mapi = 0;
1083 57 : for (unsigned int i = 0; i < nargs; ++i)
1084 42 : if (mapi < nkept && map[mapi] == int (i))
1085 27 : mapi += 1;
1086 15 : else if (!operand_equal_p (gimple_call_arg (call1, i),
1087 15 : gimple_call_arg (call2, i)))
1088 : return false;
1089 : }
1090 :
1091 : return true;
1092 : }
1093 :
1094 : /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
1095 : caller's attempt to find the vector type in STMT_INFO with the narrowest
1096 : element type. Return true if VECTYPE is nonnull and if it is valid
1097 : for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
1098 : number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
1099 : vect_build_slp_tree. */
1100 :
1101 : static bool
1102 5457928 : vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
1103 : unsigned int group_size,
1104 : tree vectype, poly_uint64 *max_nunits)
1105 : {
1106 5457928 : if (!vectype)
1107 : {
1108 3874 : if (dump_enabled_p ())
1109 7 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1110 : "Build SLP failed: unsupported data-type in %G\n",
1111 : stmt_info->stmt);
1112 : /* Fatal mismatch. */
1113 3874 : return false;
1114 : }
1115 :
1116 : /* If populating the vector type requires unrolling then fail
1117 : before adjusting *max_nunits for basic-block vectorization. */
1118 5454054 : if (is_a <bb_vec_info> (vinfo)
1119 5454054 : && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
1120 : {
1121 140850 : if (dump_enabled_p ())
1122 34 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1123 : "Build SLP failed: unrolling required "
1124 : "in basic block SLP\n");
1125 : /* Fatal mismatch. */
1126 140850 : return false;
1127 : }
1128 :
1129 : /* In case of multiple types we need to detect the smallest type. */
1130 5313204 : vect_update_max_nunits (max_nunits, vectype);
1131 5313204 : return true;
1132 : }
1133 :
/* Verify if the scalar stmts STMTS are isomorphic, require data
   permutation or are of unsupported types of operation.  Return
   true if they are, otherwise return false and indicate in *MATCHES
   which stmts are not isomorphic to the first one.  If MATCHES[0]
   is false then this indicates the comparison could not be
   carried out or the stmts will never be vectorized by SLP.

   On success *NODE_VECTYPE is set to the vector type of the node,
   *MAX_NUNITS is updated with the number of vector units required and
   *TWO_OPERATORS is set when the group mixes two operations (for
   example PLUS_EXPR and MINUS_EXPR, or CFN_FMA and CFN_FMS) that can
   be handled by computing both and merging the results.

   Note COND_EXPR is possibly isomorphic to another one after swapping its
   operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
   the first stmt by swapping the two operands of comparison; set SWAP[i]
   to 2 if stmt I is isomorphic to the first stmt by inverting the code
   of comparison.  Take A1 >= B1 ? X1 : Y1 as an example, it can be swapped
   to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1.  */

static bool
vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits, bool *matches,
		       bool *two_operators, tree *node_vectype)
{
  unsigned int i;
  stmt_vec_info first_stmt_info = stmts[0];
  code_helper first_stmt_code = ERROR_MARK;
  /* Operation code of the first stmt that differs from the leader,
     candidate for forming a two-operator node.  */
  code_helper alt_stmt_code = ERROR_MARK;
  code_helper first_cond_code = ERROR_MARK;
  bool need_same_oprnds = false;
  tree first_lhs = NULL_TREE;
  tree first_op1 = NULL_TREE;
  stmt_vec_info first_load = NULL, prev_first_load = NULL;
  bool first_stmt_ldst_p = false, first_stmt_ldst_masklen_p = false;
  bool first_stmt_phi_p = false;
  int first_reduc_idx = -1;
  bool maybe_soft_fail = false;
  tree soft_fail_nunits_vectype = NULL_TREE;

  tree vectype, nunits_vectype;
  if (!vect_get_vector_types_for_stmt (vinfo, first_stmt_info, &vectype,
				       &nunits_vectype, group_size))
    {
      /* Fatal mismatch.  */
      matches[0] = false;
      return false;
    }
  if (is_a <bb_vec_info> (vinfo)
      && known_le (TYPE_VECTOR_SUBPARTS (vectype), 1U))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Build SLP failed: not using single lane "
			 "vector type %T\n", vectype);
      matches[0] = false;
      return false;
    }
  /* Record nunits required but continue analysis, producing matches[]
     as if nunits was not an issue.  This allows splitting of groups
     to happen.  */
  if (nunits_vectype
      && !vect_record_max_nunits (vinfo, first_stmt_info, group_size,
				  nunits_vectype, max_nunits))
    {
      gcc_assert (is_a <bb_vec_info> (vinfo));
      maybe_soft_fail = true;
      soft_fail_nunits_vectype = nunits_vectype;
    }

  gcc_assert (vectype || !gimple_get_lhs (first_stmt_info->stmt));
  *node_vectype = vectype;

  /* For every stmt in NODE find its def stmt/s.  */
  stmt_vec_info stmt_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      bool ldst_p = false;
      bool ldst_masklen_p = false;
      bool phi_p = false;
      code_helper rhs_code = ERROR_MARK;

      swap[i] = 0;
      matches[i] = false;
      /* A NULL stmt denotes a gap in the group; it trivially matches.  */
      if (!stmt_info)
	{
	  matches[i] = true;
	  continue;
	}

      gimple *stmt = stmt_info->stmt;
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);

      /* Fail to vectorize statements marked as unvectorizable, throw
	 or are volatile.  */
      if (!STMT_VINFO_VECTORIZABLE (stmt_info)
	  || stmt_can_throw_internal (cfun, stmt)
	  || gimple_has_volatile_ops (stmt))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: unvectorizable statement %G",
			     stmt);
	  /* ??? For BB vectorization we want to commutate operands in a way
	     to shuffle all unvectorizable defs into one operand and have
	     the other still vectorized.  The following doesn't reliably
	     work for this though but it's the easiest we can do here.  */
	  if (is_a <bb_vec_info> (vinfo) && i != 0)
	    continue;
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      /* Classify the stmt (call, PHI or assignment) and compute its
	 operation code into RHS_CODE plus the LDST_P/PHI_P flags.  */
      gcall *call_stmt = dyn_cast <gcall *> (stmt);
      tree lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE && !call_stmt)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: not GIMPLE_ASSIGN nor "
			     "GIMPLE_CALL %G", stmt);
	  if (is_a <bb_vec_info> (vinfo) && i != 0)
	    continue;
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      if (call_stmt)
	{
	  combined_fn cfn = gimple_call_combined_fn (call_stmt);
	  if (cfn != CFN_LAST && cfn != CFN_MASK_CALL)
	    rhs_code = cfn;
	  else
	    rhs_code = CALL_EXPR;

	  if (cfn == CFN_GATHER_LOAD
	      || cfn == CFN_SCATTER_STORE)
	    ldst_p = true;
	  else if (cfn == CFN_MASK_LOAD
		   || cfn == CFN_MASK_GATHER_LOAD
		   || cfn == CFN_MASK_LEN_GATHER_LOAD
		   || cfn == CFN_MASK_SCATTER_STORE
		   || cfn == CFN_MASK_LEN_SCATTER_STORE)
	    {
	      ldst_p = true;
	      ldst_masklen_p = true;
	    }
	  else if (cfn == CFN_MASK_STORE)
	    {
	      ldst_p = true;
	      ldst_masklen_p = true;
	      rhs_code = CFN_MASK_STORE;
	    }
	  else if (cfn == CFN_GOMP_SIMD_LANE)
	    ;
	  else if ((cfn != CFN_LAST
		    && cfn != CFN_MASK_CALL
		    && internal_fn_p (cfn)
		    && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
		   || gimple_call_tail_p (call_stmt)
		   || gimple_call_noreturn_p (call_stmt)
		   || gimple_call_chain (call_stmt))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: unsupported call type %G",
				 (gimple *) call_stmt);
	      if (is_a <bb_vec_info> (vinfo) && i != 0)
		continue;
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }
	}
      else if (gimple_code (stmt) == GIMPLE_PHI)
	{
	  rhs_code = ERROR_MARK;
	  phi_p = true;
	}
      else
	{
	  rhs_code = gimple_assign_rhs_code (stmt);
	  ldst_p = STMT_VINFO_DATA_REF (stmt_info) != nullptr;
	}

      /* Check the operation.  For the first lane record what the rest
	 of the group has to match; for the other lanes compare against
	 the recorded state.  */
      if (i == 0)
	{
	  first_lhs = lhs;
	  first_stmt_code = rhs_code;
	  first_stmt_ldst_p = ldst_p;
	  first_stmt_ldst_masklen_p = ldst_masklen_p;
	  first_stmt_phi_p = phi_p;
	  first_reduc_idx = STMT_VINFO_REDUC_IDX (stmt_info);

	  /* Shift arguments should be equal in all the packed stmts for a
	     vector shift with scalar shift operand.  */
	  if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
	      || rhs_code == LROTATE_EXPR
	      || rhs_code == RROTATE_EXPR)
	    {
	      /* First see if we have a vector/vector shift.  */
	      if (!directly_supported_p (rhs_code, vectype, optab_vector))
		{
		  /* No vector/vector shift, try for a vector/scalar shift.  */
		  if (!directly_supported_p (rhs_code, vectype, optab_scalar))
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
					 "Build SLP failed: "
					 "op not supported by target.\n");
		      if (is_a <bb_vec_info> (vinfo) && i != 0)
			continue;
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  need_same_oprnds = true;
		  first_op1 = gimple_assign_rhs2 (stmt);
		}
	    }
	  else if (rhs_code == WIDEN_LSHIFT_EXPR)
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_assign_rhs2 (stmt);
	    }
	  else if (!ldst_p
		   && rhs_code == BIT_FIELD_REF)
	    {
	      tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
	      if (!is_a <bb_vec_info> (vinfo)
		  || TREE_CODE (vec) != SSA_NAME
		  /* When the element types are not compatible we pun the
		     source to the target vectype which requires equal size.  */
		  || ((!VECTOR_TYPE_P (TREE_TYPE (vec))
		       || !types_compatible_p (TREE_TYPE (vectype),
					       TREE_TYPE (TREE_TYPE (vec))))
		      && !operand_equal_p (TYPE_SIZE (vectype),
					   TYPE_SIZE (TREE_TYPE (vec)))))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: "
				     "BIT_FIELD_REF not supported\n");
		  /* Fatal mismatch.  */
		  matches[0] = false;
		  return false;
		}
	    }
	  else if (rhs_code == CFN_DIV_POW2)
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_call_arg (call_stmt, 1);
	    }
	  else if (rhs_code == CFN_GOMP_SIMD_LANE)
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_call_arg (call_stmt, 1);
	    }
	}
      else
	{
	  if (first_reduc_idx != STMT_VINFO_REDUC_IDX (stmt_info)
	      /* For SLP reduction groups the index isn't necessarily
		 uniform but only that of the first stmt matters.  */
	      && !(first_reduc_idx != -1
		   && STMT_VINFO_REDUC_IDX (stmt_info) != -1
		   && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
	      /* A differing index is also OK for a commutative op where
		 the reduction operand is simply on the other side.  */
	      && !(first_reduc_idx != -1
		   && STMT_VINFO_REDUC_IDX (stmt_info) != -1
		   && rhs_code.is_tree_code ()
		   && commutative_tree_code (tree_code (rhs_code))
		   && first_reduc_idx == 1 - STMT_VINFO_REDUC_IDX (stmt_info)))
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: different reduc_idx "
				   "%d instead of %d in %G",
				   STMT_VINFO_REDUC_IDX (stmt_info),
				   first_reduc_idx, stmt);
		}
	      /* Mismatch.  */
	      continue;
	    }
	  /* Remember the first operation code that differs from the
	     leader's; it may still yield a two-operator node below.  */
	  if (!ldst_p
	      && first_stmt_code != rhs_code
	      && alt_stmt_code == ERROR_MARK)
	    alt_stmt_code = rhs_code;
	  if ((!ldst_p
	       && first_stmt_code != rhs_code
	       && (first_stmt_code != IMAGPART_EXPR
		   || rhs_code != REALPART_EXPR)
	       && (first_stmt_code != REALPART_EXPR
		   || rhs_code != IMAGPART_EXPR)
	       /* Handle mismatches in plus/minus by computing both
		  and merging the results.  */
	       && !((((first_stmt_code == PLUS_EXPR
		       || first_stmt_code == MINUS_EXPR)
		      && (alt_stmt_code == PLUS_EXPR
			  || alt_stmt_code == MINUS_EXPR))
		     || ((first_stmt_code == CFN_FMA
			  || first_stmt_code == CFN_FMS)
			 && (alt_stmt_code == CFN_FMA
			     || alt_stmt_code == CFN_FMS)))
		    && rhs_code == alt_stmt_code)
	       /* A swapped comparison is also acceptable; SWAP[i] is set
		  for it further below.  */
	       && !(first_stmt_code.is_tree_code ()
		    && rhs_code.is_tree_code ()
		    && (TREE_CODE_CLASS (tree_code (first_stmt_code))
			== tcc_comparison)
		    && (swap_tree_comparison (tree_code (first_stmt_code))
			== tree_code (rhs_code))
		    && (first_reduc_idx == -1
			|| REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
	      || (ldst_p
		  && (STMT_VINFO_GROUPED_ACCESS (stmt_info)
		      != STMT_VINFO_GROUPED_ACCESS (first_stmt_info)))
	      || (ldst_p
		  && (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
		      != STMT_VINFO_GATHER_SCATTER_P (first_stmt_info)))
	      || first_stmt_ldst_p != ldst_p
	      || (ldst_p && first_stmt_ldst_masklen_p != ldst_masklen_p)
	      || first_stmt_phi_p != phi_p)
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: different operation "
				   "in stmt %G", stmt);
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "original stmt %G", first_stmt_info->stmt);
		}
	      /* Mismatch.  */
	      continue;
	    }

	  /* All BIT_FIELD_REFs in the group must extract from the very
	     same source vector.  */
	  if (!ldst_p
	      && first_stmt_code == BIT_FIELD_REF
	      && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
		  != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different BIT_FIELD_REF "
				 "arguments in %G", stmt);
	      /* Mismatch.  */
	      continue;
	    }

	  if (call_stmt
	      && first_stmt_code != CFN_MASK_LOAD
	      && first_stmt_code != CFN_MASK_STORE)
	    {
	      if (!is_a <gcall *> (stmts[0]->stmt)
		  || !compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
					  call_stmt, true))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different calls in %G",
				     stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }

	  if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
	      && (gimple_bb (first_stmt_info->stmt)
		  != gimple_bb (stmt_info->stmt)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different BB for PHI "
				 "or possibly trapping operation in %G", stmt);
	      /* Mismatch.  */
	      continue;
	    }

	  if (need_same_oprnds)
	    {
	      tree other_op1 = gimple_arg (stmt, 1);
	      if (!operand_equal_p (first_op1, other_op1, 0))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different shift "
				     "arguments in %G", stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }

	  if (first_lhs
	      && lhs
	      && !types_compatible_p (TREE_TYPE (lhs), TREE_TYPE (first_lhs)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different vector type "
				 "in %G", stmt);
	      /* Mismatch.  */
	      continue;
	    }
	}

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
	{
	  gcc_assert (ldst_p);
	  if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
	    {
	      /* Store.  */
	      gcc_assert (rhs_code == CFN_MASK_STORE
			  || REFERENCE_CLASS_P (lhs)
			  || DECL_P (lhs));
	    }
	  else
	    {
	      /* Load.  */
	      first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
	      if (prev_first_load)
		{
		  /* Check that there are no loads from different interleaving
		     chains in the same node.  */
		  if (prev_first_load != first_load)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					 vect_location,
					 "Build SLP failed: different "
					 "interleaving chains in one node %G",
					 stmt);
		      /* Mismatch.  */
		      continue;
		    }
		}
	      else
		prev_first_load = first_load;
	    }
	}
      /* Non-grouped store or load.  */
      else if (ldst_p)
	{
	  if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))
	      && rhs_code != CFN_GATHER_LOAD
	      && rhs_code != CFN_MASK_GATHER_LOAD
	      && rhs_code != CFN_MASK_LEN_GATHER_LOAD
	      && rhs_code != CFN_SCATTER_STORE
	      && rhs_code != CFN_MASK_SCATTER_STORE
	      && rhs_code != CFN_MASK_LEN_SCATTER_STORE
	      && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)
	      /* Not grouped loads are handled as externals for BB
		 vectorization.  For loop vectorization we can handle
		 splats the same we handle single element interleaving.
		 Likewise we can handle a collection of invariant refs.  */
	      && (is_a <bb_vec_info> (vinfo)
		  || (stmt_info != first_stmt_info
		      && !(integer_zerop (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)))
			   && integer_zerop (DR_STEP (STMT_VINFO_DATA_REF
							(first_stmt_info)))))))
	    {
	      /* Not grouped load.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: not grouped load %G", stmt);

	      if (i != 0)
		continue;
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }
	}
      /* Not memory operation.  */
      else
	{
	  if (!phi_p
	      && rhs_code.is_tree_code ()
	      && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
	      && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
	      && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
	      && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
	      && rhs_code != VIEW_CONVERT_EXPR
	      && rhs_code != CALL_EXPR
	      && rhs_code != BIT_FIELD_REF
	      && rhs_code != SSA_NAME)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: operation unsupported %G",
				 stmt);
	      if (is_a <bb_vec_info> (vinfo) && i != 0)
		continue;
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

	  if (rhs_code == COND_EXPR)
	    {
	      tree cond_expr = gimple_assign_rhs1 (stmt);
	      enum tree_code cond_code = TREE_CODE (cond_expr);
	      enum tree_code swap_code = ERROR_MARK;
	      enum tree_code invert_code = ERROR_MARK;

	      if (i == 0)
		first_cond_code = TREE_CODE (cond_expr);
	      else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
		{
		  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
		  swap_code = swap_tree_comparison (cond_code);
		  invert_code = invert_tree_comparison (cond_code, honor_nans);
		}

	      if (first_cond_code == cond_code)
		;
	      /* Isomorphic can be achieved by swapping.  */
	      else if (first_cond_code == swap_code)
		swap[i] = 1;
	      /* Isomorphic can be achieved by inverting.  */
	      else if (first_cond_code == invert_code)
		swap[i] = 2;
	      else
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different"
				     " operation %G", stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }

	  /* A comparison that matched via swap_tree_comparison above
	     needs its operands swapped during transform.  */
	  if (i != 0
	      && first_stmt_code != rhs_code
	      && first_stmt_code.is_tree_code ()
	      && rhs_code.is_tree_code ()
	      && TREE_CODE_CLASS ((tree_code)first_stmt_code) == tcc_comparison
	      && (swap_tree_comparison ((tree_code)first_stmt_code)
		  == (tree_code)rhs_code))
	    swap[i] = 1;

	  /* Likewise a commutative op whose reduction operand sits on
	     the other side than in the first stmt.  */
	  if (i != 0
	      && first_reduc_idx != STMT_VINFO_REDUC_IDX (stmt_info)
	      && first_reduc_idx != -1
	      && STMT_VINFO_REDUC_IDX (stmt_info) != -1
	      && rhs_code.is_tree_code ()
	      && commutative_tree_code (tree_code (rhs_code))
	      && first_reduc_idx == 1 - STMT_VINFO_REDUC_IDX (stmt_info))
	    swap[i] = 1;
	}

      /* All checks passed, lane I is isomorphic to the first lane.  */
      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  /* If we allowed a two-operation SLP node verify the target can cope
     with the permute we are going to use.  */
  if (alt_stmt_code != ERROR_MARK
      && (!alt_stmt_code.is_tree_code ()
	  || (TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference
	      && TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_comparison)))
    {
      *two_operators = true;
    }

  if (maybe_soft_fail)
    {
      /* All lanes matched but the required nunits were rejected by
	 vect_record_max_nunits; report failure in a way that lets the
	 caller split the group.  */
      unsigned HOST_WIDE_INT const_nunits;
      if (!TYPE_VECTOR_SUBPARTS
	    (soft_fail_nunits_vectype).is_constant (&const_nunits)
	  || const_nunits > group_size)
	matches[0] = false;
      else
	{
	  /* With constant vector elements simulate a mismatch at the
	     point we need to split.  */
	  unsigned tail = group_size & (const_nunits - 1);
	  memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
	}
      return false;
    }

  return true;
}
1720 :
/* Traits for the hash_set to record failed SLP builds for a stmt set.
   Note we never remove apart from at destruction time so we do not
   need a special value for deleted that differs from empty.  */
struct bst_traits
{
  /* Keys are vectors of scalar stmts; compared element-wise.  */
  typedef vec <stmt_vec_info> value_type;
  typedef vec <stmt_vec_info> compare_type;
  static inline hashval_t hash (value_type);
  static inline bool equal (value_type existing, value_type candidate);
  /* A vector that was never allocated (or was released) serves as
     both the empty and the deleted marker.  */
  static inline bool is_empty (value_type x) { return !x.exists (); }
  static inline bool is_deleted (value_type x) { return !x.exists (); }
  static const bool empty_zero_p = true;
  static inline void mark_empty (value_type &x) { x.release (); }
  static inline void mark_deleted (value_type &x) { x.release (); }
  /* Removal releases the key vector's storage.  */
  static inline void remove (value_type &x) { x.release (); }
};
1737 : inline hashval_t
1738 93009311 : bst_traits::hash (value_type x)
1739 : {
1740 93009311 : inchash::hash h;
1741 422222311 : for (unsigned i = 0; i < x.length (); ++i)
1742 329213000 : h.add_int (x[i] ? gimple_uid (x[i]->stmt) : -1);
1743 93009311 : return h.end ();
1744 : }
1745 : inline bool
1746 81408377 : bst_traits::equal (value_type existing, value_type candidate)
1747 : {
1748 244225131 : if (existing.length () != candidate.length ())
1749 : return false;
1750 82862564 : for (unsigned i = 0; i < existing.length (); ++i)
1751 78536704 : if (existing[i] != candidate[i])
1752 : return false;
1753 : return true;
1754 : }
1755 :
/* Map from a set of scalar stmts to the SLP node built for them,
   used to CSE SLP tree construction.  The map holds a reference on
   each recorded node (see release_scalar_stmts_to_slp_tree_map).  */
typedef hash_map <vec <stmt_vec_info>, slp_tree,
		  simple_hashmap_traits <bst_traits, slp_tree> >
  scalar_stmts_to_slp_tree_map_t;
1759 :
1760 : /* Release BST_MAP. */
1761 :
1762 : static void
1763 1782071 : release_scalar_stmts_to_slp_tree_map (scalar_stmts_to_slp_tree_map_t *bst_map)
1764 : {
1765 : /* The map keeps a reference on SLP nodes built, release that. */
1766 10937990 : for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
1767 20093909 : it != bst_map->end (); ++it)
1768 9155919 : if ((*it).second)
1769 9155919 : vect_free_slp_tree ((*it).second);
1770 1782071 : delete bst_map;
1771 1782071 : }
1772 :
/* One element of a linearized associatable chain: the operation applied
   to the operand, the operand's definition kind and the operand itself.
   ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree>
   but then vec::insert does memmove and that's not compatible with
   std::pair.  */
struct chain_op_t
{
  chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
      : code (code_), dt (dt_), op (op_) {}
  tree_code code;	/* Operation consuming OP, e.g. PLUS_EXPR.  */
  vect_def_type dt;	/* How OP is defined.  */
  tree op;		/* The operand value.  */
};
1784 :
1785 : /* Comparator for sorting associatable chains. */
1786 :
1787 : static int
1788 8142393 : dt_sort_cmp (const void *op1_, const void *op2_, void *)
1789 : {
1790 8142393 : auto *op1 = (const chain_op_t *) op1_;
1791 8142393 : auto *op2 = (const chain_op_t *) op2_;
1792 8142393 : if (op1->dt != op2->dt)
1793 936800 : return (int)op1->dt - (int)op2->dt;
1794 7205593 : return (int)op1->code - (int)op2->code;
1795 : }
1796 :
/* Linearize the associatable expression chain at START with the
   associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
   filling CHAIN with the result and using WORKLIST as intermediate storage.
   CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
   or MINUS_EXPR.  *CHAIN_STMTS if not NULL is filled with all computation
   stmts, starting with START.  When ALLOW_ALT_CODE is false, do not
   follow into MINUS_EXPR when building a PLUS chain (treat MINUS as leaf).  */

static void
vect_slp_linearize_chain (vec_info *vinfo,
			  vec<std::pair<tree_code, gimple *> > &worklist,
			  vec<chain_op_t> &chain,
			  enum tree_code code, gimple *start,
			  gimple *&code_stmt, gimple *&alt_code_stmt,
			  vec<gimple *> *chain_stmts,
			  bool allow_alt_code = true)
{
  /* For each lane linearize the addition/subtraction (or other
     uniform associatable operation) expression tree.  The worklist
     entries pair a stmt with the effective code it was reached under,
     so signs flip correctly through subtracted subexpressions.  */
  worklist.safe_push (std::make_pair (code, start));
  while (!worklist.is_empty ())
    {
      auto entry = worklist.pop ();
      gassign *stmt = as_a <gassign *> (entry.second);
      enum tree_code in_code = entry.first;
      enum tree_code this_code = gimple_assign_rhs_code (stmt);
      /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE.  */
      if (!code_stmt
	  && gimple_assign_rhs_code (stmt) == code)
	code_stmt = stmt;
      else if (!alt_code_stmt
	       && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
	alt_code_stmt = stmt;
      if (chain_stmts)
	chain_stmts->safe_push (stmt);
      for (unsigned opnum = 1; opnum <= 2; ++opnum)
	{
	  tree op = gimple_op (stmt, opnum);
	  vect_def_type dt;
	  stmt_vec_info def_stmt_info;
	  bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
	  gcc_assert (res);
	  /* For pattern stmts work on the pattern stmt's result.  */
	  if (dt == vect_internal_def
	      && is_pattern_stmt_p (def_stmt_info))
	    op = gimple_get_lhs (def_stmt_info->stmt);
	  gimple *use_stmt;
	  use_operand_p use_p;
	  /* Recurse into a single-used internal def computed with CODE
	     (or, when allowed, with MINUS_EXPR in a PLUS chain);
	     everything else becomes a chain leaf.  */
	  if (dt == vect_internal_def
	      && single_imm_use (op, &use_p, &use_stmt)
	      && is_gimple_assign (def_stmt_info->stmt)
	      && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
		  || (allow_alt_code
		      && code == PLUS_EXPR
		      && (gimple_assign_rhs_code (def_stmt_info->stmt)
			  == MINUS_EXPR))))
	    {
	      /* The first operand of a subtraction enters with positive
		 sign ...  */
	      tree_code op_def_code = this_code;
	      if (op_def_code == MINUS_EXPR && opnum == 1)
		op_def_code = PLUS_EXPR;
	      /* ... and reaching this stmt through a subtracted operand
		 flips the sign once more.  */
	      if (in_code == MINUS_EXPR)
		op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
	      worklist.safe_push (std::make_pair (op_def_code,
						  def_stmt_info->stmt));
	    }
	  else
	    {
	      /* Record a leaf with the same sign adjustment as above.  */
	      tree_code op_def_code = this_code;
	      if (op_def_code == MINUS_EXPR && opnum == 1)
		op_def_code = PLUS_EXPR;
	      if (in_code == MINUS_EXPR)
		op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
	      chain.safe_push (chain_op_t (op_def_code, dt, op));
	    }
	}
    }
}
1873 :
/* Forward declaration; vect_build_slp_tree_2 and vect_build_slp_tree
   are mutually recursive.  */
static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits,
		       bool *matches, unsigned *limit, unsigned *tree_size,
		       scalar_stmts_to_slp_tree_map_t *bst_map);
1880 :
/* Memoizing wrapper around vect_build_slp_tree_2.  Build the SLP tree
   for the GROUP_SIZE scalar stmts STMTS, caching both successful and
   failed discovery results in BST_MAP so the same stmt set is only
   discovered once.  Updates *MAX_NUNITS and *TREE_SIZE; *LIMIT bounds
   multi-lane discovery attempts.  Returns the node (with a reference
   for the caller in addition to the one kept by BST_MAP) or NULL on
   failure, in which case MATCHES records which lanes mismatched.  */
static slp_tree
vect_build_slp_tree (vec_info *vinfo,
		     vec<stmt_vec_info> stmts, unsigned int group_size,
		     poly_uint64 *max_nunits,
		     bool *matches, unsigned *limit, unsigned *tree_size,
		     scalar_stmts_to_slp_tree_map_t *bst_map)
{
  /* First consult the cache; a hit may be a success or a recorded
     failure.  */
  if (slp_tree *leader = bst_map->get (stmts))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
			 !(*leader)->failed ? "" : "failed ",
			 (void *) *leader);
      if (!(*leader)->failed)
	{
	  SLP_TREE_REF_COUNT (*leader)++;
	  vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
	  /* The cached key owns an equal stmt vector; drop ours.  */
	  stmts.release ();
	  return *leader;
	}
      /* Replay the per-lane mismatches of the cached failure.  */
      memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
      return NULL;
    }

  /* Single-lane SLP doesn't have the chance of run-away, do not account
     it to the limit.  */
  if (stmts.length () > 1)
    {
      if (*limit == 0)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "SLP discovery limit exceeded\n");
	  memset (matches, 0, sizeof (bool) * group_size);
	  return NULL;
	}
      --*limit;
    }

  /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
     so we can pick up backedge destinations during discovery.  */
  slp_tree res = new _slp_tree;
  SLP_TREE_DEF_TYPE (res) = vect_internal_def;
  SLP_TREE_SCALAR_STMTS (res) = stmts;
  bst_map->put (stmts.copy (), res);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "starting SLP discovery for node %p\n", (void *) res);

  poly_uint64 this_max_nunits = 1;
  slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
					 &this_max_nunits,
					 matches, limit, tree_size, bst_map);
  if (!res_)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "SLP discovery for node %p failed\n", (void *) res);
      /* Mark the node invalid so we can detect those when still in use
	 as backedge destinations.  */
      SLP_TREE_SCALAR_STMTS (res) = vNULL;
      SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
      res->failed = XNEWVEC (bool, group_size);
      if (flag_checking)
	{
	  /* A failure must report at least one mismatched lane.  */
	  unsigned i;
	  for (i = 0; i < group_size; ++i)
	    if (!matches[i])
	      break;
	  gcc_assert (i < group_size);
	}
      memcpy (res->failed, matches, sizeof (bool) * group_size);
    }
  else
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "SLP discovery for node %p succeeded\n",
			 (void *) res);
      gcc_assert (res_ == res);
      res->max_nunits = this_max_nunits;
      vect_update_max_nunits (max_nunits, this_max_nunits);
      /* Keep a reference for the bst_map use.  */
      SLP_TREE_REF_COUNT (res)++;
    }
  return res_;
}
1969 :
/* Helper for building an associated SLP node chain.  Turn PERM into a
   VEC_PERM_EXPR node of VECTYPE that selects lanes according to LPERM
   from two freshly built internal nodes, both computing on OP0 and OP1
   with OPER1 resp. OPER2 as representative stmts.  NOTE(review): PERM's
   children are added with quick_push, so SLP_TREE_CHILDREN (PERM) must
   already have capacity for two entries -- presumably ensured by the
   caller's node creation; confirm against callers.  */

static void
vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
				   slp_tree op0, slp_tree op1,
				   stmt_vec_info oper1, stmt_vec_info oper2,
				   vec<std::pair<unsigned, unsigned> > lperm)
{
  unsigned group_size = SLP_TREE_LANES (op1);

  /* First operand node, represented by OPER1.  */
  slp_tree child1 = new _slp_tree;
  SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
  SLP_TREE_VECTYPE (child1) = vectype;
  SLP_TREE_LANES (child1) = group_size;
  SLP_TREE_CHILDREN (child1).create (2);
  SLP_TREE_CHILDREN (child1).quick_push (op0);
  SLP_TREE_CHILDREN (child1).quick_push (op1);
  SLP_TREE_REPRESENTATIVE (child1) = oper1;

  /* Second operand node, represented by OPER2.  It uses OP0 and OP1 a
     second time, so bump their reference counts for the extra use.  */
  slp_tree child2 = new _slp_tree;
  SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
  SLP_TREE_VECTYPE (child2) = vectype;
  SLP_TREE_LANES (child2) = group_size;
  SLP_TREE_CHILDREN (child2).create (2);
  SLP_TREE_CHILDREN (child2).quick_push (op0);
  SLP_TREE_REF_COUNT (op0)++;
  SLP_TREE_CHILDREN (child2).quick_push (op1);
  SLP_TREE_REF_COUNT (op1)++;
  SLP_TREE_REPRESENTATIVE (child2) = oper2;

  /* Finally set up PERM as the lane blend of the two children.  */
  SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
  SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
  SLP_TREE_VECTYPE (perm) = vectype;
  SLP_TREE_LANES (perm) = group_size;
  /* ??? We should set this NULL but that's not expected.  */
  SLP_TREE_REPRESENTATIVE (perm) = oper1;
  SLP_TREE_LANE_PERMUTATION (perm) = lperm;
  SLP_TREE_CHILDREN (perm).quick_push (child1);
  SLP_TREE_CHILDREN (perm).quick_push (child2);
}
2010 :
2011 : /* Recursively build an SLP tree starting from NODE.
2012 : Fail (and return a value not equal to zero) if def-stmts are not
2013 : isomorphic, require data permutation or are of unsupported types of
2014 : operation. Otherwise, return 0.
2015 : The value returned is the depth in the SLP tree where a mismatch
2016 : was found. */
2017 :
2018 : static slp_tree
2019 5712743 : vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
2020 : vec<stmt_vec_info> stmts, unsigned int group_size,
2021 : poly_uint64 *max_nunits,
2022 : bool *matches, unsigned *limit, unsigned *tree_size,
2023 : scalar_stmts_to_slp_tree_map_t *bst_map)
2024 : {
2025 5712743 : unsigned nops, i, this_tree_size = 0;
2026 5712743 : poly_uint64 this_max_nunits = *max_nunits;
2027 :
2028 5712743 : matches[0] = false;
2029 :
2030 5712743 : stmt_vec_info stmt_info = stmts[0];
2031 5712743 : if (!is_a<gcall *> (stmt_info->stmt)
2032 : && !is_a<gassign *> (stmt_info->stmt)
2033 : && !is_a<gphi *> (stmt_info->stmt))
2034 : return NULL;
2035 :
2036 5712672 : nops = gimple_num_args (stmt_info->stmt);
2037 5712672 : if (const int *map = vect_get_operand_map (stmt_info))
2038 35090 : nops = map[0];
2039 :
2040 : /* If the SLP node is a PHI (induction or reduction), terminate
2041 : the recursion. */
2042 5712672 : bool *skip_args = XALLOCAVEC (bool, nops);
2043 5712672 : memset (skip_args, 0, sizeof (bool) * nops);
2044 5712672 : if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2045 2768061 : if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
2046 : {
2047 298015 : tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
2048 298015 : tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
2049 : group_size);
2050 298015 : if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
2051 : max_nunits))
2052 : return NULL;
2053 :
2054 294141 : vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
2055 294141 : if (def_type == vect_induction_def)
2056 : {
2057 : /* Induction PHIs are not cycles but walk the initial
2058 : value. Only for inner loops through, for outer loops
2059 : we need to pick up the value from the actual PHIs
2060 : to more easily support peeling and epilogue vectorization. */
2061 188288 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2062 188288 : if (!nested_in_vect_loop_p (loop, stmt_info))
2063 187464 : skip_args[loop_preheader_edge (loop)->dest_idx] = true;
2064 : else
2065 : loop = loop->inner;
2066 188288 : skip_args[loop_latch_edge (loop)->dest_idx] = true;
2067 : }
2068 105853 : else if (def_type == vect_reduction_def
2069 : || def_type == vect_double_reduction_def
2070 : || def_type == vect_nested_cycle
2071 105853 : || def_type == vect_first_order_recurrence)
2072 : {
2073 : /* Else def types have to match. */
2074 : stmt_vec_info other_info;
2075 : bool all_same = true;
2076 239614 : FOR_EACH_VEC_ELT (stmts, i, other_info)
2077 : {
2078 135075 : if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
2079 1739083 : return NULL;
2080 135069 : if (other_info != stmt_info)
2081 26183 : all_same = false;
2082 : }
2083 104539 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2084 : /* Reduction initial values are not explicitly represented. */
2085 104539 : if (def_type != vect_first_order_recurrence
2086 104539 : && gimple_bb (stmt_info->stmt) == loop->header)
2087 101394 : skip_args[loop_preheader_edge (loop)->dest_idx] = true;
2088 : /* Reduction chain backedge defs are filled manually.
2089 : ??? Need a better way to identify a SLP reduction chain PHI.
2090 : Or a better overall way to SLP match those. */
2091 104539 : if (stmts.length () > 1
2092 104539 : && all_same && def_type == vect_reduction_def)
2093 2311 : skip_args[loop_latch_edge (loop)->dest_idx] = true;
2094 : }
2095 1308 : else if (def_type != vect_internal_def)
2096 : return NULL;
2097 : }
2098 :
2099 :
2100 5708792 : bool two_operators = false;
2101 5708792 : unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
2102 5708792 : tree vectype = NULL_TREE;
2103 5708792 : if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
2104 : &this_max_nunits, matches, &two_operators,
2105 : &vectype))
2106 : return NULL;
2107 :
2108 : /* If the SLP node is a load, terminate the recursion unless masked. */
2109 4186068 : if (STMT_VINFO_DATA_REF (stmt_info)
2110 2022333 : && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2111 : {
2112 895949 : if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2113 : gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
2114 : else
2115 : {
2116 877367 : *max_nunits = this_max_nunits;
2117 877367 : (*tree_size)++;
2118 877367 : node = vect_create_new_slp_node (node, stmts, 0);
2119 877367 : SLP_TREE_VECTYPE (node) = vectype;
2120 : /* And compute the load permutation. Whether it is actually
2121 : a permutation depends on the unrolling factor which is
2122 : decided later. */
2123 877367 : vec<unsigned> load_permutation;
2124 877367 : int j;
2125 877367 : stmt_vec_info load_info;
2126 877367 : load_permutation.create (group_size);
2127 877367 : stmt_vec_info first_stmt_info
2128 877367 : = STMT_VINFO_GROUPED_ACCESS (stmt_info)
2129 877367 : ? DR_GROUP_FIRST_ELEMENT (stmt_info) : stmt_info;
2130 877367 : bool any_permute = false;
2131 2114986 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
2132 : {
2133 1237619 : int load_place;
2134 1237619 : if (! load_info)
2135 : {
2136 39944 : if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2137 : load_place = j;
2138 : else
2139 : load_place = 0;
2140 : }
2141 1197675 : else if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2142 698766 : load_place = vect_get_place_in_interleaving_chain
2143 698766 : (load_info, first_stmt_info);
2144 : else
2145 : /* Recognize the splat case as { 0, 0, ... } but make
2146 : sure to use the appropriate refs for collections
2147 : of invariant refs. */
2148 498909 : load_place = (load_info == stmt_info) ? 0 : j;
2149 738951 : gcc_assert (load_place != -1);
2150 1237619 : any_permute |= load_place != j;
2151 1237619 : load_permutation.quick_push (load_place);
2152 : }
2153 :
2154 877367 : if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
2155 : {
2156 3406 : gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
2157 3406 : bool has_gaps = false;
2158 3406 : if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2159 209 : for (stmt_vec_info si = DR_GROUP_NEXT_ELEMENT (first_stmt_info);
2160 1346 : si; si = DR_GROUP_NEXT_ELEMENT (si))
2161 1137 : if (DR_GROUP_GAP (si) != 1)
2162 160 : has_gaps = true;
2163 : /* We cannot handle permuted masked loads directly, see
2164 : PR114375. We cannot handle strided masked loads or masked
2165 : loads with gaps unless the mask is uniform. */
2166 3406 : if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
2167 209 : && (DR_GROUP_GAP (first_stmt_info) != 0
2168 149 : || (has_gaps
2169 55 : && STMT_VINFO_SLP_VECT_ONLY (first_stmt_info))))
2170 6717 : || STMT_VINFO_STRIDED_P (stmt_info))
2171 : {
2172 108 : load_permutation.release ();
2173 108 : matches[0] = false;
2174 874113 : return NULL;
2175 : }
2176 :
2177 : /* For permuted masked loads do an unpermuted masked load of
2178 : the whole group followed by a SLP permute node. */
2179 3298 : if (any_permute
2180 3298 : || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2181 84 : && DR_GROUP_SIZE (first_stmt_info) != group_size))
2182 : {
2183 : /* Discover the whole unpermuted load. */
2184 44 : vec<stmt_vec_info> stmts2;
2185 44 : unsigned dr_group_size = STMT_VINFO_GROUPED_ACCESS (stmt_info)
2186 78 : ? DR_GROUP_SIZE (first_stmt_info) : 1;
2187 44 : stmts2.create (dr_group_size);
2188 44 : stmts2.quick_grow_cleared (dr_group_size);
2189 44 : unsigned i = 0;
2190 44 : for (stmt_vec_info si = first_stmt_info;
2191 594 : si; si = DR_GROUP_NEXT_ELEMENT (si))
2192 : {
2193 550 : if (si != first_stmt_info)
2194 2106 : for (unsigned k = 1; k < DR_GROUP_GAP (si); ++k)
2195 1600 : stmts2[i++] = NULL;
2196 550 : stmts2[i++] = si;
2197 : }
2198 44 : bool *matches2 = XALLOCAVEC (bool, dr_group_size);
2199 44 : slp_tree unperm_load
2200 44 : = vect_build_slp_tree (vinfo, stmts2, dr_group_size,
2201 : &this_max_nunits, matches2, limit,
2202 44 : &this_tree_size, bst_map);
2203 : /* When we are able to do the full masked load emit that
2204 : followed by 'node' being the desired final permutation. */
2205 44 : if (unperm_load)
2206 : {
2207 16 : gcc_assert
2208 : (!SLP_TREE_LOAD_PERMUTATION (unperm_load).exists ());
2209 16 : lane_permutation_t lperm;
2210 16 : lperm.create (group_size);
2211 56 : for (unsigned j = 0; j < load_permutation.length (); ++j)
2212 40 : lperm.quick_push
2213 40 : (std::make_pair (0, load_permutation[j]));
2214 16 : SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2215 16 : SLP_TREE_CHILDREN (node).safe_push (unperm_load);
2216 16 : SLP_TREE_LANE_PERMUTATION (node) = lperm;
2217 16 : load_permutation.release ();
2218 16 : return node;
2219 : }
2220 28 : stmts2.release ();
2221 28 : load_permutation.release ();
2222 28 : matches[0] = false;
2223 28 : return NULL;
2224 : }
2225 3254 : load_permutation.release ();
2226 : }
2227 : else
2228 : {
2229 873961 : if (!any_permute
2230 761472 : && STMT_VINFO_GROUPED_ACCESS (stmt_info)
2231 1162510 : && group_size == DR_GROUP_SIZE (first_stmt_info))
2232 126151 : load_permutation.release ();
2233 873961 : SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
2234 873961 : return node;
2235 : }
2236 : }
2237 : }
2238 3290119 : else if (gimple_assign_single_p (stmt_info->stmt)
2239 2257228 : && !gimple_vuse (stmt_info->stmt)
2240 3297915 : && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
2241 : {
2242 : /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
2243 : the same SSA name vector of a compatible type to vectype. */
2244 2367 : vec<std::pair<unsigned, unsigned> > lperm = vNULL;
2245 2367 : tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
2246 2367 : stmt_vec_info estmt_info;
2247 7443 : FOR_EACH_VEC_ELT (stmts, i, estmt_info)
2248 : {
2249 5223 : gassign *estmt = as_a <gassign *> (estmt_info->stmt);
2250 5223 : tree bfref = gimple_assign_rhs1 (estmt);
2251 5223 : HOST_WIDE_INT lane;
2252 5223 : if (!known_eq (bit_field_size (bfref),
2253 : tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
2254 10299 : || !constant_multiple_p (bit_field_offset (bfref),
2255 5076 : bit_field_size (bfref), &lane))
2256 : {
2257 147 : lperm.release ();
2258 147 : matches[0] = false;
2259 147 : return NULL;
2260 : }
2261 5076 : lperm.safe_push (std::make_pair (0, (unsigned)lane));
2262 : }
2263 2220 : slp_tree vnode = vect_create_new_slp_node (vNULL);
2264 2220 : if (operand_equal_p (TYPE_SIZE (vectype), TYPE_SIZE (TREE_TYPE (vec))))
2265 : /* ??? We record vectype here but we hide eventually necessary
2266 : punning and instead rely on code generation to materialize
2267 : VIEW_CONVERT_EXPRs as necessary. We instead should make
2268 : this explicit somehow. */
2269 704 : SLP_TREE_VECTYPE (vnode) = vectype;
2270 : else
2271 : {
2272 : /* For different size but compatible elements we can still
2273 : use VEC_PERM_EXPR without punning. */
2274 1516 : gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec))
2275 : && types_compatible_p (TREE_TYPE (vectype),
2276 : TREE_TYPE (TREE_TYPE (vec))));
2277 1516 : SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
2278 : }
2279 2220 : auto nunits = TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (vnode));
2280 2220 : unsigned HOST_WIDE_INT const_nunits;
2281 2220 : if (nunits.is_constant (&const_nunits))
2282 2220 : SLP_TREE_LANES (vnode) = const_nunits;
2283 2220 : SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
2284 : /* We are always building a permutation node even if it is an identity
2285 : permute to shield the rest of the vectorizer from the odd node
2286 : representing an actual vector without any scalar ops.
2287 : ??? We could hide it completely with making the permute node
2288 : external? */
2289 2220 : node = vect_create_new_slp_node (node, stmts, 1);
2290 2220 : SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2291 2220 : SLP_TREE_LANE_PERMUTATION (node) = lperm;
2292 2220 : SLP_TREE_VECTYPE (node) = vectype;
2293 2220 : SLP_TREE_CHILDREN (node).quick_push (vnode);
2294 2220 : return node;
2295 : }
2296 : /* When discovery reaches an associatable operation see whether we can
2297 : improve that to match up lanes in a way superior to the operand
2298 : swapping code which at most looks at two defs.
2299 : ??? For BB vectorization we cannot do the brute-force search
2300 : for matching as we can succeed by means of builds from scalars
2301 : and have no good way to "cost" one build against another. */
2302 3287752 : else if (is_a <loop_vec_info> (vinfo)
2303 : /* Do not bother for single-lane SLP. */
2304 1955914 : && group_size > 1
2305 : /* ??? We don't handle !vect_internal_def defs below. */
2306 111410 : && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2307 : /* ??? Do not associate a reduction, this will wreck REDUC_IDX
2308 : mapping as long as that exists on the stmt_info level. */
2309 86051 : && STMT_VINFO_REDUC_IDX (stmt_info) == -1
2310 77556 : && is_gimple_assign (stmt_info->stmt)
2311 77242 : && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
2312 50678 : || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
2313 3316224 : && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
2314 16258 : || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
2315 13736 : && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
2316 : {
2317 : /* See if we have a chain of (mixed) adds or subtracts or other
2318 : associatable ops. */
2319 21439 : enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
2320 21439 : if (code == MINUS_EXPR)
2321 796 : code = PLUS_EXPR;
2322 21439 : stmt_vec_info other_op_stmt_info = NULL;
2323 21439 : stmt_vec_info op_stmt_info = NULL;
2324 21439 : unsigned chain_len = 0;
2325 21439 : auto_vec<chain_op_t> chain;
2326 21439 : auto_vec<std::pair<tree_code, gimple *> > worklist;
2327 21439 : auto_vec<vec<chain_op_t> > chains (group_size);
2328 21439 : auto_vec<slp_tree, 4> children;
2329 21439 : bool hard_fail = true;
2330 22506 : for (unsigned lane = 0; lane < group_size; ++lane)
2331 : {
2332 22150 : if (!stmts[lane])
2333 : {
2334 : /* ??? Below we require lane zero is present. */
2335 0 : if (lane == 0)
2336 : {
2337 : hard_fail = false;
2338 21083 : break;
2339 : }
2340 0 : chains.quick_push (vNULL);
2341 0 : continue;
2342 : }
2343 : /* For each lane linearize the addition/subtraction (or other
2344 : uniform associatable operation) expression tree. */
2345 22150 : gimple *op_stmt = NULL, *other_op_stmt = NULL;
2346 22150 : vect_slp_linearize_chain (vinfo, worklist, chain, code,
2347 22150 : stmts[lane]->stmt, op_stmt, other_op_stmt,
2348 : NULL);
2349 22150 : if (!op_stmt_info && op_stmt)
2350 20860 : op_stmt_info = vinfo->lookup_stmt (op_stmt);
2351 22150 : if (!other_op_stmt_info && other_op_stmt)
2352 832 : other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
2353 22150 : if (chain.length () == 2)
2354 : {
2355 : /* In a chain of just two elements resort to the regular
2356 : operand swapping scheme. Likewise if we run into a
2357 : length mismatch process regularly as well as we did not
2358 : process the other lanes we cannot report a good hint what
2359 : lanes to try swapping in the parent. */
2360 : hard_fail = false;
2361 : break;
2362 : }
2363 1070 : else if (chain_len == 0)
2364 396 : chain_len = chain.length ();
2365 1348 : else if (chain.length () != chain_len)
2366 : {
2367 : /* ??? Here we could slip in magic to compensate with
2368 : neutral operands. */
2369 3 : matches[lane] = false;
2370 3 : if (lane != group_size - 1)
2371 3 : matches[0] = false;
2372 : break;
2373 : }
2374 1067 : chains.quick_push (chain.copy ());
2375 1067 : chain.truncate (0);
2376 : }
2377 42878 : if (chains.length () == group_size)
2378 : {
2379 : /* We cannot yet use SLP_TREE_CODE to communicate the operation. */
2380 356 : if (!op_stmt_info)
2381 : {
2382 3 : hard_fail = false;
2383 3 : goto out;
2384 : }
2385 : /* Now we have a set of chains with the same length. */
2386 : /* 1. pre-sort according to def_type and operation. */
2387 1308 : for (unsigned lane = 0; lane < group_size; ++lane)
2388 1910 : chains[lane].stablesort (dt_sort_cmp, vinfo);
2389 353 : if (dump_enabled_p ())
2390 : {
2391 157 : dump_printf_loc (MSG_NOTE, vect_location,
2392 : "pre-sorted chains of %s\n",
2393 : get_tree_code_name (code));
2394 685 : for (unsigned lane = 0; lane < group_size; ++lane)
2395 : {
2396 528 : if (!stmts[lane])
2397 0 : dump_printf (MSG_NOTE, "--");
2398 : else
2399 2422 : for (unsigned opnum = 0; opnum < chain_len; ++opnum)
2400 3788 : dump_printf (MSG_NOTE, "%s %T ",
2401 1894 : get_tree_code_name (chains[lane][opnum].code),
2402 1894 : chains[lane][opnum].op);
2403 528 : dump_printf (MSG_NOTE, "\n");
2404 : }
2405 : }
2406 : /* 2. try to build children nodes, associating as necessary. */
2407 : /* 2a. prepare and perform early checks to avoid eating into
2408 : discovery limit unnecessarily. */
2409 353 : vect_def_type *dts = XALLOCAVEC (vect_def_type, chain_len);
2410 1487 : for (unsigned n = 0; n < chain_len; ++n)
2411 : {
2412 1134 : vect_def_type dt = chains[0][n].dt;
2413 1134 : unsigned lane;
2414 4357 : for (lane = 0; lane < group_size; ++lane)
2415 6446 : if (stmts[lane] && chains[lane][n].dt != dt)
2416 : {
2417 0 : if (dt == vect_constant_def
2418 0 : && chains[lane][n].dt == vect_external_def)
2419 : dt = vect_external_def;
2420 0 : else if (dt == vect_external_def
2421 0 : && chains[lane][n].dt == vect_constant_def)
2422 : ;
2423 : else
2424 : break;
2425 : }
2426 1134 : if (lane != group_size)
2427 : {
2428 0 : if (dump_enabled_p ())
2429 0 : dump_printf_loc (MSG_NOTE, vect_location,
2430 : "giving up on chain due to mismatched "
2431 : "def types\n");
2432 0 : matches[lane] = false;
2433 0 : if (lane != group_size - 1)
2434 0 : matches[0] = false;
2435 0 : goto out;
2436 : }
2437 1134 : dts[n] = dt;
2438 1134 : if (dt == vect_constant_def
2439 1134 : || dt == vect_external_def)
2440 : {
2441 : /* Check whether we can build the invariant. If we can't
2442 : we never will be able to. */
2443 93 : tree type = TREE_TYPE (chains[0][n].op);
2444 1134 : if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
2445 : && (TREE_CODE (type) == BOOLEAN_TYPE
2446 : || !can_duplicate_and_interleave_p (vinfo, group_size,
2447 : type)))
2448 : {
2449 : matches[0] = false;
2450 : goto out;
2451 : }
2452 : }
2453 1041 : else if (dt != vect_internal_def)
2454 : {
2455 : /* Not sure, we might need sth special.
2456 : gcc.dg/vect/pr96854.c,
2457 : gfortran.dg/vect/fast-math-pr37021.f90
2458 : and gfortran.dg/vect/pr61171.f trigger. */
2459 : /* Soft-fail for now. */
2460 0 : hard_fail = false;
2461 0 : goto out;
2462 : }
2463 : }
2464 : /* 2b. do the actual build. */
2465 1429 : for (unsigned n = 0; n < chain_len; ++n)
2466 : {
2467 1096 : vect_def_type dt = dts[n];
2468 1096 : unsigned lane;
2469 1096 : if (dt == vect_constant_def
2470 1096 : || dt == vect_external_def)
2471 : {
2472 93 : vec<tree> ops;
2473 93 : ops.create (group_size);
2474 461 : for (lane = 0; lane < group_size; ++lane)
2475 275 : if (stmts[lane])
2476 275 : ops.quick_push (chains[lane][n].op);
2477 : else
2478 0 : ops.quick_push (NULL_TREE);
2479 93 : slp_tree child = vect_create_new_slp_node (ops);
2480 93 : SLP_TREE_DEF_TYPE (child) = dt;
2481 93 : children.safe_push (child);
2482 : }
2483 : else
2484 : {
2485 1003 : vec<stmt_vec_info> op_stmts;
2486 1003 : op_stmts.create (group_size);
2487 1003 : slp_tree child = NULL;
2488 : /* Brute-force our way. We have to consider a lane
2489 : failing after fixing an earlier fail up in the
2490 : SLP discovery recursion. So track the current
2491 : permute per lane. */
2492 1003 : unsigned *perms = XALLOCAVEC (unsigned, group_size);
2493 1003 : memset (perms, 0, sizeof (unsigned) * group_size);
2494 1097 : do
2495 : {
2496 1097 : op_stmts.truncate (0);
2497 5320 : for (lane = 0; lane < group_size; ++lane)
2498 3126 : if (stmts[lane])
2499 3126 : op_stmts.quick_push
2500 3126 : (vinfo->lookup_def (chains[lane][n].op));
2501 : else
2502 0 : op_stmts.quick_push (NULL);
2503 1097 : child = vect_build_slp_tree (vinfo, op_stmts,
2504 : group_size, &this_max_nunits,
2505 : matches, limit,
2506 : &this_tree_size, bst_map);
2507 : /* ??? We're likely getting too many fatal mismatches
2508 : here so maybe we want to ignore them (but then we
2509 : have no idea which lanes fatally mismatched). */
2510 1097 : if (child || !matches[0])
2511 : break;
2512 : /* Swap another lane we have not yet matched up into
2513 : lanes that did not match. If we run out of
2514 : permute possibilities for a lane terminate the
2515 : search. */
2516 287 : bool term = false;
2517 287 : for (lane = 1; lane < group_size; ++lane)
2518 193 : if (!matches[lane])
2519 : {
2520 165 : if (n + perms[lane] + 1 == chain_len)
2521 : {
2522 : term = true;
2523 : break;
2524 : }
2525 146 : if (dump_enabled_p ())
2526 113 : dump_printf_loc (MSG_NOTE, vect_location,
2527 : "swapping operand %d and %d "
2528 : "of lane %d\n",
2529 : n, n + perms[lane] + 1, lane);
2530 292 : std::swap (chains[lane][n],
2531 146 : chains[lane][n + perms[lane] + 1]);
2532 146 : perms[lane]++;
2533 : }
2534 113 : if (term)
2535 : break;
2536 : }
2537 : while (1);
2538 1003 : if (!child)
2539 : {
2540 20 : if (dump_enabled_p ())
2541 18 : dump_printf_loc (MSG_NOTE, vect_location,
2542 : "failed to match up op %d\n", n);
2543 20 : op_stmts.release ();
2544 20 : if (lane != group_size - 1)
2545 10 : matches[0] = false;
2546 : else
2547 10 : matches[lane] = false;
2548 20 : goto out;
2549 : }
2550 983 : if (dump_enabled_p ())
2551 : {
2552 421 : dump_printf_loc (MSG_NOTE, vect_location,
2553 : "matched up op %d to\n", n);
2554 421 : vect_print_slp_tree (MSG_NOTE, vect_location, child);
2555 : }
2556 983 : children.safe_push (child);
2557 : }
2558 : }
2559 : /* 3. build SLP nodes to combine the chain. */
2560 1213 : for (unsigned lane = 0; lane < group_size; ++lane)
2561 1772 : if (stmts[lane] && chains[lane][0].code != code)
2562 : {
2563 : /* See if there's any alternate all-PLUS entry. */
2564 : unsigned n;
2565 6 : for (n = 1; n < chain_len; ++n)
2566 : {
2567 30 : for (lane = 0; lane < group_size; ++lane)
2568 48 : if (stmts[lane] && chains[lane][n].code != code)
2569 : break;
2570 6 : if (lane == group_size)
2571 : break;
2572 : }
2573 6 : if (n != chain_len)
2574 : {
2575 : /* Swap that in at first position. */
2576 6 : std::swap (children[0], children[n]);
2577 30 : for (lane = 0; lane < group_size; ++lane)
2578 24 : if (stmts[lane])
2579 24 : std::swap (chains[lane][0], chains[lane][n]);
2580 : }
2581 : else
2582 : {
2583 : /* ??? When this triggers and we end up with two
2584 : vect_constant/external_def up-front things break (ICE)
2585 : spectacularly finding an insertion place for the
2586 : all-constant op. We should have a fully
2587 : vect_internal_def operand though(?) so we can swap
2588 : that into first place and then prepend the all-zero
2589 : constant. */
2590 0 : if (dump_enabled_p ())
2591 0 : dump_printf_loc (MSG_NOTE, vect_location,
2592 : "inserting constant zero to compensate "
2593 : "for (partially) negated first "
2594 : "operand\n");
2595 0 : chain_len++;
2596 0 : for (lane = 0; lane < group_size; ++lane)
2597 0 : if (stmts[lane])
2598 0 : chains[lane].safe_insert
2599 0 : (0, chain_op_t (code, vect_constant_def, NULL_TREE));
2600 0 : vec<tree> zero_ops;
2601 0 : zero_ops.create (group_size);
2602 0 : zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
2603 0 : for (lane = 1; lane < group_size; ++lane)
2604 0 : if (stmts[lane])
2605 0 : zero_ops.quick_push (zero_ops[0]);
2606 : else
2607 0 : zero_ops.quick_push (NULL_TREE);
2608 0 : slp_tree zero = vect_create_new_slp_node (zero_ops);
2609 0 : SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
2610 0 : children.safe_insert (0, zero);
2611 : }
2612 : break;
2613 : }
2614 1071 : for (unsigned i = 1; i < children.length (); ++i)
2615 : {
2616 738 : slp_tree op0 = children[i - 1];
2617 738 : slp_tree op1 = children[i];
2618 738 : bool this_two_op = false;
2619 2660 : for (unsigned lane = 0; lane < group_size; ++lane)
2620 4200 : if (stmts[lane] && chains[lane][i].code != chains[0][i].code)
2621 : {
2622 : this_two_op = true;
2623 : break;
2624 : }
2625 738 : slp_tree child;
2626 738 : if (i == children.length () - 1)
2627 333 : child = vect_create_new_slp_node (node, stmts, 2);
2628 : else
2629 405 : child = vect_create_new_slp_node (2, ERROR_MARK);
2630 738 : if (this_two_op)
2631 : {
2632 178 : vec<std::pair<unsigned, unsigned> > lperm;
2633 178 : lperm.create (group_size);
2634 630 : for (unsigned lane = 0; lane < group_size; ++lane)
2635 904 : lperm.quick_push (std::make_pair
2636 452 : (chains[lane][i].code != chains[0][i].code, lane));
2637 356 : vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
2638 178 : (chains[0][i].code == code
2639 : ? op_stmt_info
2640 : : other_op_stmt_info),
2641 178 : (chains[0][i].code == code
2642 : ? other_op_stmt_info
2643 : : op_stmt_info),
2644 : lperm);
2645 : }
2646 : else
2647 : {
2648 560 : SLP_TREE_DEF_TYPE (child) = vect_internal_def;
2649 560 : SLP_TREE_VECTYPE (child) = vectype;
2650 560 : SLP_TREE_LANES (child) = group_size;
2651 560 : SLP_TREE_CHILDREN (child).quick_push (op0);
2652 560 : SLP_TREE_CHILDREN (child).quick_push (op1);
2653 560 : SLP_TREE_REPRESENTATIVE (child)
2654 1120 : = (chains[0][i].code == code
2655 560 : ? op_stmt_info : other_op_stmt_info);
2656 : }
2657 738 : children[i] = child;
2658 : }
2659 333 : *tree_size += this_tree_size + 1;
2660 333 : *max_nunits = this_max_nunits;
2661 1593 : while (!chains.is_empty ())
2662 904 : chains.pop ().release ();
2663 : return node;
2664 : }
2665 21083 : out:
2666 21106 : if (dump_enabled_p ())
2667 2809 : dump_printf_loc (MSG_NOTE, vect_location,
2668 : "failed to line up SLP graph by re-associating "
2669 : "operations in lanes%s\n",
2670 : !hard_fail ? " trying regular discovery" : "");
2671 21111 : while (!children.is_empty ())
2672 5 : vect_free_slp_tree (children.pop ());
2673 21269 : while (!chains.is_empty ())
2674 163 : chains.pop ().release ();
2675 : /* Hard-fail, otherwise we might run into quadratic processing of the
2676 : chains starting one stmt into the chain again. */
2677 21106 : if (hard_fail)
2678 : return NULL;
2679 : /* Fall thru to normal processing. */
2680 21439 : }
2681 :
2682 : /* Get at the operands, verifying they are compatible. */
2683 3309232 : vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
2684 3309232 : slp_oprnd_info oprnd_info;
2685 15980564 : FOR_EACH_VEC_ELT (stmts, i, stmt_info)
2686 : {
2687 25345022 : int res = vect_get_and_check_slp_defs (vinfo, vectype,
2688 12672511 : swap[i], skip_args,
2689 : stmts, i, &oprnds_info);
2690 12672511 : if (res != 0)
2691 541769 : matches[(res == -1) ? 0 : i] = false;
2692 12672511 : if (!matches[0])
2693 : break;
2694 : }
2695 15670595 : for (i = 0; i < group_size; ++i)
2696 12573771 : if (!matches[i])
2697 : {
2698 212408 : vect_free_oprnd_info (oprnds_info);
2699 212408 : return NULL;
2700 : }
2701 9290472 : swap = NULL;
2702 :
2703 9290472 : bool has_two_operators_perm = false;
2704 18580944 : auto_vec<unsigned> two_op_perm_indices[2];
2705 3096824 : vec<stmt_vec_info> two_op_scalar_stmts[2] = {vNULL, vNULL};
2706 :
2707 3111069 : if (two_operators && oprnds_info.length () == 2 && group_size > 2)
2708 : {
2709 3867 : unsigned idx = 0;
2710 3867 : hash_map<gimple *, unsigned> seen;
2711 3867 : vec<slp_oprnd_info> new_oprnds_info
2712 3867 : = vect_create_oprnd_info (1, group_size);
2713 3867 : bool success = true;
2714 :
2715 3867 : enum tree_code code = ERROR_MARK;
2716 3867 : if (oprnds_info[0]->def_stmts[0]
2717 3867 : && is_a<gassign *> (oprnds_info[0]->def_stmts[0]->stmt))
2718 3809 : code = gimple_assign_rhs_code (oprnds_info[0]->def_stmts[0]->stmt);
2719 3867 : basic_block bb = nullptr;
2720 :
2721 7470 : for (unsigned j = 0; j < group_size; ++j)
2722 : {
2723 17480 : FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2724 : {
2725 13877 : stmt_vec_info stmt_info = oprnd_info->def_stmts[j];
2726 13877 : if (!stmt_info
2727 13654 : || !is_a<gassign *> (stmt_info->stmt)
2728 13651 : || gimple_assign_rhs_code (stmt_info->stmt) != code
2729 24350 : || skip_args[i])
2730 : {
2731 : success = false;
2732 3408 : break;
2733 : }
2734 : /* Avoid mixing lanes with defs in different basic-blocks. */
2735 10473 : if (!bb)
2736 3985 : bb = gimple_bb (vect_orig_stmt (stmt_info)->stmt);
2737 8252 : else if (gimple_bb (vect_orig_stmt (stmt_info)->stmt) != bb)
2738 : {
2739 : success = false;
2740 : break;
2741 : }
2742 :
2743 10469 : bool exists;
2744 10469 : unsigned &stmt_idx
2745 10469 : = seen.get_or_insert (stmt_info->stmt, &exists);
2746 :
2747 10469 : if (!exists)
2748 : {
2749 9128 : new_oprnds_info[0]->def_stmts.safe_push (stmt_info);
2750 9128 : new_oprnds_info[0]->ops.safe_push (oprnd_info->ops[j]);
2751 9128 : stmt_idx = idx;
2752 9128 : idx++;
2753 : }
2754 :
2755 10469 : two_op_perm_indices[i].safe_push (stmt_idx);
2756 : }
2757 :
2758 7011 : if (!success)
2759 : break;
2760 : }
2761 :
2762 3867 : if (success && idx == group_size)
2763 : {
2764 94 : if (dump_enabled_p ())
2765 : {
2766 0 : dump_printf_loc (MSG_NOTE, vect_location,
2767 : "Replace two_operators operands:\n");
2768 :
2769 0 : FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2770 : {
2771 0 : dump_printf_loc (MSG_NOTE, vect_location,
2772 : "Operand %u:\n", i);
2773 0 : for (unsigned j = 0; j < group_size; j++)
2774 0 : dump_printf_loc (MSG_NOTE, vect_location, "\tstmt %u %G",
2775 0 : j, oprnd_info->def_stmts[j]->stmt);
2776 : }
2777 :
2778 0 : dump_printf_loc (MSG_NOTE, vect_location,
2779 : "With a single operand:\n");
2780 0 : for (unsigned j = 0; j < group_size; j++)
2781 0 : dump_printf_loc (MSG_NOTE, vect_location, "\tstmt %u %G",
2782 0 : j, new_oprnds_info[0]->def_stmts[j]->stmt);
2783 : }
2784 :
2785 94 : two_op_scalar_stmts[0].safe_splice (oprnds_info[0]->def_stmts);
2786 94 : two_op_scalar_stmts[1].safe_splice (oprnds_info[1]->def_stmts);
2787 :
2788 94 : new_oprnds_info[0]->first_op_type = oprnds_info[0]->first_op_type;
2789 94 : new_oprnds_info[0]->first_dt = oprnds_info[0]->first_dt;
2790 94 : new_oprnds_info[0]->any_pattern = oprnds_info[0]->any_pattern;
2791 94 : new_oprnds_info[0]->first_gs_p = oprnds_info[0]->first_gs_p;
2792 94 : new_oprnds_info[0]->first_gs_info = oprnds_info[0]->first_gs_info;
2793 :
2794 94 : vect_free_oprnd_info (oprnds_info);
2795 94 : oprnds_info = new_oprnds_info;
2796 94 : nops = 1;
2797 94 : has_two_operators_perm = true;
2798 : }
2799 : else
2800 3773 : vect_free_oprnd_info (new_oprnds_info);
2801 3867 : }
2802 :
2803 6193648 : auto_vec<slp_tree, 4> children;
2804 :
2805 3096824 : stmt_info = stmts[0];
2806 :
2807 3096824 : int reduc_idx = -1;
2808 3096824 : int gs_scale = 0;
2809 3096824 : tree gs_base = NULL_TREE;
2810 :
2811 : /* Create SLP_TREE nodes for the definition node/s. */
2812 7927696 : FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2813 : {
2814 4945877 : slp_tree child = nullptr;
2815 4945877 : unsigned int j;
2816 :
2817 : /* We're skipping certain operands from processing, for example
2818 : outer loop reduction initial defs. */
2819 4945877 : if (skip_args[i])
2820 : {
2821 479457 : children.safe_push (NULL);
2822 5310329 : continue;
2823 : }
2824 :
2825 4466420 : if (oprnd_info->first_dt == vect_uninitialized_def)
2826 : {
2827 : /* COND_EXPR have one too many eventually if the condition
2828 : is a SSA name. */
2829 0 : gcc_assert (i == 3 && nops == 4);
2830 0 : continue;
2831 : }
2832 :
2833 4466420 : if (oprnd_info->first_gs_p)
2834 : {
2835 22435 : gs_scale = oprnd_info->first_gs_info.scale;
2836 22435 : gs_base = oprnd_info->first_gs_info.base;
2837 : }
2838 :
2839 4466420 : if (is_a <bb_vec_info> (vinfo)
2840 1563827 : && oprnd_info->first_dt == vect_internal_def
2841 5277892 : && !oprnd_info->any_pattern)
2842 : {
2843 : /* For BB vectorization, if all defs are the same do not
2844 : bother to continue the build along the single-lane
2845 : graph but use a splat of the scalar value. */
2846 767743 : stmt_vec_info first_def = oprnd_info->def_stmts[0];
2847 823550 : for (j = 1; j < group_size; ++j)
2848 783512 : if (oprnd_info->def_stmts[j] != first_def)
2849 : break;
2850 767743 : if (j == group_size
2851 : /* But avoid doing this for loads where we may be
2852 : able to CSE things, unless the stmt is not
2853 : vectorizable. */
2854 767743 : && (!STMT_VINFO_VECTORIZABLE (first_def)
2855 49294 : || !gimple_vuse (first_def->stmt)))
2856 : {
2857 30745 : if (dump_enabled_p ())
2858 105 : dump_printf_loc (MSG_NOTE, vect_location,
2859 : "Using a splat of the uniform operand %G",
2860 : first_def->stmt);
2861 30745 : oprnd_info->first_dt = vect_external_def;
2862 : }
2863 : }
2864 :
2865 4466420 : if (oprnd_info->first_dt == vect_external_def
2866 4466420 : || oprnd_info->first_dt == vect_constant_def)
2867 : {
2868 1463619 : if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ())
2869 : {
2870 : tree op0;
2871 : tree uniform_val = op0 = oprnd_info->ops[0];
2872 : for (j = 1; j < oprnd_info->ops.length (); ++j)
2873 : if (oprnd_info->ops[j]
2874 : && !operand_equal_p (uniform_val, oprnd_info->ops[j]))
2875 : {
2876 : uniform_val = NULL_TREE;
2877 : break;
2878 : }
2879 : if (!uniform_val
2880 : && !can_duplicate_and_interleave_p (vinfo,
2881 : oprnd_info->ops.length (),
2882 : TREE_TYPE (op0)))
2883 : {
2884 : matches[j] = false;
2885 : if (dump_enabled_p ())
2886 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2887 : "Build SLP failed: invalid type of def "
2888 : "for variable-length SLP %T\n", op0);
2889 : goto fail;
2890 : }
2891 : }
2892 1463619 : slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
2893 1463619 : SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
2894 1463619 : oprnd_info->ops = vNULL;
2895 1463619 : children.safe_push (invnode);
2896 1463619 : continue;
2897 1463619 : }
2898 :
2899 : /* See which SLP operand a reduction chain continues on. We want
2900 : to chain even PHIs but not backedges. */
2901 3002801 : if (STMT_VINFO_REDUC_DEF (oprnd_info->def_stmts[0])
2902 3002801 : || STMT_VINFO_REDUC_IDX (oprnd_info->def_stmts[0]) != -1)
2903 : {
2904 232673 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
2905 : {
2906 756 : if (oprnd_info->first_dt == vect_double_reduction_def)
2907 378 : reduc_idx = i;
2908 : }
2909 231917 : else if (is_a <gphi *> (stmt_info->stmt)
2910 231917 : && gimple_phi_num_args
2911 99466 : (as_a <gphi *> (stmt_info->stmt)) != 1)
2912 : ;
2913 132834 : else if (STMT_VINFO_REDUC_IDX (stmt_info) == -1
2914 383 : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2915 : ;
2916 132834 : else if (reduc_idx == -1)
2917 124424 : reduc_idx = i;
2918 : else
2919 : /* For .COND_* reduction operations the else value can be the
2920 : same as one of the operation operands. The other def
2921 : stmts have been moved, so we can't check easily. Check
2922 : it's a call at least. */
2923 8410 : gcc_assert (is_a <gcall *> (stmt_info->stmt));
2924 : }
2925 :
2926 : /* When we have a masked load with uniform mask discover this
2927 : as a single-lane mask with a splat permute. This way we can
2928 : recognize this as a masked load-lane by stripping the splat. */
2929 3002801 : if (is_a <gcall *> (STMT_VINFO_STMT (stmt_info))
2930 57410 : && gimple_call_internal_p (STMT_VINFO_STMT (stmt_info),
2931 : IFN_MASK_LOAD)
2932 6075 : && STMT_VINFO_GROUPED_ACCESS (stmt_info)
2933 3002878 : && ! STMT_VINFO_SLP_VECT_ONLY (DR_GROUP_FIRST_ELEMENT (stmt_info)))
2934 : {
2935 35 : vec<stmt_vec_info> def_stmts2;
2936 35 : def_stmts2.create (1);
2937 35 : def_stmts2.quick_push (oprnd_info->def_stmts[0]);
2938 35 : child = vect_build_slp_tree (vinfo, def_stmts2, 1,
2939 : &this_max_nunits,
2940 : matches, limit,
2941 : &this_tree_size, bst_map);
2942 35 : if (child)
2943 : {
2944 35 : slp_tree pnode = vect_create_new_slp_node (1, VEC_PERM_EXPR);
2945 35 : SLP_TREE_VECTYPE (pnode) = SLP_TREE_VECTYPE (child);
2946 35 : SLP_TREE_LANES (pnode) = group_size;
2947 35 : SLP_TREE_SCALAR_STMTS (pnode).create (group_size);
2948 35 : SLP_TREE_LANE_PERMUTATION (pnode).create (group_size);
2949 210 : for (unsigned k = 0; k < group_size; ++k)
2950 : {
2951 175 : SLP_TREE_SCALAR_STMTS (pnode)
2952 175 : .quick_push (oprnd_info->def_stmts[0]);
2953 175 : SLP_TREE_LANE_PERMUTATION (pnode)
2954 175 : .quick_push (std::make_pair (0u, 0u));
2955 : }
2956 35 : SLP_TREE_CHILDREN (pnode).quick_push (child);
2957 35 : pnode->max_nunits = child->max_nunits;
2958 35 : children.safe_push (pnode);
2959 35 : oprnd_info->def_stmts = vNULL;
2960 35 : continue;
2961 35 : }
2962 : else
2963 0 : def_stmts2.release ();
2964 : }
2965 :
2966 3002766 : if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2967 : group_size, &this_max_nunits,
2968 : matches, limit,
2969 : &this_tree_size, bst_map)) != NULL)
2970 : {
2971 2521635 : oprnd_info->def_stmts = vNULL;
2972 2521635 : children.safe_push (child);
2973 2521635 : continue;
2974 : }
2975 :
2976 : /* If the SLP build for operand zero failed and operand zero
2977 : and one can be commutated try that for the scalar stmts
2978 : that failed the match. */
2979 481131 : if (i == 0
2980 : /* A first scalar stmt mismatch signals a fatal mismatch. */
2981 379660 : && matches[0]
2982 : /* ??? For COND_EXPRs we can swap the comparison operands
2983 : as well as the arms under some constraints. */
2984 179663 : && (nops == 2 || nops == 3)
2985 108956 : && oprnds_info[1]->first_dt == vect_internal_def
2986 59535 : && (is_gimple_assign (stmt_info->stmt)
2987 11509 : || is_gimple_call (stmt_info->stmt))
2988 : /* Swapping operands for reductions breaks assumptions later on. */
2989 529170 : && STMT_VINFO_REDUC_IDX (stmt_info) == -1)
2990 : {
2991 : /* See whether we can swap the matching or the non-matching
2992 : stmt operands. */
2993 : bool swap_not_matching = true;
2994 51964 : do
2995 : {
2996 7058485 : for (j = 0; j < group_size; ++j)
2997 : {
2998 7020523 : if (matches[j] != !swap_not_matching)
2999 70756 : continue;
3000 6949767 : stmt_vec_info stmt_info = stmts[j];
3001 : /* Verify if we can swap operands of this stmt. */
3002 6949767 : if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
3003 : {
3004 6949741 : tree_code code = gimple_assign_rhs_code (stmt);
3005 6949741 : if (! commutative_tree_code (code)
3006 6949741 : && ! commutative_ternary_tree_code (code))
3007 : {
3008 13978 : if (!swap_not_matching)
3009 6464 : goto fail;
3010 : swap_not_matching = false;
3011 : break;
3012 : }
3013 : }
3014 7006547 : else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
3015 : {
3016 26 : internal_fn fn = (gimple_call_internal_p (call)
3017 26 : ? gimple_call_internal_fn (call)
3018 : : IFN_LAST);
3019 26 : if ((! commutative_binary_fn_p (fn)
3020 26 : && ! commutative_ternary_fn_p (fn))
3021 28 : || first_commutative_argument (fn) != 0)
3022 : {
3023 24 : if (!swap_not_matching)
3024 12 : goto fail;
3025 : swap_not_matching = false;
3026 : break;
3027 : }
3028 : }
3029 : }
3030 : }
3031 45488 : while (j != group_size);
3032 :
3033 : /* Swap mismatched definition stmts. */
3034 37962 : if (dump_enabled_p ())
3035 351 : dump_printf_loc (MSG_NOTE, vect_location,
3036 : "Re-trying with swapped operands of stmts ");
3037 7036092 : for (j = 0; j < group_size; ++j)
3038 6998130 : if (matches[j] == !swap_not_matching)
3039 : {
3040 13871162 : std::swap (oprnds_info[0]->def_stmts[j],
3041 6935581 : oprnds_info[1]->def_stmts[j]);
3042 13871162 : std::swap (oprnds_info[0]->ops[j],
3043 6935581 : oprnds_info[1]->ops[j]);
3044 6935581 : if (dump_enabled_p ())
3045 956 : dump_printf (MSG_NOTE, "%d ", j);
3046 : }
3047 37962 : if (dump_enabled_p ())
3048 351 : dump_printf (MSG_NOTE, "\n");
3049 : /* After swapping some operands we lost track whether an
3050 : operand has any pattern defs so be conservative here. */
3051 72640 : if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
3052 3330 : oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
3053 : /* And try again with scratch 'matches' ... */
3054 37962 : bool *tem = XALLOCAVEC (bool, group_size);
3055 37962 : if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
3056 : group_size, &this_max_nunits,
3057 : tem, limit,
3058 : &this_tree_size, bst_map)) != NULL)
3059 : {
3060 6658 : oprnd_info->def_stmts = vNULL;
3061 6658 : children.safe_push (child);
3062 6658 : continue;
3063 : }
3064 : }
3065 474473 : fail:
3066 :
3067 : /* If the SLP build failed and we analyze a basic-block
3068 : simply treat nodes we fail to build as externally defined
3069 : (and thus build vectors from the scalar defs).
3070 : The cost model will reject outright expensive cases.
3071 : ??? This doesn't treat cases where permutation ultimatively
3072 : fails (or we don't try permutation below). Ideally we'd
3073 : even compute a permutation that will end up with the maximum
3074 : SLP tree size... */
3075 474473 : if (is_a <bb_vec_info> (vinfo)
3076 : /* ??? Rejecting patterns this way doesn't work. We'd have to
3077 : do extra work to cancel the pattern so the uses see the
3078 : scalar version. */
3079 394179 : && !is_pattern_stmt_p (stmt_info)
3080 844423 : && !oprnd_info->any_pattern)
3081 : {
3082 : /* But if there's a leading vector sized set of matching stmts
3083 : fail here so we can split the group. This matches the condition
3084 : vect_analyze_slp_instance uses. */
3085 : /* ??? We might want to split here and combine the results to support
3086 : multiple vector sizes better. */
3087 580534 : for (j = 0; j < group_size; ++j)
3088 580534 : if (!matches[j])
3089 : break;
3090 369689 : if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype))
3091 369660 : && vect_slp_can_convert_to_external (oprnd_info->def_stmts))
3092 : {
3093 359468 : if (dump_enabled_p ())
3094 555 : dump_printf_loc (MSG_NOTE, vect_location,
3095 : "Building vector operands from scalars\n");
3096 359468 : this_tree_size++;
3097 359468 : child = vect_create_new_slp_node (oprnd_info->ops);
3098 359468 : children.safe_push (child);
3099 359468 : oprnd_info->ops = vNULL;
3100 359468 : continue;
3101 : }
3102 : }
3103 :
3104 115005 : gcc_assert (child == NULL);
3105 131090 : FOR_EACH_VEC_ELT (children, j, child)
3106 16085 : if (child)
3107 16085 : vect_free_slp_tree (child);
3108 115005 : vect_free_oprnd_info (oprnds_info);
3109 115005 : return NULL;
3110 : }
3111 :
3112 2981819 : vect_free_oprnd_info (oprnds_info);
3113 :
3114 : /* If we have all children of a child built up from uniform scalars
3115 : or does more than one possibly expensive vector construction then
3116 : just throw that away, causing it built up from scalars.
3117 : The exception is the SLP node for the vector store. */
3118 2981819 : if (is_a <bb_vec_info> (vinfo)
3119 1090467 : && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
3120 : /* ??? Rejecting patterns this way doesn't work. We'd have to
3121 : do extra work to cancel the pattern so the uses see the
3122 : scalar version. */
3123 3414983 : && !is_pattern_stmt_p (stmt_info))
3124 : {
3125 : slp_tree child;
3126 : unsigned j;
3127 : bool all_uniform_p = true;
3128 : unsigned n_vector_builds = 0;
3129 1228609 : FOR_EACH_VEC_ELT (children, j, child)
3130 : {
3131 821293 : if (!child)
3132 : ;
3133 821293 : else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3134 : all_uniform_p = false;
3135 585963 : else if (!vect_slp_tree_uniform_p (child))
3136 : {
3137 446030 : all_uniform_p = false;
3138 446030 : if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
3139 411902 : n_vector_builds++;
3140 : }
3141 : }
3142 407316 : if (all_uniform_p
3143 407316 : || n_vector_builds > 1
3144 691662 : || (n_vector_builds == children.length ()
3145 30145 : && is_a <gphi *> (stmt_info->stmt)))
3146 : {
3147 : /* Roll back. */
3148 127783 : matches[0] = false;
3149 405909 : FOR_EACH_VEC_ELT (children, j, child)
3150 278126 : if (child)
3151 278126 : vect_free_slp_tree (child);
3152 :
3153 127783 : if (dump_enabled_p ())
3154 177 : dump_printf_loc (MSG_NOTE, vect_location,
3155 : "Building parent vector operands from "
3156 : "scalars instead\n");
3157 127783 : return NULL;
3158 : }
3159 : }
3160 :
3161 2854036 : *tree_size += this_tree_size + 1;
3162 2854036 : *max_nunits = this_max_nunits;
3163 :
3164 2854036 : if (two_operators)
3165 : {
3166 : /* ??? We'd likely want to either cache in bst_map sth like
3167 : { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
3168 : the true { a+b, a+b, a+b, a+b } ... but there we don't have
3169 : explicit stmts to put in so the keying on 'stmts' doesn't
3170 : work (but we have the same issue with nodes that use 'ops'). */
3171 :
3172 6844 : if (has_two_operators_perm)
3173 : {
3174 40 : slp_tree child = children[0];
3175 40 : children.truncate (0);
3176 120 : for (i = 0; i < 2; i++)
3177 : {
3178 80 : slp_tree pnode
3179 80 : = vect_create_new_slp_node (two_op_scalar_stmts[i], 2);
3180 80 : SLP_TREE_CODE (pnode) = VEC_PERM_EXPR;
3181 80 : SLP_TREE_VECTYPE (pnode) = vectype;
3182 80 : SLP_TREE_CHILDREN (pnode).quick_push (child);
3183 80 : SLP_TREE_CHILDREN (pnode).quick_push (child);
3184 80 : lane_permutation_t& perm = SLP_TREE_LANE_PERMUTATION (pnode);
3185 80 : children.safe_push (pnode);
3186 :
3187 656 : for (unsigned j = 0; j < stmts.length (); j++)
3188 576 : perm.safe_push (std::make_pair (0, two_op_perm_indices[i][j]));
3189 : }
3190 :
3191 40 : SLP_TREE_REF_COUNT (child) += 4;
3192 : }
3193 :
3194 6844 : slp_tree one = new _slp_tree;
3195 6844 : slp_tree two = new _slp_tree;
3196 6844 : SLP_TREE_DEF_TYPE (one) = vect_internal_def;
3197 6844 : SLP_TREE_DEF_TYPE (two) = vect_internal_def;
3198 6844 : SLP_TREE_VECTYPE (one) = vectype;
3199 6844 : SLP_TREE_VECTYPE (two) = vectype;
3200 6844 : SLP_TREE_CHILDREN (one).safe_splice (children);
3201 6844 : SLP_TREE_CHILDREN (two).safe_splice (children);
3202 6844 : slp_tree child;
3203 27378 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
3204 13690 : SLP_TREE_REF_COUNT (child)++;
3205 :
3206 : /* Here we record the original defs since this
3207 : node represents the final lane configuration. */
3208 6844 : node = vect_create_new_slp_node (node, stmts, 2);
3209 6844 : SLP_TREE_VECTYPE (node) = vectype;
3210 6844 : SLP_TREE_CODE (node) = VEC_PERM_EXPR;
3211 6844 : SLP_TREE_CHILDREN (node).quick_push (one);
3212 6844 : SLP_TREE_CHILDREN (node).quick_push (two);
3213 6844 : enum tree_code code0 = ERROR_MARK;
3214 6844 : enum tree_code ocode = ERROR_MARK;
3215 6844 : if (gassign *stmt = dyn_cast <gassign *> (stmts[0]->stmt))
3216 6842 : code0 = gimple_assign_rhs_code (stmt);
3217 6844 : stmt_vec_info ostmt_info;
3218 6844 : unsigned j = 0;
3219 25005 : FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
3220 : {
3221 18161 : int op = 0;
3222 18161 : if (gassign *ostmt = dyn_cast <gassign *> (ostmt_info->stmt))
3223 : {
3224 18157 : if (gimple_assign_rhs_code (ostmt) != code0)
3225 : {
3226 9113 : ocode = gimple_assign_rhs_code (ostmt);
3227 : op = 1;
3228 : j = i;
3229 : }
3230 : }
3231 : else
3232 : {
3233 8 : if (gimple_call_combined_fn (stmts[0]->stmt)
3234 4 : != gimple_call_combined_fn (ostmt_info->stmt))
3235 : {
3236 2 : op = 1;
3237 2 : j = i;
3238 : }
3239 : }
3240 18161 : SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (op, i));
3241 : }
3242 6844 : SLP_TREE_CODE (one) = code0;
3243 6844 : SLP_TREE_CODE (two) = ocode;
3244 6844 : SLP_TREE_LANES (one) = stmts.length ();
3245 6844 : SLP_TREE_LANES (two) = stmts.length ();
3246 6844 : SLP_TREE_REPRESENTATIVE (one) = stmts[0];
3247 6844 : SLP_TREE_REPRESENTATIVE (two) = stmts[j];
3248 :
3249 6844 : return node;
3250 : }
3251 :
3252 2847192 : node = vect_create_new_slp_node (node, stmts, nops);
3253 2847192 : SLP_TREE_VECTYPE (node) = vectype;
3254 2847192 : SLP_TREE_CHILDREN (node).splice (children);
3255 2847192 : SLP_TREE_GS_SCALE (node) = gs_scale;
3256 2847192 : SLP_TREE_GS_BASE (node) = gs_base;
3257 2847192 : if (reduc_idx != -1)
3258 : {
3259 116063 : gcc_assert (STMT_VINFO_REDUC_IDX (stmt_info) != -1
3260 : || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
3261 : || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def);
3262 116063 : SLP_TREE_REDUC_IDX (node) = reduc_idx;
3263 116063 : node->cycle_info.id = SLP_TREE_CHILDREN (node)[reduc_idx]->cycle_info.id;
3264 : }
3265 : /* When reaching the reduction PHI, create a vect_reduc_info. */
3266 2731129 : else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
3267 2731129 : || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3268 2731129 : && is_a <gphi *> (STMT_VINFO_STMT (stmt_info)))
3269 : {
3270 101394 : loop_vec_info loop_vinfo = as_a <loop_vec_info> (vinfo);
3271 101394 : gcc_assert (STMT_VINFO_REDUC_IDX (stmt_info) == -1);
3272 101394 : node->cycle_info.id = loop_vinfo->reduc_infos.length ();
3273 101394 : vect_reduc_info reduc_info = new vect_reduc_info_s ();
3274 101394 : loop_vinfo->reduc_infos.safe_push (reduc_info);
3275 101394 : stmt_vec_info reduc_phi = stmt_info;
3276 : /* ??? For double reductions vect_is_simple_reduction stores the
3277 : reduction type and code on the inner loop header PHI. */
3278 101394 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3279 : {
3280 378 : use_operand_p use_p;
3281 378 : gimple *use_stmt;
3282 378 : bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
3283 : &use_p, &use_stmt);
3284 378 : gcc_assert (res);
3285 378 : reduc_phi = loop_vinfo->lookup_stmt (use_stmt);
3286 : }
3287 101394 : VECT_REDUC_INFO_DEF_TYPE (reduc_info) = STMT_VINFO_DEF_TYPE (stmt_info);
3288 101394 : VECT_REDUC_INFO_TYPE (reduc_info) = STMT_VINFO_REDUC_TYPE (reduc_phi);
3289 101394 : VECT_REDUC_INFO_CODE (reduc_info) = STMT_VINFO_REDUC_CODE (reduc_phi);
3290 101394 : VECT_REDUC_INFO_FN (reduc_info) = IFN_LAST;
3291 : }
3292 : return node;
3293 9290472 : }
3294 :
/* Dump a single SLP tree NODE to the dump file selected by DUMP_KIND,
   anchored at location LOC.  Prints the node header (address, max_nunits,
   refcount, def kind), its vector type, cycle information, the operation
   template or VEC_PERM marker, the scalar stmts or scalar operands making
   up the lanes, any load/lane permutations and finally the child node
   pointers.  */

static void
vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
		     slp_tree node)
{
  unsigned i, j;
  slp_tree child;
  stmt_vec_info stmt_info;
  tree op;

  dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
  dump_user_location_t user_loc = loc.get_user_location ();
  /* Header line: annotate external/constant defs, print the node address
     as identity (children are referenced by address below).  */
  dump_printf_loc (metadata, user_loc,
		   "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
		   ", refcnt=%u)",
		   SLP_TREE_DEF_TYPE (node) == vect_external_def
		   ? " (external)"
		   : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
		      ? " (constant)"
		      : ""), (void *) node,
		   estimated_poly_value (node->max_nunits),
		   SLP_TREE_REF_COUNT (node));
  if (SLP_TREE_VECTYPE (node))
    dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
  dump_printf (metadata, "%s",
	       node->avoid_stlf_fail ? " (avoid-stlf-fail)" : "");
  /* Only mention cycle info when any of the two fields is set.  */
  if (node->cycle_info.id != -1 || node->cycle_info.reduc_idx != -1)
    dump_printf (metadata, " cycle %d, link %d", node->cycle_info.id,
		 node->cycle_info.reduc_idx);
  dump_printf (metadata, "\n");
  if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
    {
      if (SLP_TREE_PERMUTE_P (node))
	dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
      else
	dump_printf_loc (metadata, user_loc, "op template: %G",
			 SLP_TREE_REPRESENTATIVE (node)->stmt);
    }
  /* Either dump the per-lane scalar stmts (marking live lanes) or, when
     the node is built from scalar operands only, the operand list.
     Note the final 'else' binds to this outer 'if'.  */
  if (SLP_TREE_SCALAR_STMTS (node).exists ())
    FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
      if (stmt_info)
	dump_printf_loc (metadata, user_loc, "\t%sstmt %u %G",
			 SLP_TREE_LIVE_LANES (node).contains (i)
			 ? "[l*]" : (STMT_VINFO_LIVE_P (stmt_info)
				     ? "[l] " : ""),
			 i, stmt_info->stmt);
      else
	/* A lane without a scalar stmt (gap).  */
	dump_printf_loc (metadata, user_loc, "\tstmt %u ---\n", i);
  else
    {
      dump_printf_loc (metadata, user_loc, "\t{ ");
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
	dump_printf (metadata, "%T%s ", op,
		     i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
      dump_printf (metadata, "}\n");
    }
  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
    {
      dump_printf_loc (metadata, user_loc, "\tload permutation {");
      FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
	dump_printf (dump_kind, " %u", j);
      dump_printf (dump_kind, " }\n");
    }
  if (SLP_TREE_LANE_PERMUTATION (node).exists ())
    {
      dump_printf_loc (metadata, user_loc, "\tlane permutation {");
      /* Each entry is a (operand, lane) pair dumped as op[lane].  */
      for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
	dump_printf (dump_kind, " %u[%u]",
		     SLP_TREE_LANE_PERMUTATION (node)[i].first,
		     SLP_TREE_LANE_PERMUTATION (node)[i].second);
      dump_printf (dump_kind, " }%s\n",
		   node->ldst_lanes ? " (load-lanes)" : "");
    }
  if (SLP_TREE_CHILDREN (node).is_empty ())
    return;
  /* Children are identified by their node addresses.  */
  dump_printf_loc (metadata, user_loc, "\tchildren");
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    dump_printf (dump_kind, " %p", (void *)child);
  dump_printf (dump_kind, "%s\n",
	       node->ldst_lanes && !SLP_TREE_LANE_PERMUTATION (node).exists ()
	       ? " (store-lanes)" : "");
}
3378 :
/* Dump the SLP tree NODE to stderr; for use from the debugger.  */

DEBUG_FUNCTION void
debug (slp_tree node)
{
  /* The context redirects dump output (to stderr) and must stay live
     across the dump call below.  */
  debug_dump_context ctx;
  vect_print_slp_tree (MSG_NOTE,
		       dump_location_t::from_location_t (UNKNOWN_LOCATION),
		       node);
}
3387 :
/* Recursive helper for the dot producer below.  Emits into F a dot node
   declaration for NODE (labelled with the regular SLP tree dump), then
   one edge per child, then recurses on children not yet in VISITED.  */

static void
dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
{
  if (visited.add (node))
    return;

  /* The label text is produced by vect_print_slp_tree whose output is
     redirected into F via the debug_dump_context the caller set up.  */
  fprintf (f, "\"%p\" [label=\"", (void *)node);
  vect_print_slp_tree (MSG_NOTE,
		       dump_location_t::from_location_t (UNKNOWN_LOCATION),
		       node);
  fprintf (f, "\"];\n");


  /* Emit all edges of NODE before recursing so each node's edge list
     is contiguous in the output.  */
  for (slp_tree child : SLP_TREE_CHILDREN (node))
    fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);

  for (slp_tree child : SLP_TREE_CHILDREN (node))
    if (child)
      dot_slp_tree (f, child, visited);
}
3410 :
3411 : DEBUG_FUNCTION void
3412 0 : dot_slp_tree (const char *fname, slp_tree node)
3413 : {
3414 0 : FILE *f = fopen (fname, "w");
3415 0 : fprintf (f, "digraph {\n");
3416 0 : fflush (f);
3417 0 : {
3418 0 : debug_dump_context ctx (f);
3419 0 : hash_set<slp_tree> visited;
3420 0 : dot_slp_tree (f, node, visited);
3421 0 : }
3422 0 : fflush (f);
3423 0 : fprintf (f, "}\n");
3424 0 : fclose (f);
3425 0 : }
3426 :
3427 : DEBUG_FUNCTION void
3428 0 : dot_slp_tree (const char *fname, const vec<slp_instance> &slp_instances)
3429 : {
3430 0 : FILE *f = fopen (fname, "w");
3431 0 : fprintf (f, "digraph {\n");
3432 0 : fflush (f);
3433 0 : {
3434 0 : debug_dump_context ctx (f);
3435 0 : hash_set<slp_tree> visited;
3436 0 : for (auto inst : slp_instances)
3437 0 : dot_slp_tree (f, SLP_INSTANCE_TREE (inst), visited);
3438 0 : }
3439 0 : fflush (f);
3440 0 : fprintf (f, "}\n");
3441 0 : fclose (f);
3442 0 : }
3443 :
/* Dump the SLP graph reachable from NODE using flags specified in
   DUMP_KIND, visiting every shared node only once via VISITED.  */

static void
vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
		      slp_tree node, hash_set<slp_tree> &visited)
{
  unsigned i;
  slp_tree child;

  if (visited.add (node))
    return;

  vect_print_slp_tree (dump_kind, loc, node);

  /* NULL children (failed/elided operands) are simply skipped.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child)
      vect_print_slp_graph (dump_kind, loc, child, visited);
}
3462 :
3463 : static void
3464 46525 : vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
3465 : slp_tree entry)
3466 : {
3467 46525 : hash_set<slp_tree> visited;
3468 46525 : vect_print_slp_graph (dump_kind, loc, entry, visited);
3469 46525 : }
3470 :
/* Dump the whole SLP graph of INSTANCE to stderr; for use from the
   debugger.  */

DEBUG_FUNCTION void
debug (slp_instance instance)
{
  /* The context redirects dump output (to stderr) and must stay live
     across the dump call below.  */
  debug_dump_context ctx;
  vect_print_slp_graph (MSG_NOTE,
			dump_location_t::from_location_t (UNKNOWN_LOCATION),
			SLP_INSTANCE_TREE (instance));
}
3479 :
3480 :
3481 : /* Compute the set of scalar stmts participating in external nodes. */
3482 :
3483 : static void
3484 1554795 : vect_slp_gather_extern_scalar_stmts (vec_info *vinfo, slp_tree node,
3485 : hash_set<slp_tree> &visited,
3486 : hash_set<stmt_vec_info> &estmts)
3487 : {
3488 1554795 : if (visited.add (node))
3489 : return;
3490 :
3491 1511715 : if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
3492 : {
3493 : slp_tree child;
3494 : int i;
3495 1745788 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3496 877785 : if (child)
3497 877785 : vect_slp_gather_extern_scalar_stmts (vinfo, child, visited, estmts);
3498 : }
3499 : else
3500 3623780 : for (tree def : SLP_TREE_SCALAR_OPS (node))
3501 : {
3502 1694188 : stmt_vec_info def_stmt = vinfo->lookup_def (def);
3503 1694188 : if (def_stmt)
3504 336717 : estmts.add (def_stmt);
3505 : }
3506 : }
3507 :
/* Mark the original scalar stmt coverage of the vector SLP graph of VINFO
   with STMT_SLP_TYPE == pure_slp.  */

static void
vect_bb_slp_mark_stmts_vectorized (bb_vec_info vinfo)
{
  /* Gather the scalar stmt leafs of the SLP graph to stop the below DFS
     walk on.  */
  hash_set<stmt_vec_info> scalar_stmts_in_externs;
  hash_set<slp_tree> visited;
  for (auto instance : BB_VINFO_SLP_INSTANCES (vinfo))
    vect_slp_gather_extern_scalar_stmts (vinfo, SLP_INSTANCE_TREE (instance),
					 visited, scalar_stmts_in_externs);

  /* DFS walk scalar stmts to compute the vectorized coverage indicated
     by STMT_SLP_TYPE (stmt) == pure_slp on the original scalar (non-pattern)
     stmts.  */
  for (auto instance : BB_VINFO_SLP_INSTANCES (vinfo))
    {
      /* Root stmts (e.g. a CONSTRUCTOR's store) count as covered unless
	 they also feed an external node.  */
      for (auto stmt : SLP_INSTANCE_ROOT_STMTS (instance))
	if (!scalar_stmts_in_externs.contains (stmt))
	  STMT_SLP_TYPE (stmt) = pure_slp;
      /* Seed the worklist with the (original) stmts of the instance
	 root; pure_slp doubles as the "already visited" marker.  */
      auto_vec<stmt_vec_info> worklist;
      for (auto stmt : SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance)))
	{
	  stmt = vect_orig_stmt (stmt);
	  if (!scalar_stmts_in_externs.contains (stmt)
	      && STMT_SLP_TYPE (stmt) != pure_slp)
	    {
	      STMT_SLP_TYPE (stmt) = pure_slp;
	      worklist.safe_push (stmt);
	    }
	}
      while (!worklist.is_empty ())
	{
	  stmt_vec_info stmt = worklist.pop ();

	  /* Now walk relevant parts of the SSA use-def graph.  */
	  slp_oprnds child_ops (stmt);
	  for (unsigned i = 0; i < child_ops.num_slp_children; ++i)
	    {
	      tree op = child_ops.get_op_for_slp_child (stmt, i);
	      stmt_vec_info def = vinfo->lookup_def (op);
	      /* Stop at defs outside vinfo, at extern leafs and at
		 stmts already marked pure_slp.  */
	      if (def
		  && !scalar_stmts_in_externs.contains (def)
		  && STMT_SLP_TYPE (def) != pure_slp)
		{
		  STMT_SLP_TYPE (def) = pure_slp;
		  worklist.safe_push (def);
		}
	    }
	}
    }
}
3562 :
/* Mark the statements of the tree rooted at NODE as relevant (vect_used),
   visiting each shared node only once via VISITED.  */

static void
vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  /* Only internal nodes carry stmts; check this before touching
     VISITED so external nodes are never recorded there.  */
  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    if (stmt_info)
      {
	/* A stmt may only be unmarked or already marked used-in-scope.  */
	gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
		    || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
	STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
      }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child)
      vect_mark_slp_stmts_relevant (child, visited);
}
3590 :
3591 : static void
3592 776571 : vect_mark_slp_stmts_relevant (slp_tree node)
3593 : {
3594 776571 : hash_set<slp_tree> visited;
3595 776571 : vect_mark_slp_stmts_relevant (node, visited);
3596 776571 : }
3597 :
3598 :
/* Gather loads in the SLP graph NODE and populate the LOADS array,
   visiting shared nodes once via VISITED.  */

static void
vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
		       hash_set<slp_tree> &visited)
{
  if (!node || visited.add (node))
    return;

  /* External/constant nodes have no loads below them.  */
  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  /* Permute nodes carry no data-ref themselves; only record plain
     internal nodes whose representative is a read.  */
  if (!SLP_TREE_PERMUTE_P (node))
    {
      stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
      if (STMT_VINFO_DATA_REF (stmt_info)
	  && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
	loads.safe_push (node);
    }

  unsigned i;
  slp_tree child;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_gather_slp_loads (loads, child, visited);
}
3624 :
3625 :
3626 : /* Find the last store in SLP INSTANCE. */
3627 :
3628 : stmt_vec_info
3629 2718181 : vect_find_last_scalar_stmt_in_slp (slp_tree node)
3630 : {
3631 2718181 : stmt_vec_info last = NULL;
3632 2718181 : stmt_vec_info stmt_vinfo;
3633 :
3634 9911769 : for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
3635 7193588 : if (stmt_vinfo)
3636 : {
3637 7193588 : stmt_vinfo = vect_orig_stmt (stmt_vinfo);
3638 7193588 : last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
3639 : }
3640 :
3641 2718181 : return last;
3642 : }
3643 :
3644 : /* Find the first stmt in NODE. */
3645 :
3646 : stmt_vec_info
3647 530660 : vect_find_first_scalar_stmt_in_slp (slp_tree node)
3648 : {
3649 530660 : stmt_vec_info first = NULL;
3650 530660 : stmt_vec_info stmt_vinfo;
3651 :
3652 1798300 : for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
3653 1267640 : if (stmt_vinfo)
3654 : {
3655 1264946 : stmt_vinfo = vect_orig_stmt (stmt_vinfo);
3656 1264946 : if (!first
3657 1264946 : || get_later_stmt (stmt_vinfo, first) == first)
3658 : first = stmt_vinfo;
3659 : }
3660 :
3661 530660 : return first;
3662 : }
3663 :
/* Splits a group of stores, currently beginning at FIRST_VINFO, into
   two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
   (also containing the first GROUP1_SIZE stmts, since stores are
   consecutive), the second containing the remainder.
   Return the first stmt in the second group.  */

static stmt_vec_info
vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
{
  gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
  gcc_assert (group1_size > 0);
  int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
  gcc_assert (group2_size > 0);
  DR_GROUP_SIZE (first_vinfo) = group1_size;

  /* Walk the NEXT chain to the last element of the first group;
     consecutive stores have a gap of 1 between elements.  */
  stmt_vec_info stmt_info = first_vinfo;
  for (unsigned i = group1_size; i > 1; i--)
    {
      stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
      gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
    }
  /* STMT is now the last element of the first group.  */
  stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
  /* Cut the chain between the two groups.  */
  DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;

  /* Re-point every element of the second group at its new leader.  */
  DR_GROUP_SIZE (group2) = group2_size;
  for (stmt_info = group2; stmt_info;
       stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
    {
      DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
      gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
    }

  /* For the second group, the DR_GROUP_GAP is that before the original group,
     plus skipping over the first vector.  */
  DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;

  /* DR_GROUP_GAP of the first group now has to skip over the second group too.  */
  DR_GROUP_GAP (first_vinfo) += group2_size;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
		     group1_size, group2_size);

  return group2;
}
3710 :
3711 : /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
3712 : statements and a vector of NUNITS elements. */
3713 :
3714 : static poly_uint64
3715 4134296 : calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
3716 : {
3717 4134296 : return exact_div (common_multiple (nunits, group_size), group_size);
3718 : }
3719 :
3720 : /* Helper that checks to see if a node is a load node. */
3721 :
3722 : static inline bool
3723 108 : vect_is_slp_load_node (slp_tree root)
3724 : {
3725 108 : return (!SLP_TREE_PERMUTE_P (root)
3726 108 : && SLP_TREE_DEF_TYPE (root) == vect_internal_def
3727 102 : && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
3728 172 : && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root))));
3729 : }
3730 :
3731 :
/* Helper function of optimize_load_redistribution that performs the operation
   recursively.  Returns the replacement load node for ROOT, or NULL when
   ROOT was not converted (in which case its children have been processed
   in place).  LOAD_MAP caches the per-node outcome.  */

static slp_tree
optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
				vec_info *vinfo, unsigned int group_size,
				hash_map<slp_tree, slp_tree> *load_map,
				slp_tree root)
{
  /* Already decided for this node?  A cached NULL means "processed,
     not replaced".  */
  if (slp_tree *leader = load_map->get (root))
    return *leader;

  slp_tree node;
  unsigned i;

  /* For now, we don't know anything about externals so do not do anything.  */
  if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
    return NULL;
  else if (SLP_TREE_PERMUTE_P (root))
    {
      /* First convert this node into a load node and add it to the leaves
	 list and flatten the permute from a lane to a load one.  If it's
	 unneeded it will be elided later.  */
      vec<stmt_vec_info> stmts;
      stmts.create (SLP_TREE_LANES (root));
      lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
      for (unsigned j = 0; j < lane_perm.length (); j++)
	{
	  std::pair<unsigned, unsigned> perm = lane_perm[j];
	  node = SLP_TREE_CHILDREN (root)[perm.first];

	  /* Bail to the generic child walk if any permute input is not
	     a leaf load node.  */
	  if (!vect_is_slp_load_node (node)
	      || SLP_TREE_CHILDREN (node).exists ())
	    {
	      stmts.release ();
	      goto next;
	    }

	  stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
	}

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "converting stmts on permute node %p\n",
			 (void *) root);

      /* Rebuild the gathered stmts as a (CSEd) SLP load via the
	 regular discovery machinery.  */
      bool *matches = XALLOCAVEC (bool, group_size);
      poly_uint64 max_nunits = 1;
      unsigned tree_size = 0, limit = 1;
      node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
				  matches, &limit, &tree_size, bst_map);
      if (!node)
	stmts.release ();

      load_map->put (root, node);
      return node;
    }

next:
  /* Record "processed, not replaced" before recursing so cycles and
     re-visits terminate.  */
  load_map->put (root, NULL);

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
    {
      slp_tree value
	= optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
					  node);
      if (value)
	{
	  SLP_TREE_REF_COUNT (value)++;
	  SLP_TREE_CHILDREN (root)[i] = value;
	  /* ???  We know the original leafs of the replaced nodes will
	     be referenced by bst_map, only the permutes created by
	     pattern matching are not.  */
	  if (SLP_TREE_REF_COUNT (node) == 1)
	    load_map->remove (node);
	  vect_free_slp_tree (node);
	}
    }

  return NULL;
}
3813 :
3814 : /* Temporary workaround for loads not being CSEd during SLP build. This
3815 : function will traverse the SLP tree rooted in ROOT for INSTANCE and find
3816 : VEC_PERM nodes that blend vectors from multiple nodes that all read from the
3817 : same DR such that the final operation is equal to a permuted load. Such
3818 : NODES are then directly converted into LOADS themselves. The nodes are
3819 : CSEd using BST_MAP. */
3820 :
3821 : static void
3822 2851 : optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
3823 : vec_info *vinfo, unsigned int group_size,
3824 : hash_map<slp_tree, slp_tree> *load_map,
3825 : slp_tree root)
3826 : {
3827 2851 : slp_tree node;
3828 2851 : unsigned i;
3829 :
3830 6608 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3831 : {
3832 3757 : slp_tree value
3833 3757 : = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3834 : node);
3835 3757 : if (value)
3836 : {
3837 0 : SLP_TREE_REF_COUNT (value)++;
3838 0 : SLP_TREE_CHILDREN (root)[i] = value;
3839 : /* ??? We know the original leafs of the replaced nodes will
3840 : be referenced by bst_map, only the permutes created by
3841 : pattern matching are not. */
3842 0 : if (SLP_TREE_REF_COUNT (node) == 1)
3843 0 : load_map->remove (node);
3844 0 : vect_free_slp_tree (node);
3845 : }
3846 : }
3847 2851 : }
3848 :
/* Helper function of vect_match_slp_patterns.

   Attempts to match patterns against the slp tree rooted in REF_NODE using
   VINFO.  Patterns are matched in post-order traversal.

   If matching is successful the value in REF_NODE is updated and returned, if
   not then it is returned unchanged.  Returns whether any pattern matched
   anywhere in the (sub-)tree.  */

static bool
vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
			   slp_tree_to_load_perm_map_t *perm_cache,
			   slp_compat_nodes_map_t *compat_cache,
			   hash_set<slp_tree> *visited)
{
  unsigned i;
  slp_tree node = *ref_node;
  bool found_p = false;
  if (!node || visited->add (node))
    return false;

  /* Post-order: match children first so a parent pattern sees already
     rewritten children.  */
  slp_tree child;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
					  vinfo, perm_cache, compat_cache,
					  visited);

  /* Try every registered pattern on this node; more than one may
     apply.  */
  for (unsigned x = 0; x < num__slp_patterns; x++)
    {
      vect_pattern *pattern
	= slp_patterns[x] (perm_cache, compat_cache, ref_node);
      if (pattern)
	{
	  pattern->build (vinfo);
	  delete pattern;
	  found_p = true;
	}
    }

  return found_p;
}
3889 :
3890 : /* Applies pattern matching to the given SLP tree rooted in REF_NODE using
3891 : vec_info VINFO.
3892 :
3893 : The modified tree is returned. Patterns are tried in order and multiple
3894 : patterns may match. */
3895 :
3896 : static bool
3897 1539910 : vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
3898 : hash_set<slp_tree> *visited,
3899 : slp_tree_to_load_perm_map_t *perm_cache,
3900 : slp_compat_nodes_map_t *compat_cache)
3901 : {
3902 1539910 : DUMP_VECT_SCOPE ("vect_match_slp_patterns");
3903 1539910 : slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
3904 :
3905 1539910 : if (dump_enabled_p ())
3906 30427 : dump_printf_loc (MSG_NOTE, vect_location,
3907 : "Analyzing SLP tree %p for patterns\n",
3908 30427 : (void *) SLP_INSTANCE_TREE (instance));
3909 :
3910 1539910 : return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
3911 1539910 : visited);
3912 : }
3913 :
3914 : /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
3915 : vectorizing with VECTYPE that might be NULL. MASKED_P indicates whether
3916 : the stores are masked.
3917 : Return true if we could use IFN_STORE_LANES instead and if that appears
3918 : to be the better approach. */
3919 :
3920 : static bool
3921 5812 : vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
3922 : tree vectype, bool masked_p,
3923 : unsigned int group_size,
3924 : unsigned int new_group_size)
3925 : {
3926 5812 : if (!vectype)
3927 : {
3928 5812 : tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3929 5812 : vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
3930 : }
3931 5812 : if (!vectype)
3932 : return false;
3933 : /* Allow the split if one of the two new groups would operate on full
3934 : vectors *within* rather than across one scalar loop iteration.
3935 : This is purely a heuristic, but it should work well for group
3936 : sizes of 3 and 4, where the possible splits are:
3937 :
3938 : 3->2+1: OK if the vector has exactly two elements
3939 : 4->2+2: Likewise
3940 : 4->3+1: Less clear-cut. */
3941 5812 : if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
3942 3259 : || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
3943 2576 : return false;
3944 3236 : return vect_store_lanes_supported (vectype, group_size, masked_p) != IFN_LAST;
3945 : }
3946 :
3947 : /* Analyze an SLP instance starting from a group of grouped stores. Call
3948 : vect_build_slp_tree to build a tree of packed stmts if possible.
3949 : Return FALSE if it's impossible to SLP any stmt in the loop. */
3950 :
3951 : static bool
3952 : vect_analyze_slp_instance (vec_info *vinfo,
3953 : scalar_stmts_to_slp_tree_map_t *bst_map,
3954 : stmt_vec_info stmt_info, slp_instance_kind kind,
3955 : unsigned max_tree_size, unsigned *limit,
3956 : bool force_single_lane);
3957 :
/* Build an interleaving scheme for the store sources RHS_NODES from
   SCALAR_STMTS.  Returns a new store node over SCALAR_STMTS whose
   children are VEC_PERM nodes blending the corresponding operands of
   all RHS_NODES, each permute reduced to at most two inputs.  */

static slp_tree
vect_build_slp_store_interleaving (vec<slp_tree> &rhs_nodes,
				   vec<stmt_vec_info> &scalar_stmts,
				   poly_uint64 max_nunits)
{
  unsigned int group_size = scalar_stmts.length ();
  slp_tree node = vect_create_new_slp_node (scalar_stmts,
					    SLP_TREE_CHILDREN
					      (rhs_nodes[0]).length ());
  SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (rhs_nodes[0]);
  node->max_nunits = max_nunits;
  /* One permute child per operand position of the RHS nodes.  */
  for (unsigned l = 0;
       l < SLP_TREE_CHILDREN (rhs_nodes[0]).length (); ++l)
    {
      /* And a permute merging all RHS SLP trees.  */
      slp_tree perm = vect_create_new_slp_node (rhs_nodes.length (),
						VEC_PERM_EXPR);
      SLP_TREE_CHILDREN (node).quick_push (perm);
      SLP_TREE_LANE_PERMUTATION (perm).create (group_size);
      SLP_TREE_VECTYPE (perm) = SLP_TREE_VECTYPE (node);
      perm->max_nunits = max_nunits;
      SLP_TREE_LANES (perm) = group_size;
      /* ???  We should set this NULL but that's not expected.  */
      SLP_TREE_REPRESENTATIVE (perm)
	= SLP_TREE_REPRESENTATIVE (SLP_TREE_CHILDREN (rhs_nodes[0])[l]);
      /* Concatenate the lanes of all RHS nodes' l-th children, in
	 RHS order.  */
      for (unsigned j = 0; j < rhs_nodes.length (); ++j)
	{
	  SLP_TREE_CHILDREN (perm)
	    .quick_push (SLP_TREE_CHILDREN (rhs_nodes[j])[l]);
	  SLP_TREE_CHILDREN (rhs_nodes[j])[l]->refcnt++;
	  for (unsigned k = 0;
	       k < SLP_TREE_SCALAR_STMTS (rhs_nodes[j]).length (); ++k)
	    {
	      /* ???  We should populate SLP_TREE_SCALAR_STMTS
		 or SLP_TREE_SCALAR_OPS but then we might have
		 a mix of both in our children.  */
	      SLP_TREE_LANE_PERMUTATION (perm)
		.quick_push (std::make_pair (j, k));
	    }
	}

      /* Now we have a single permute node but we cannot code-generate
	 the case with more than two inputs.
	 Perform pairwise reduction, reducing the two inputs
	 with the least number of lanes to one and then repeat until
	 we end up with two inputs.  That scheme makes sure we end
	 up with permutes satisfying the restriction of requiring at
	 most two vector inputs to produce a single vector output
	 when the number of lanes is even.  */
      while (SLP_TREE_CHILDREN (perm).length () > 2)
	{
	  /* When we have three equal sized groups left the pairwise
	     reduction does not result in a scheme that avoids using
	     three vectors.  Instead merge the first two groups
	     to the final size with do-not-care elements (chosen
	     from the first group) and then merge with the third.
	     { A0, B0, x, A1, B1, x, ... }
	     -> { A0, B0, C0, A1, B1, C1, ... }
	     This handles group size of three (and at least
	     power-of-two multiples of that).  */
	  if (SLP_TREE_CHILDREN (perm).length () == 3
	      && (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[0])
		  == SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[1]))
	      && (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[0])
		  == SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[2])))
	    {
	      int ai = 0;
	      int bi = 1;
	      slp_tree a = SLP_TREE_CHILDREN (perm)[ai];
	      slp_tree b = SLP_TREE_CHILDREN (perm)[bi];
	      unsigned n = SLP_TREE_LANES (perm);

	      slp_tree permab = vect_create_new_slp_node (2, VEC_PERM_EXPR);
	      SLP_TREE_LANES (permab) = n;
	      SLP_TREE_LANE_PERMUTATION (permab).create (n);
	      SLP_TREE_VECTYPE (permab) = SLP_TREE_VECTYPE (perm);
	      permab->max_nunits = max_nunits;
	      /* ???  Should be NULL but that's not expected.  */
	      SLP_TREE_REPRESENTATIVE (permab) = SLP_TREE_REPRESENTATIVE (perm);
	      SLP_TREE_CHILDREN (permab).quick_push (a);
	      for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
		SLP_TREE_LANE_PERMUTATION (permab)
		  .quick_push (std::make_pair (0, k));
	      SLP_TREE_CHILDREN (permab).quick_push (b);
	      for (unsigned k = 0; k < SLP_TREE_LANES (b); ++k)
		SLP_TREE_LANE_PERMUTATION (permab)
		  .quick_push (std::make_pair (1, k));
	      /* Push the do-not-care lanes.  */
	      for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
		SLP_TREE_LANE_PERMUTATION (permab)
		  .quick_push (std::make_pair (0, k));

	      /* Put the merged node into 'perm', in place of a.  */
	      SLP_TREE_CHILDREN (perm)[ai] = permab;
	      /* Adjust the references to b in the permutation
		 of perm and to the later children which we'll
		 remove.  */
	      for (unsigned k = 0; k < SLP_TREE_LANES (perm); ++k)
		{
		  std::pair<unsigned, unsigned> &p
		    = SLP_TREE_LANE_PERMUTATION (perm)[k];
		  if (p.first == (unsigned) bi)
		    {
		      p.first = ai;
		      p.second += SLP_TREE_LANES (a);
		    }
		  else if (p.first > (unsigned) bi)
		    p.first--;
		}
	      SLP_TREE_CHILDREN (perm).ordered_remove (bi);
	      break;
	    }

	  /* Pick the two nodes with the least number of lanes,
	     prefer the earliest candidate and maintain ai < bi.  */
	  int ai = -1;
	  int bi = -1;
	  for (unsigned ci = 0; ci < SLP_TREE_CHILDREN (perm).length (); ++ci)
	    {
	      if (ai == -1)
		ai = ci;
	      else if (bi == -1)
		bi = ci;
	      else if ((SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ci])
			< SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ai]))
		       || (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ci])
			   < SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[bi])))
		{
		  if (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ai])
		      <= SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[bi]))
		    bi = ci;
		  else
		    {
		      ai = bi;
		      bi = ci;
		    }
		}
	    }

	  /* Produce a merge of nodes ai and bi.  */
	  slp_tree a = SLP_TREE_CHILDREN (perm)[ai];
	  slp_tree b = SLP_TREE_CHILDREN (perm)[bi];
	  unsigned n = SLP_TREE_LANES (a) + SLP_TREE_LANES (b);
	  slp_tree permab = vect_create_new_slp_node (2, VEC_PERM_EXPR);
	  SLP_TREE_LANES (permab) = n;
	  SLP_TREE_LANE_PERMUTATION (permab).create (n);
	  SLP_TREE_VECTYPE (permab) = SLP_TREE_VECTYPE (perm);
	  permab->max_nunits = max_nunits;
	  /* ???  Should be NULL but that's not expected.  */
	  SLP_TREE_REPRESENTATIVE (permab) = SLP_TREE_REPRESENTATIVE (perm);
	  SLP_TREE_CHILDREN (permab).quick_push (a);
	  for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
	    SLP_TREE_LANE_PERMUTATION (permab)
	      .quick_push (std::make_pair (0, k));
	  SLP_TREE_CHILDREN (permab).quick_push (b);
	  for (unsigned k = 0; k < SLP_TREE_LANES (b); ++k)
	    SLP_TREE_LANE_PERMUTATION (permab)
	      .quick_push (std::make_pair (1, k));

	  /* Put the merged node into 'perm', in place of a.  */
	  SLP_TREE_CHILDREN (perm)[ai] = permab;
	  /* Adjust the references to b in the permutation
	     of perm and to the later children which we'll
	     remove.  */
	  for (unsigned k = 0; k < SLP_TREE_LANES (perm); ++k)
	    {
	      std::pair<unsigned, unsigned> &p
		= SLP_TREE_LANE_PERMUTATION (perm)[k];
	      if (p.first == (unsigned) bi)
		{
		  p.first = ai;
		  p.second += SLP_TREE_LANES (a);
		}
	      else if (p.first > (unsigned) bi)
		p.first--;
	    }
	  SLP_TREE_CHILDREN (perm).ordered_remove (bi);
	}
    }

  return node;
}
4143 :
/* Analyze an SLP instance starting from SCALAR_STMTS which are a group
   of KIND.  Return true if successful.  SCALAR_STMTS is owned by this
   function, REMAIN and ROOT_STMT_INFOS ownership is transferred back to
   the caller upon failure.  */

static bool
vect_build_slp_instance (vec_info *vinfo,
			 slp_instance_kind kind,
			 vec<stmt_vec_info> &scalar_stmts,
			 vec<stmt_vec_info> &root_stmt_infos,
			 vec<tree> &remain,
			 unsigned max_tree_size, unsigned *limit,
			 scalar_stmts_to_slp_tree_map_t *bst_map,
			 bool force_single_lane)
{
  /* If there's no budget left bail out early.  */
  if (*limit == 0)
    {
      scalar_stmts.release ();
      return false;
    }

  if (kind == slp_inst_kind_ctor)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Analyzing vectorizable constructor: %G\n",
			 root_stmt_infos[0]->stmt);
    }
  else if (kind == slp_inst_kind_gcond)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Analyzing vectorizable control flow: %G",
			 root_stmt_infos[0]->stmt);
    }

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
		       "Starting SLP discovery for\n");
      for (unsigned i = 0; i < scalar_stmts.length (); ++i)
	dump_printf_loc (MSG_NOTE, vect_location,
			 "  %G", scalar_stmts[i]->stmt);
    }

  /* Build the tree for the SLP instance.  */
  unsigned int group_size = scalar_stmts.length ();
  bool *matches = XALLOCAVEC (bool, group_size);
  poly_uint64 max_nunits = 1;
  unsigned tree_size = 0;

  slp_tree node = NULL;
  if (group_size > 1 && force_single_lane)
    {
      /* Skip discovery and fake a mismatch at lane 1 so the caller
	 falls back to single-lane handling.  */
      matches[0] = true;
      matches[1] = false;
    }
  else
    node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
				&max_nunits, matches, limit,
				&tree_size, bst_map);
  if (node != NULL)
    {
      /* Calculate the unrolling factor based on the smallest type.  */
      poly_uint64 unrolling_factor
	= calculate_unrolling_factor (max_nunits, group_size);

      if (maybe_ne (unrolling_factor, 1U)
	  && is_a <bb_vec_info> (vinfo))
	{
	  /* BB vectorization cannot unroll; either fail outright or
	     signal the caller where to split via MATCHES.  */
	  unsigned HOST_WIDE_INT const_max_nunits;
	  if (!max_nunits.is_constant (&const_max_nunits)
	      || const_max_nunits > group_size)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: store group "
				 "size not a multiple of the vector size "
				 "in basic block SLP\n");
	      vect_free_slp_tree (node);
	      return false;
	    }
	  /* Fatal mismatch.  */
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "SLP discovery succeeded but node needs "
			     "splitting\n");
	  memset (matches, true, group_size);
	  matches[group_size / const_max_nunits * const_max_nunits] = false;
	  vect_free_slp_tree (node);
	}
      else
	{
	  /* Create a new SLP instance.  */
	  slp_instance new_instance = XNEW (class _slp_instance);
	  SLP_INSTANCE_TREE (new_instance) = node;
	  SLP_INSTANCE_LOADS (new_instance) = vNULL;
	  SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
	  SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
	  SLP_INSTANCE_KIND (new_instance) = kind;
	  new_instance->reduc_phis = NULL;
	  new_instance->cost_vec = vNULL;
	  new_instance->subgraph_entries = vNULL;

	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "SLP size %u vs. limit %u.\n",
			     tree_size, max_tree_size);

	  vinfo->slp_instances.safe_push (new_instance);

	  /* ???  We've replaced the old SLP_INSTANCE_GROUP_SIZE with
	     the number of scalar stmts in the root in a few places.
	     Verify that assumption holds.  */
	  gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
		      .length () == group_size);

	  if (dump_enabled_p ())
	    {
	      if (kind == slp_inst_kind_reduc_group)
		dump_printf_loc (MSG_NOTE, vect_location,
				 "SLP discovery of size %d reduction group "
				 "succeeded\n", group_size);
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "Final SLP tree for instance %p:\n",
			       (void *) new_instance);
	      vect_print_slp_graph (MSG_NOTE, vect_location,
				    SLP_INSTANCE_TREE (new_instance));
	    }

	  return true;
	}
    }
  /* Failed to SLP.  */

  /* While we arrive here even with slp_inst_kind_store we should only
     for group_size == 1.  The code to split store groups is only in
     vect_analyze_slp_instance now.  */
  gcc_assert (kind != slp_inst_kind_store || group_size == 1);

  /* Free the allocated memory.  */
  scalar_stmts.release ();

  /* Failed to SLP.  */
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
  return false;
}
4293 :
    4294             : /* Analyze an SLP instance starting from the start of a reduction chain.
    4295             :    Call vect_build_slp_tree to build a tree of packed stmts if possible.
    4296             :    Return FALSE if SLP build fails.  On success an SLP instance of kind
    4297             :    slp_inst_kind_reduc_chain is pushed to VINFO->slp_instances.  */
    4298             : static bool
    4299       63371 : vect_analyze_slp_reduc_chain (loop_vec_info vinfo,
    4300             :                               scalar_stmts_to_slp_tree_map_t *bst_map,
    4301             :                               stmt_vec_info scalar_stmt,
    4302             :                               unsigned max_tree_size, unsigned *limit)
    4303             : {
    4304       63371 :   vec<stmt_vec_info> scalar_stmts = vNULL;
    4305             :
    4306       63371 :   bool fail = false;
    4307             :   /* ??? We could leave operation code checking to SLP discovery.  */
    4308       63371 :   code_helper code = STMT_VINFO_REDUC_CODE (STMT_VINFO_REDUC_DEF
    4309             :                                               (vect_orig_stmt (scalar_stmt)));
    4310       63371 :   bool first = true;
    4311       63371 :   stmt_vec_info next_stmt = scalar_stmt;
                     :   /* Walk the use-def chain from SCALAR_STMT towards the reduction PHI
                     :      via the reduc-idx operand, collecting stmts performing the
                     :      reduction operation CODE.  Nop conversions at the tail (or right
                     :      before the PHI) are tolerated, any other mismatch fails.  */
    4312       71558 :   do
    4313             :     {
    4314       71558 :       stmt_vec_info stmt = next_stmt;
    4315       71558 :       gimple_match_op op;
    4316       71558 :       if (!gimple_extract_op (STMT_VINFO_STMT (stmt), &op))
    4317           0 :         gcc_unreachable ();
    4318      143116 :       tree reduc_def = gimple_arg (STMT_VINFO_STMT (stmt),
    4319       71558 :                                    STMT_VINFO_REDUC_IDX (stmt));
    4320       71558 :       next_stmt = vect_stmt_to_vectorize (vinfo->lookup_def (reduc_def));
    4321       71558 :       gcc_assert (is_a <gphi *> (STMT_VINFO_STMT (next_stmt))
    4322             :                   || STMT_VINFO_REDUC_IDX (next_stmt) != -1);
    4323       77102 :       if (!gimple_extract_op (STMT_VINFO_STMT (vect_orig_stmt (stmt)), &op))
    4324           0 :         gcc_unreachable ();
    4325       71558 :       if (CONVERT_EXPR_CODE_P (op.code)
    4326        3421 :           && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0]))
    4327       74967 :           && (first
    4328        1692 :               || is_a <gphi *> (STMT_VINFO_STMT (next_stmt))))
    4329             :         ;
    4330       68153 :       else if (code != op.code)
    4331             :         {
    4332        2553 :           fail = true;
    4333        2553 :           break;
    4334             :         }
    4335             :       else
    4336       65600 :         scalar_stmts.safe_push (stmt);
    4337       69005 :       first = false;
    4338             :     }
    4339       69005 :   while (!is_a <gphi *> (STMT_VINFO_STMT (next_stmt)));
    4340       63371 :   if (fail)
    4341        2553 :     return false;
    4342             :
    4343             :   /* Remember a stmt with the actual reduction operation.  */
    4344       60818 :   stmt_vec_info reduc_scalar_stmt = scalar_stmts[0];
    4345             :
    4346             :   /* When the SSA def chain through reduc-idx does not form a natural
    4347             :      reduction chain try to linearize an associative operation manually.  */
    4348       60818 :   if (scalar_stmts.length () == 1
    4349       58199 :       && code.is_tree_code ()
    4350       52141 :       && associative_tree_code ((tree_code)code)
    4351             :       /* We may not associate if a fold-left reduction is required.  */
    4352      112094 :       && !needs_fold_left_reduction_p (TREE_TYPE (gimple_get_lhs
    4353             :                                                     (reduc_scalar_stmt->stmt)),
    4354             :                                        code))
    4355             :     {
    4356       49154 :       auto_vec<chain_op_t> chain;
    4357       49154 :       auto_vec<std::pair<tree_code, gimple *> > worklist;
    4358       49154 :       gimple *op_stmt = NULL, *other_op_stmt = NULL;
    4359       49154 :       if (is_a <gassign *> (scalar_stmts[0]->stmt)
    4360             :           /* We cannot linearize an operation that vect_slp_linearize_chain
    4361             :              would not put on its worklist.  */
    4362       49154 :           && gimple_assign_rhs_code (scalar_stmts[0]->stmt) == (tree_code)code)
    4363             :         {
    4364       48507 :           vect_slp_linearize_chain (vinfo, worklist, chain, (tree_code)code,
    4365       48507 :                                     scalar_stmts[0]->stmt, op_stmt,
    4366             :                                     other_op_stmt,
    4367             :                                     NULL);
    4368             :
                     :           /* Re-collect the lanes from the linearized chain; the lane
                     :              continuing the reduction SSA cycle becomes TAIL, all
                     :              non-CODE or external/constant lanes fail the discovery.  */
    4369       48507 :           scalar_stmts.truncate (0);
    4370       48507 :           stmt_vec_info tail = NULL;
    4371      242780 :           for (auto el : chain)
    4372             :             {
    4373       97570 :               if (el.dt == vect_external_def
    4374       97570 :                   || el.dt == vect_constant_def
    4375       97570 :                   || el.code != (tree_code) code)
    4376             :                 {
    4377         311 :                   scalar_stmts.release ();
    4378         311 :                   return false;
    4379             :                 }
    4380       97259 :               stmt_vec_info stmt = vinfo->lookup_def (el.op);
    4381       97259 :               if (STMT_VINFO_REDUC_IDX (stmt) != -1
    4382       95715 :                   || STMT_VINFO_REDUC_DEF (stmt))
    4383             :                 {
    4384       48394 :                   gcc_assert (tail == NULL);
    4385       48394 :                   tail = stmt;
    4386       48394 :                   continue;
    4387             :                 }
    4388       48865 :               scalar_stmts.safe_push (stmt);
    4389             :             }
    4390       48196 :           gcc_assert (tail);
    4391             :         }
    4392             :
    4393             :       /* When this linearization didn't produce a chain see if stripping
    4394             :          a wrapping sign conversion produces one.  */
    4395       48843 :       if (scalar_stmts.length () == 1
    4396       48843 :           && (code == PLUS_EXPR || code == MULT_EXPR || code == BIT_IOR_EXPR
    4397             :               || code == BIT_AND_EXPR || code == BIT_XOR_EXPR))
    4398             :         {
    4399       47113 :           gimple *stmt = scalar_stmts[0]->stmt;
    4400       47113 :           if (!is_gimple_assign (stmt)
    4401       46057 :               || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt))
    4402        4498 :               || TREE_CODE (gimple_assign_rhs1 (stmt)) != SSA_NAME
    4403       51611 :               || !tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
    4404        4498 :                                          TREE_TYPE (gimple_assign_rhs1 (stmt))))
    4405             :             {
    4406       45361 :               scalar_stmts.release ();
    4407       45361 :               return false;
    4408             :             }
    4409        1752 :           stmt = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (stmt));
    4410        1752 :           if (!is_gimple_assign (stmt)
    4411        1752 :               || gimple_assign_rhs_code (stmt) != (tree_code)code)
    4412             :             {
    4413        1733 :               scalar_stmts.release ();
    4414        1733 :               return false;
    4415             :             }
                     :           /* Second linearization attempt, mirroring the one above but
                     :              starting below the stripped conversion.  */
    4416          19 :           chain.truncate (0);
    4417          19 :           vect_slp_linearize_chain (vinfo, worklist, chain, (tree_code)code,
    4418             :                                     stmt, op_stmt, other_op_stmt, NULL);
    4419             :
    4420          19 :           scalar_stmts.truncate (0);
    4421          19 :           stmt_vec_info tail = NULL;
    4422          93 :           for (auto el : chain)
    4423             :             {
    4424          44 :               if (el.dt == vect_external_def
    4425          44 :                   || el.dt == vect_constant_def
    4426          44 :                   || el.code != (tree_code) code)
    4427             :                 {
    4428           8 :                   scalar_stmts.release ();
    4429           8 :                   return false;
    4430             :                 }
    4431          36 :               stmt_vec_info stmt = vinfo->lookup_def (el.op);
    4432          36 :               if (STMT_VINFO_REDUC_IDX (stmt) != -1
    4433          36 :                   || STMT_VINFO_REDUC_DEF (stmt))
    4434             :                 {
    4435           0 :                   gcc_assert (tail == NULL);
    4436           0 :                   tail = stmt;
    4437           0 :                   continue;
    4438             :                 }
    4439          36 :               scalar_stmts.safe_push (stmt);
    4440             :             }
    4441             :           /* Unlike the above this does not include the reduction SSA
    4442             :              cycle.  */
    4443          11 :           gcc_assert (!tail);
    4444             :         }
    4445             :
    4446        1741 :       if (scalar_stmts.length () < 2)
    4447             :         {
    4448        1622 :           scalar_stmts.release ();
    4449        1622 :           return false;
    4450             :         }
    4451             :
    4452         119 :       if (dump_enabled_p ())
    4453             :         {
    4454          34 :           dump_printf_loc (MSG_NOTE, vect_location,
    4455             :                            "Starting SLP discovery of reduction chain for\n");
    4456         140 :           for (unsigned i = 0; i < scalar_stmts.length (); ++i)
    4457         212 :             dump_printf_loc (MSG_NOTE, vect_location,
    4458         106 :                              "  %G", scalar_stmts[i]->stmt);
    4459             :         }
    4460             :
    4461         119 :       unsigned int group_size = scalar_stmts.length ();
    4462         119 :       bool *matches = XALLOCAVEC (bool, group_size);
    4463         119 :       poly_uint64 max_nunits = 1;
    4464         119 :       unsigned tree_size = 0;
    4465         119 :       slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
    4466             :                                            &max_nunits, matches, limit,
    4467         119 :                                            &tree_size, bst_map);
    4468         119 :       if (!node)
    4469             :         {
    4470          47 :           scalar_stmts.release ();
    4471          47 :           return false;
    4472             :         }
    4473             :
                     :       /* Register a new reduction info for this cycle, seeded from the
                     :          reduction PHI (NEXT_STMT terminated the chain walk above).  */
    4474          72 :       unsigned cycle_id = vinfo->reduc_infos.length ();
    4475          72 :       vect_reduc_info reduc_info = new vect_reduc_info_s ();
    4476          72 :       vinfo->reduc_infos.safe_push (reduc_info);
    4477          72 :       VECT_REDUC_INFO_DEF_TYPE (reduc_info) = STMT_VINFO_DEF_TYPE (next_stmt);
    4478          72 :       VECT_REDUC_INFO_TYPE (reduc_info) = STMT_VINFO_REDUC_TYPE (next_stmt);
    4479          72 :       VECT_REDUC_INFO_CODE (reduc_info) = STMT_VINFO_REDUC_CODE (next_stmt);
    4480          72 :       VECT_REDUC_INFO_FN (reduc_info) = IFN_LAST;
    4481          72 :       reduc_info->is_reduc_chain = true;
    4482             :
    4483             :       /* Build the node for the PHI and possibly the conversions.  */
    4484          72 :       slp_tree phis = vect_create_new_slp_node (2, ERROR_MARK);
    4485          72 :       SLP_TREE_REPRESENTATIVE (phis) = next_stmt;
    4486          72 :       phis->cycle_info.id = cycle_id;
    4487          72 :       SLP_TREE_LANES (phis) = group_size;
    4488          72 :       if (reduc_scalar_stmt == scalar_stmt)
    4489          68 :         SLP_TREE_VECTYPE (phis) = SLP_TREE_VECTYPE (node);
    4490             :       else
    4491           4 :         SLP_TREE_VECTYPE (phis)
    4492           4 :           = signed_or_unsigned_type_for (TYPE_UNSIGNED
    4493             :                                            (TREE_TYPE (gimple_get_lhs
    4494             :                                                          (scalar_stmt->stmt))),
    4495             :                                          SLP_TREE_VECTYPE (node));
    4496             :       /* ??? vect_cse_slp_nodes cannot cope with cycles without any
    4497             :          SLP_TREE_SCALAR_STMTS.  */
    4498          72 :       SLP_TREE_SCALAR_STMTS (phis).create (group_size);
    4499         375 :       for (unsigned i = 0; i < group_size; ++i)
    4500         303 :         SLP_TREE_SCALAR_STMTS (phis).quick_push (next_stmt);
    4501             :
                     :       /* If SCALAR_STMT was a conversion insert a matching conversion
                     :          node between the PHI node and the reduction operation.  */
    4502          72 :       slp_tree op_input = phis;
    4503          72 :       if (reduc_scalar_stmt != scalar_stmt)
    4504             :         {
    4505           4 :           slp_tree conv = vect_create_new_slp_node (1, ERROR_MARK);
    4506           4 :           SLP_TREE_REPRESENTATIVE (conv)
    4507           4 :             = vinfo->lookup_def (gimple_arg (reduc_scalar_stmt->stmt,
    4508           4 :                                              STMT_VINFO_REDUC_IDX
    4509             :                                                (reduc_scalar_stmt)));
    4510           4 :           SLP_TREE_CHILDREN (conv).quick_push (phis);
    4511           4 :           conv->cycle_info.id = cycle_id;
    4512           4 :           SLP_TREE_REDUC_IDX (conv) = 0;
    4513           4 :           SLP_TREE_LANES (conv) = group_size;
    4514           4 :           SLP_TREE_VECTYPE (conv) = SLP_TREE_VECTYPE (node);
    4515           4 :           SLP_TREE_SCALAR_STMTS (conv) = vNULL;
    4516           4 :           op_input = conv;
    4517             :         }
    4518             :
    4519          72 :       slp_tree reduc = vect_create_new_slp_node (2, ERROR_MARK);
    4520          72 :       SLP_TREE_REPRESENTATIVE (reduc) = reduc_scalar_stmt;
    4521          72 :       SLP_TREE_CHILDREN (reduc).quick_push (op_input);
    4522          72 :       SLP_TREE_CHILDREN (reduc).quick_push (node);
    4523          72 :       reduc->cycle_info.id = cycle_id;
    4524          72 :       SLP_TREE_REDUC_IDX (reduc) = 0;
    4525          72 :       SLP_TREE_LANES (reduc) = group_size;
    4526          72 :       SLP_TREE_VECTYPE (reduc) = SLP_TREE_VECTYPE (node);
    4527             :       /* ??? For the reduction epilogue we need a live lane.  */
    4528          72 :       SLP_TREE_SCALAR_STMTS (reduc).create (group_size);
    4529          72 :       SLP_TREE_SCALAR_STMTS (reduc).quick_push (reduc_scalar_stmt);
    4530         303 :       for (unsigned i = 1; i < group_size; ++i)
    4531         231 :         SLP_TREE_SCALAR_STMTS (reduc).quick_push (NULL);
    4532             :
                     :       /* Mirror the conversion on the way out of the cycle as well.  */
    4533          72 :       if (reduc_scalar_stmt != scalar_stmt)
    4534             :         {
    4535           4 :           slp_tree conv = vect_create_new_slp_node (1, ERROR_MARK);
    4536           4 :           SLP_TREE_REPRESENTATIVE (conv) = scalar_stmt;
    4537           4 :           SLP_TREE_CHILDREN (conv).quick_push (reduc);
    4538           4 :           conv->cycle_info.id = cycle_id;
    4539           4 :           SLP_TREE_REDUC_IDX (conv) = 0;
    4540           4 :           SLP_TREE_LANES (conv) = group_size;
    4541           4 :           SLP_TREE_VECTYPE (conv) = SLP_TREE_VECTYPE (phis);
    4542             :           /* ??? For the reduction epilogue we need a live lane.  */
    4543           4 :           SLP_TREE_SCALAR_STMTS (conv).create (group_size);
    4544           4 :           SLP_TREE_SCALAR_STMTS (conv).quick_push (scalar_stmt);
    4545           8 :           for (unsigned i = 1; i < group_size; ++i)
    4546           4 :             SLP_TREE_SCALAR_STMTS (conv).quick_push (NULL);
    4547           4 :           reduc = conv;
    4548             :         }
    4549             :
                     :       /* Close the cycle: hook the reduction result into the latch-edge
                     :          child slot of the PHI node.  */
    4550          72 :       edge le = loop_latch_edge (LOOP_VINFO_LOOP (vinfo));
    4551          72 :       SLP_TREE_CHILDREN (phis).quick_push (NULL);
    4552          72 :       SLP_TREE_CHILDREN (phis).quick_push (NULL);
    4553          72 :       SLP_TREE_CHILDREN (phis)[le->dest_idx] = reduc;
    4554          72 :       SLP_TREE_REF_COUNT (reduc)++;
    4555             :
    4556             :       /* Create a new SLP instance.  */
    4557          72 :       slp_instance new_instance = XNEW (class _slp_instance);
    4558          72 :       SLP_INSTANCE_TREE (new_instance) = reduc;
    4559          72 :       SLP_INSTANCE_LOADS (new_instance) = vNULL;
    4560          72 :       SLP_INSTANCE_ROOT_STMTS (new_instance) = vNULL;
    4561          72 :       SLP_INSTANCE_REMAIN_DEFS (new_instance) = vNULL;
    4562          72 :       SLP_INSTANCE_KIND (new_instance) = slp_inst_kind_reduc_chain;
    4563          72 :       new_instance->reduc_phis = NULL;
    4564          72 :       new_instance->cost_vec = vNULL;
    4565          72 :       new_instance->subgraph_entries = vNULL;
    4566             :
    4567          72 :       vinfo->slp_instances.safe_push (new_instance);
    4568             :
    4569          72 :       if (dump_enabled_p ())
    4570             :         {
    4571          24 :           dump_printf_loc (MSG_NOTE, vect_location,
    4572             :                            "Final SLP tree for instance %p:\n",
    4573             :                            (void *) new_instance);
    4574          24 :           vect_print_slp_graph (MSG_NOTE, vect_location,
    4575             :                                 SLP_INSTANCE_TREE (new_instance));
    4576             :         }
    4577             :
    4578          72 :       return true;
    4579       49154 :     }
    4580             :
                     :   /* The natural chain walk found at most one stmt - not a chain.  */
    4581       11664 :   if (scalar_stmts.length () <= 1)
    4582             :     {
    4583        9045 :       scalar_stmts.release ();
    4584        9045 :       return false;
    4585             :     }
    4586             :
                     :   /* The walk above collected the stmts backwards from the reduction
                     :      result; reverse them into execution order.  */
    4587        2619 :   scalar_stmts.reverse ();
    4588        2619 :   stmt_vec_info reduc_phi_info = next_stmt;
    4589             :
    4590             :   /* Build the tree for the SLP instance.  */
    4591        2619 :   vec<stmt_vec_info> root_stmt_infos = vNULL;
    4592        2619 :   vec<tree> remain = vNULL;
    4593             :
    4594        2619 :   if (dump_enabled_p ())
    4595             :     {
    4596         180 :       dump_printf_loc (MSG_NOTE, vect_location,
    4597             :                        "Starting SLP discovery of reduction chain for\n");
    4598         966 :       for (unsigned i = 0; i < scalar_stmts.length (); ++i)
    4599        1572 :         dump_printf_loc (MSG_NOTE, vect_location,
    4600         786 :                          "  %G", scalar_stmts[i]->stmt);
    4601             :     }
    4602             :
    4603             :   /* Build the tree for the SLP instance.  */
    4604        2619 :   unsigned int group_size = scalar_stmts.length ();
    4605        2619 :   bool *matches = XALLOCAVEC (bool, group_size);
    4606        2619 :   poly_uint64 max_nunits = 1;
    4607        2619 :   unsigned tree_size = 0;
    4608             :
    4609             :   /* ??? We need this only for SLP discovery.  */
    4610       10014 :   for (unsigned i = 0; i < scalar_stmts.length (); ++i)
    4611        7395 :     REDUC_GROUP_FIRST_ELEMENT (scalar_stmts[i]) = scalar_stmts[0];
    4612             :
    4613        2619 :   slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
    4614             :                                        &max_nunits, matches, limit,
    4615        2619 :                                        &tree_size, bst_map);
    4616             :
                     :   /* Undo the temporary REDUC_GROUP_FIRST_ELEMENT marking from above.  */
    4617       10014 :   for (unsigned i = 0; i < scalar_stmts.length (); ++i)
    4618        7395 :     REDUC_GROUP_FIRST_ELEMENT (scalar_stmts[i]) = NULL;
    4619             :
    4620        2619 :   if (node != NULL)
    4621             :     {
    4622             :       /* Create a new SLP instance.  */
    4623        2286 :       slp_instance new_instance = XNEW (class _slp_instance);
    4624        2286 :       SLP_INSTANCE_TREE (new_instance) = node;
    4625        2286 :       SLP_INSTANCE_LOADS (new_instance) = vNULL;
    4626        2286 :       SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
    4627        2286 :       SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
    4628        2286 :       SLP_INSTANCE_KIND (new_instance) = slp_inst_kind_reduc_chain;
    4629        2286 :       new_instance->reduc_phis = NULL;
    4630        2286 :       new_instance->cost_vec = vNULL;
    4631        2286 :       new_instance->subgraph_entries = vNULL;
    4632             :
    4633        2286 :       vect_reduc_info reduc_info = info_for_reduction (vinfo, node);
    4634        2286 :       reduc_info->is_reduc_chain = true;
    4635             :
    4636        2286 :       if (dump_enabled_p ())
    4637         135 :         dump_printf_loc (MSG_NOTE, vect_location,
    4638             :                          "SLP size %u vs. limit %u.\n",
    4639             :                          tree_size, max_tree_size);
    4640             :
    4641             :       /* Fixup SLP reduction chains.  If this is a reduction chain with
    4642             :          a conversion in front amend the SLP tree with a node for that.  */
    4643        2286 :       gimple *scalar_def = STMT_VINFO_REDUC_DEF (reduc_phi_info)->stmt;
    4644        2286 :       if (is_gimple_assign (scalar_def)
    4645        2286 :           && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (scalar_def)))
    4646             :         {
    4647          43 :           stmt_vec_info conv_info = vect_stmt_to_vectorize
    4648          43 :                                       (STMT_VINFO_REDUC_DEF (reduc_phi_info));
    4649          43 :           scalar_stmts = vNULL;
    4650          43 :           scalar_stmts.create (group_size);
    4651         135 :           for (unsigned i = 0; i < group_size; ++i)
    4652          92 :             scalar_stmts.quick_push (conv_info);
    4653          43 :           slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
    4654          43 :           SLP_TREE_VECTYPE (conv)
    4655          43 :             = get_vectype_for_scalar_type (vinfo,
    4656          43 :                                            TREE_TYPE
    4657             :                                              (gimple_assign_lhs (scalar_def)),
    4658             :                                            group_size);
    4659          43 :           SLP_TREE_REDUC_IDX (conv) = 0;
    4660          43 :           conv->cycle_info.id = node->cycle_info.id;
    4661          43 :           SLP_TREE_CHILDREN (conv).quick_push (node);
    4662          43 :           SLP_INSTANCE_TREE (new_instance) = conv;
    4663             :         }
    4664             :       /* Fill the backedge child of the PHI SLP node.  The
    4665             :          general matching code cannot find it because the
    4666             :          scalar code does not reflect how we vectorize the
    4667             :          reduction.  */
    4668        2286 :       use_operand_p use_p;
    4669        2286 :       imm_use_iterator imm_iter;
    4670        2286 :       class loop *loop = LOOP_VINFO_LOOP (vinfo);
    4671       11023 :       FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
    4672             :                              gimple_get_lhs (scalar_def))
    4673             :         /* There are exactly two non-debug uses, the reduction
    4674             :            PHI and the loop-closed PHI node.  */
    4675        6451 :         if (!is_gimple_debug (USE_STMT (use_p))
    4676        6451 :             && gimple_bb (USE_STMT (use_p)) == loop->header)
    4677             :           {
    4678        2286 :             auto_vec<stmt_vec_info, 64> phis (group_size);
    4679        2286 :             stmt_vec_info phi_info = vinfo->lookup_stmt (USE_STMT (use_p));
    4680        8842 :             for (unsigned i = 0; i < group_size; ++i)
    4681        6556 :               phis.quick_push (phi_info);
    4682        2286 :             slp_tree *phi_node = bst_map->get (phis);
    4683        2286 :             unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
    4684        4572 :             SLP_TREE_CHILDREN (*phi_node)[dest_idx]
    4685        2286 :               = SLP_INSTANCE_TREE (new_instance);
    4686        2286 :             SLP_INSTANCE_TREE (new_instance)->refcnt++;
    4687        2286 :           }
    4688             :
    4689        2286 :       vinfo->slp_instances.safe_push (new_instance);
    4690             :
    4691             :       /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
    4692             :          the number of scalar stmts in the root in a few places.
    4693             :          Verify that assumption holds.  */
    4694        4572 :       gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
    4695             :                     .length () == group_size);
    4696             :
    4697        2286 :       if (dump_enabled_p ())
    4698             :         {
    4699         135 :           dump_printf_loc (MSG_NOTE, vect_location,
    4700             :                            "Final SLP tree for instance %p:\n",
    4701             :                            (void *) new_instance);
    4702         135 :           vect_print_slp_graph (MSG_NOTE, vect_location,
    4703             :                                 SLP_INSTANCE_TREE (new_instance));
    4704             :         }
    4705             :
    4706        2286 :       return true;
    4707             :     }
    4708             :
    4709             :   /* Failed to SLP.  */
    4710         333 :   scalar_stmts.release ();
    4711         333 :   if (dump_enabled_p ())
    4712          45 :     dump_printf_loc (MSG_NOTE, vect_location,
    4713             :                      "SLP discovery of reduction chain failed\n");
    4714             :   return false;
    4715             : }
4716 :
    4717             : /* Analyze a reduction starting from SCALAR_STMT.  First try to discover
    4718             :    a reduction chain, otherwise fall back to single-lane SLP discovery
                     :    of the reduction.  Return true if an SLP instance was created.  */
    4719             :
    4720             : static bool
    4721       89272 : vect_analyze_slp_reduction (loop_vec_info vinfo,
    4722             :                             stmt_vec_info scalar_stmt,
    4723             :                             unsigned max_tree_size, unsigned *limit,
    4724             :                             scalar_stmts_to_slp_tree_map_t *bst_map,
    4725             :                             bool force_single_lane)
    4726             : {
    4727       89272 :   slp_instance_kind kind = slp_inst_kind_reduc_group;
    4728             :
    4729             :   /* If there's no budget left bail out early.  */
    4730       89272 :   if (*limit == 0)
    4731             :     return false;
    4732             :
    4733             :   /* Try to gather a reduction chain.  */
    4734       89272 :   if (! force_single_lane
    4735       63641 :       && STMT_VINFO_DEF_TYPE (scalar_stmt) == vect_reduction_def
    4736      152643 :       && vect_analyze_slp_reduc_chain (vinfo, bst_map, scalar_stmt,
    4737             :                                        max_tree_size, limit))
    4738             :     return true;
    4739             :
                     :   /* No chain - do single-lane SLP discovery for the reduction stmt.  */
    4740       86914 :   vec<stmt_vec_info> scalar_stmts;
    4741       86914 :   scalar_stmts.create (1);
    4742       86914 :   scalar_stmts.quick_push (scalar_stmt);
    4743             :
    4744       86914 :   if (dump_enabled_p ())
    4745             :     {
    4746        3483 :       dump_printf_loc (MSG_NOTE, vect_location,
    4747             :                        "Starting SLP discovery for\n");
    4748        6966 :       for (unsigned i = 0; i < scalar_stmts.length (); ++i)
    4749        6966 :         dump_printf_loc (MSG_NOTE, vect_location,
    4750        3483 :                          "  %G", scalar_stmts[i]->stmt);
    4751             :     }
    4752             :
    4753             :   /* Build the tree for the SLP instance.  */
    4754       86914 :   unsigned int group_size = scalar_stmts.length ();
    4755       86914 :   bool *matches = XALLOCAVEC (bool, group_size);
    4756       86914 :   poly_uint64 max_nunits = 1;
    4757       86914 :   unsigned tree_size = 0;
    4758             :
    4759       86914 :   slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
    4760             :                                        &max_nunits, matches, limit,
    4761             :                                        &tree_size, bst_map);
    4762       86914 :   if (node != NULL)
    4763             :     {
    4764             :       /* Create a new SLP instance.  */
    4765       83906 :       slp_instance new_instance = XNEW (class _slp_instance);
    4766       83906 :       SLP_INSTANCE_TREE (new_instance) = node;
    4767       83906 :       SLP_INSTANCE_LOADS (new_instance) = vNULL;
    4768       83906 :       SLP_INSTANCE_ROOT_STMTS (new_instance) = vNULL;
    4769       83906 :       SLP_INSTANCE_REMAIN_DEFS (new_instance) = vNULL;
    4770       83906 :       SLP_INSTANCE_KIND (new_instance) = kind;
    4771       83906 :       new_instance->reduc_phis = NULL;
    4772       83906 :       new_instance->cost_vec = vNULL;
    4773       83906 :       new_instance->subgraph_entries = vNULL;
    4774             :
    4775       83906 :       if (dump_enabled_p ())
    4776        3363 :         dump_printf_loc (MSG_NOTE, vect_location,
    4777             :                          "SLP size %u vs. limit %u.\n",
    4778             :                          tree_size, max_tree_size);
    4779             :
    4780       83906 :       vinfo->slp_instances.safe_push (new_instance);
    4781             :
    4782             :       /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
    4783             :          the number of scalar stmts in the root in a few places.
    4784             :          Verify that assumption holds.  */
    4785      167812 :       gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
    4786             :                     .length () == group_size);
    4787             :
    4788       83906 :       if (dump_enabled_p ())
    4789             :         {
    4790        3363 :           dump_printf_loc (MSG_NOTE, vect_location,
    4791             :                            "Final SLP tree for instance %p:\n",
    4792             :                            (void *) new_instance);
    4793        3363 :           vect_print_slp_graph (MSG_NOTE, vect_location,
    4794             :                                 SLP_INSTANCE_TREE (new_instance));
    4795             :         }
    4796             :
    4797       83906 :       return true;
    4798             :     }
    4799             :   /* Failed to SLP.  */
    4800             :
    4801             :   /* Free the allocated memory.  */
    4802        3008 :   scalar_stmts.release ();
    4803             :
    4804             :   /* Failed to SLP.  */
    4805        3008 :   if (dump_enabled_p ())
    4806         120 :     dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
    4807             :   return false;
    4808             : }
4809 :
    4810             : /* Analyze a single SLP reduction group.  If successful add a SLP instance
    4811             :    for it and return true, otherwise return false and have *MATCHES
    4812             :    populated.  MATCHES may be NULL when the caller is not interested in
                     :    per-lane match information; it is then allocated locally.  */
    4813             :
    4814             : static bool
    4815       26943 : vect_analyze_slp_reduction_group (loop_vec_info loop_vinfo,
    4816             :                                   vec<stmt_vec_info> scalar_stmts,
    4817             :                                   scalar_stmts_to_slp_tree_map_t *bst_map,
    4818             :                                   unsigned max_tree_size, unsigned *limit,
    4819             :                                   bool *matches)
    4820             : {
    4821             :   /* Try to form a reduction group.  */
    4822       26943 :   unsigned int group_size = scalar_stmts.length ();
    4823       26943 :   if (!matches)
    4824       11199 :     matches = XALLOCAVEC (bool, group_size);
    4825       26943 :   poly_uint64 max_nunits = 1;
    4826       26943 :   unsigned tree_size = 0;
    4827       26943 :   slp_tree node = vect_build_slp_tree (loop_vinfo, scalar_stmts,
    4828             :                                        group_size,
    4829             :                                        &max_nunits, matches, limit,
    4830             :                                        &tree_size, bst_map);
    4831       26943 :   if (!node)
    4832             :     return false;
    4833             :
    4834             :   /* Create a new SLP instance.  */
    4835       12237 :   slp_instance new_instance = XNEW (class _slp_instance);
    4836       12237 :   SLP_INSTANCE_TREE (new_instance) = node;
    4837       12237 :   SLP_INSTANCE_LOADS (new_instance) = vNULL;
    4838       12237 :   SLP_INSTANCE_ROOT_STMTS (new_instance) = vNULL;
    4839       12237 :   SLP_INSTANCE_REMAIN_DEFS (new_instance) = vNULL;
    4840       12237 :   SLP_INSTANCE_KIND (new_instance) = slp_inst_kind_reduc_group;
    4841       12237 :   new_instance->reduc_phis = NULL;
    4842       12237 :   new_instance->cost_vec = vNULL;
    4843       12237 :   new_instance->subgraph_entries = vNULL;
    4844             :
    4845       12237 :   if (dump_enabled_p ())
    4846         571 :     dump_printf_loc (MSG_NOTE, vect_location,
    4847             :                      "SLP size %u vs. limit %u.\n",
    4848             :                      tree_size, max_tree_size);
    4849             :
    4850       12237 :   loop_vinfo->slp_instances.safe_push (new_instance);
    4851             :
    4852             :   /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
    4853             :      the number of scalar stmts in the root in a few places.
    4854             :      Verify that assumption holds.  */
    4855       24474 :   gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
    4856             :                 .length () == group_size);
    4857             :
    4858       12237 :   if (dump_enabled_p ())
    4859             :     {
    4860         571 :       dump_printf_loc (MSG_NOTE, vect_location,
    4861             :                        "SLP discovery of size %d reduction group "
    4862             :                        "succeeded\n", group_size);
    4863         571 :       dump_printf_loc (MSG_NOTE, vect_location,
    4864             :                        "Final SLP tree for instance %p:\n",
    4865             :                        (void *) new_instance);
    4866         571 :       vect_print_slp_graph (MSG_NOTE, vect_location,
    4867             :                             SLP_INSTANCE_TREE (new_instance));
    4868             :     }
    4869             :
    4870             :   return true;
    4871             : }
4872 :
    4873             : /* Analyze reductions in LOOP_VINFO and populate SLP instances
    4874             :    accordingly.  Returns false if something fails.  Candidate stmts are
                     :    first tried as one reduction group, then as matching sub-groups, and
                     :    finally as single-lane reductions.  */
    4875             :
    4876             : static bool
    4877      488479 : vect_analyze_slp_reductions (loop_vec_info loop_vinfo,
    4878             :                              unsigned max_tree_size, unsigned *limit,
    4879             :                              scalar_stmts_to_slp_tree_map_t *bst_map,
    4880             :                              bool force_single_lane)
    4881             : {
    4882      554153 :   if (loop_vinfo->reductions.is_empty ())
    4883             :     return true;
    4884             :
    4885             :   /* Collect reduction statements we can combine into
    4886             :      a SLP reduction.  */
    4887       73074 :   vec<stmt_vec_info> scalar_stmts;
    4888       73074 :   scalar_stmts.create (loop_vinfo->reductions.length ());
    4889      324316 :   for (auto next_info : loop_vinfo->reductions)
    4890             :     {
    4891      105094 :       next_info = vect_stmt_to_vectorize (next_info);
    4892      105094 :       if ((STMT_VINFO_RELEVANT_P (next_info)
    4893          14 :            || STMT_VINFO_LIVE_P (next_info))
    4894             :           /* ??? Make sure we didn't skip a conversion around a
    4895             :              reduction path.  In that case we'd have to reverse
    4896             :              engineer that conversion stmt following the chain using
    4897             :              reduc_idx and from the PHI using reduc_def.  */
    4898      105080 :           && (STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def
    4899      105080 :               || (STMT_VINFO_DEF_TYPE (next_info)
    4900             :                   == vect_double_reduction_def)))
    4901             :         {
    4902             :           /* Do not discover SLP reductions combining lane-reducing
    4903             :              ops, that will fail later.  */
    4904      105080 :           if (!force_single_lane
    4905      105080 :               && !lane_reducing_stmt_p (STMT_VINFO_STMT (next_info)))
    4906       78760 :             scalar_stmts.quick_push (next_info);
    4907             :           /* Do SLP discovery for single-lane reductions.  */
    4908       26320 :           else if (! vect_analyze_slp_reduction (loop_vinfo, next_info,
    4909             :                                                  max_tree_size, limit,
    4910             :                                                  bst_map,
    4911             :                                                  force_single_lane))
    4912             :             {
    4913           0 :               scalar_stmts.release ();
    4914           0 :               return false;
    4915             :             }
    4916             :         }
    4917             :     }
    4918             :
    4919       73074 :   if (scalar_stmts.length () > 1)
    4920             :     {
    4921             :       /* Try to form a reduction group.  */
    4922        4570 :       unsigned int group_size = scalar_stmts.length ();
    4923        4570 :       bool *matches = XALLOCAVEC (bool, group_size);
    4924        4570 :       if (vect_analyze_slp_reduction_group (loop_vinfo, scalar_stmts, bst_map,
    4925             :                                             max_tree_size, limit, matches))
    4926        4417 :         return true;
    4927             :
    4928             :       /* When analysis as a single SLP reduction group failed try to
    4929             :          form sub-groups by collecting matching lanes.  Do not recurse
    4930             :          that on failure (to limit compile-time costs), but recurse
    4931             :          for the initial non-matching parts.  Everything not covered
    4932             :          by a sub-group gets single-reduction treatment.  */
    4933        3494 :       vec<stmt_vec_info> cands = vNULL;
    4934       11352 :       while (matches[0])
    4935             :         {
                     :           /* Gather the lanes that matched in the last attempt.  */
    4936       11199 :           cands.truncate (0);
    4937       11199 :           cands.reserve (group_size, true);
    4938       88243 :           for (unsigned i = 0; i < group_size; ++i)
    4939       77044 :             if (matches[i])
    4940       19532 :               cands.quick_push (scalar_stmts[i]);
    4941             :
    4942             :           /* Try to form a reduction group.  */
    4943       11199 :           if (vect_analyze_slp_reduction_group (loop_vinfo, cands, bst_map,
    4944             :                                                 max_tree_size, limit, NULL))
    4945        7845 :             cands = vNULL;
    4946             :           else
    4947             :             {
    4948             :               /* Do SLP discovery for single-lane reductions.  */
    4949       20489 :               for (auto stmt_info : cands)
    4950       10452 :                 if (! vect_analyze_slp_reduction (loop_vinfo,
    4951             :                                                   vect_stmt_to_vectorize
    4952             :                                                     (stmt_info),
    4953             :                                                   max_tree_size, limit,
    4954             :                                                   bst_map, force_single_lane))
    4955             :                   {
    4956          25 :                     scalar_stmts.release ();
    4957          25 :                     cands.release ();
    4958          25 :                     return false;
    4959             :                   }
    4960             :             }
    4961             :           /* Remove the handled stmts from scalar_stmts and try again,
    4962             :              possibly repeating the above with updated matches[].  */
    4963             :           unsigned j = 0;
    4964       88148 :           for (unsigned i = 0; i < group_size; ++i)
    4965       76974 :             if (!matches[i])
    4966             :               {
    4967       57482 :                 scalar_stmts[j] = scalar_stmts[i];
    4968       57482 :                 ++j;
    4969             :               }
    4970       11174 :           scalar_stmts.truncate (j);
    4971       11174 :           group_size = scalar_stmts.length ();
    4972       11174 :           if (vect_analyze_slp_reduction_group (loop_vinfo, scalar_stmts,
    4973             :                                                 bst_map, max_tree_size, limit,
    4974             :                                                 matches))
    4975             :             return true;
    4976             :         }
    4977             :     }
    4978             :   /* Do SLP discovery for single-lane reductions.  */
    4979      255488 :   for (auto stmt_info : scalar_stmts)
    4980       52500 :     if (! vect_analyze_slp_reduction (loop_vinfo,
    4981             :                                       vect_stmt_to_vectorize (stmt_info),
    4982             :                                       max_tree_size, limit,
    4983             :                                       bst_map, force_single_lane))
    4984             :       {
    4985        2983 :         scalar_stmts.release ();
    4986        2983 :         return false;
    4987             :       }
    4988             :
    4989       65674 :   scalar_stmts.release ();
    4990       65674 :   return true;
    4991             : }
4992 :
4993 : /* Analyze an SLP instance starting from a group of grouped stores. Call
4994 : vect_build_slp_tree to build a tree of packed stmts if possible.
4995 : Return FALSE if it's impossible to SLP any stmt in the group. */
4996 :
4997 : static bool
4998 1089839 : vect_analyze_slp_instance (vec_info *vinfo,
4999 : scalar_stmts_to_slp_tree_map_t *bst_map,
5000 : stmt_vec_info stmt_info,
5001 : slp_instance_kind kind,
5002 : unsigned max_tree_size, unsigned *limit,
5003 : bool force_single_lane)
5004 : {
5005 1089839 : vec<stmt_vec_info> scalar_stmts;
5006 :
5007 1089839 : if (is_a <bb_vec_info> (vinfo))
5008 1060690 : vect_location = stmt_info->stmt;
5009 :
5010 1089839 : gcc_assert (kind == slp_inst_kind_store);
5011 :
5012 : /* Collect the stores and store them in scalar_stmts. */
5013 1089839 : scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
5014 1089839 : stmt_vec_info next_info = stmt_info;
5015 5419690 : while (next_info)
5016 : {
5017 3240012 : scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
5018 3240012 : next_info = DR_GROUP_NEXT_ELEMENT (next_info);
5019 : }
5020 :
5021 1089839 : vec<stmt_vec_info> root_stmt_infos = vNULL;
5022 1089839 : vec<tree> remain = vNULL;
5023 :
5024 : /* Build the tree for the SLP instance. */
5025 :
5026 : /* If there's no budget left bail out early. */
5027 1089839 : if (*limit == 0)
5028 : return false;
5029 :
5030 1089816 : if (dump_enabled_p ())
5031 : {
5032 4132 : dump_printf_loc (MSG_NOTE, vect_location,
5033 : "Starting SLP discovery for\n");
5034 23834 : for (unsigned i = 0; i < scalar_stmts.length (); ++i)
5035 39404 : dump_printf_loc (MSG_NOTE, vect_location,
5036 19702 : " %G", scalar_stmts[i]->stmt);
5037 : }
5038 :
5039 : /* Build the tree for the SLP instance. */
5040 1089816 : unsigned int group_size = scalar_stmts.length ();
5041 1089816 : bool *matches = XALLOCAVEC (bool, group_size);
5042 1089816 : poly_uint64 max_nunits = 1;
5043 1089816 : unsigned tree_size = 0;
5044 1089816 : unsigned i;
5045 :
5046 1089816 : slp_tree node = NULL;
5047 1089816 : if (group_size > 1 && force_single_lane)
5048 : {
5049 1690 : matches[0] = true;
5050 1690 : matches[1] = false;
5051 : }
5052 : else
5053 1088126 : node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
5054 : &max_nunits, matches, limit,
5055 : &tree_size, bst_map);
5056 1089816 : if (node != NULL)
5057 : {
5058 : /* Calculate the unrolling factor based on the smallest type. */
5059 678468 : poly_uint64 unrolling_factor
5060 678468 : = calculate_unrolling_factor (max_nunits, group_size);
5061 :
5062 678468 : if (maybe_ne (unrolling_factor, 1U)
5063 678468 : && is_a <bb_vec_info> (vinfo))
5064 : {
5065 0 : unsigned HOST_WIDE_INT const_max_nunits;
5066 0 : if (!max_nunits.is_constant (&const_max_nunits)
5067 0 : || const_max_nunits > group_size)
5068 : {
5069 0 : if (dump_enabled_p ())
5070 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5071 : "Build SLP failed: store group "
5072 : "size not a multiple of the vector size "
5073 : "in basic block SLP\n");
5074 0 : vect_free_slp_tree (node);
5075 0 : return false;
5076 : }
5077 : /* Fatal mismatch. */
5078 0 : if (dump_enabled_p ())
5079 0 : dump_printf_loc (MSG_NOTE, vect_location,
5080 : "SLP discovery succeeded but node needs "
5081 : "splitting\n");
5082 0 : memset (matches, true, group_size);
5083 0 : matches[group_size / const_max_nunits * const_max_nunits] = false;
5084 0 : vect_free_slp_tree (node);
5085 : }
5086 : else
5087 : {
5088 : /* Create a new SLP instance. */
5089 678468 : slp_instance new_instance = XNEW (class _slp_instance);
5090 678468 : SLP_INSTANCE_TREE (new_instance) = node;
5091 678468 : SLP_INSTANCE_LOADS (new_instance) = vNULL;
5092 678468 : SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
5093 678468 : SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
5094 678468 : SLP_INSTANCE_KIND (new_instance) = kind;
5095 678468 : new_instance->reduc_phis = NULL;
5096 678468 : new_instance->cost_vec = vNULL;
5097 678468 : new_instance->subgraph_entries = vNULL;
5098 :
5099 678468 : if (dump_enabled_p ())
5100 3148 : dump_printf_loc (MSG_NOTE, vect_location,
5101 : "SLP size %u vs. limit %u.\n",
5102 : tree_size, max_tree_size);
5103 :
5104 678468 : vinfo->slp_instances.safe_push (new_instance);
5105 :
5106 : /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
5107 : the number of scalar stmts in the root in a few places.
5108 : Verify that assumption holds. */
5109 1356936 : gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
5110 : .length () == group_size);
5111 :
5112 678468 : if (dump_enabled_p ())
5113 : {
5114 3148 : dump_printf_loc (MSG_NOTE, vect_location,
5115 : "Final SLP tree for instance %p:\n",
5116 : (void *) new_instance);
5117 3148 : vect_print_slp_graph (MSG_NOTE, vect_location,
5118 : SLP_INSTANCE_TREE (new_instance));
5119 : }
5120 :
5121 678468 : return true;
5122 : }
5123 : }
5124 : /* Failed to SLP. */
5125 :
5126 : /* Try to break the group up into pieces. */
5127 411348 : if (*limit > 0 && kind == slp_inst_kind_store)
5128 : {
5129 : /* ??? We could delay all the actual splitting of store-groups
5130 : until after SLP discovery of the original group completed.
5131 : Then we can recurse to vect_build_slp_instance directly. */
5132 1076566 : for (i = 0; i < group_size; i++)
5133 1076566 : if (!matches[i])
5134 : break;
5135 :
5136 : /* For basic block SLP, try to break the group up into multiples of
5137 : a vector size. */
5138 411347 : if (is_a <bb_vec_info> (vinfo)
5139 411347 : && (i > 1 && i < group_size))
5140 : {
5141 : /* Free the allocated memory. */
5142 153652 : scalar_stmts.release ();
5143 :
5144 153652 : tree scalar_type
5145 153652 : = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
5146 307304 : tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
5147 153652 : 1 << floor_log2 (i));
5148 153652 : unsigned HOST_WIDE_INT const_nunits;
5149 153652 : if (vectype
5150 153652 : && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
5151 : {
5152 : /* Split into two groups at the first vector boundary. */
5153 153652 : gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
5154 153652 : unsigned group1_size = i & ~(const_nunits - 1);
5155 :
5156 153652 : if (dump_enabled_p ())
5157 59 : dump_printf_loc (MSG_NOTE, vect_location,
5158 : "Splitting SLP group at stmt %u\n", i);
5159 153652 : stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
5160 : group1_size);
5161 153652 : bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
5162 : kind, max_tree_size,
5163 : limit, false);
5164 : /* Split the rest at the failure point and possibly
5165 : re-analyze the remaining matching part if it has
5166 : at least two lanes. */
5167 153652 : if (group1_size < i
5168 5271 : && (i + 1 < group_size
5169 2894 : || i - group1_size > 1))
5170 : {
5171 2409 : stmt_vec_info rest2 = rest;
5172 2409 : rest = vect_split_slp_store_group (rest, i - group1_size);
5173 2409 : if (i - group1_size > 1)
5174 61 : res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
5175 : kind, max_tree_size,
5176 : limit, false);
5177 : }
5178 : /* Re-analyze the non-matching tail if it has at least
5179 : two lanes. */
5180 153652 : if (i + 1 < group_size)
5181 21817 : res |= vect_analyze_slp_instance (vinfo, bst_map,
5182 : rest, kind, max_tree_size,
5183 : limit, false);
5184 153652 : return res;
5185 : }
5186 : }
5187 :
5188 : /* For loop vectorization split the RHS into arbitrary pieces of
5189 : size >= 1. */
5190 257695 : else if (is_a <loop_vec_info> (vinfo)
5191 257695 : && (group_size != 1 && i < group_size))
5192 : {
5193 7973 : gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
5194 28 : bool masked_p = call
5195 28 : && gimple_call_internal_p (call)
5196 28 : && internal_fn_mask_index (gimple_call_internal_fn (call)) != -1;
5197 : /* There are targets that cannot do even/odd interleaving schemes
5198 : so they absolutely need to use load/store-lanes. For now
5199 : force single-lane SLP for them - they would be happy with
5200 : uniform power-of-two lanes (but depending on element size),
5201 : but even if we can use 'i' as indicator we would need to
5202 : backtrack when later lanes fail to discover with the same
5203 : granularity. We cannot turn any of strided or scatter store
5204 : into store-lanes. */
5205 : /* ??? If this is not in sync with what get_load_store_type
5206 : later decides the SLP representation is not good for other
5207 : store vectorization methods. */
5208 7973 : bool want_store_lanes
5209 7973 : = (! STMT_VINFO_GATHER_SCATTER_P (stmt_info)
5210 7973 : && ! STMT_VINFO_STRIDED_P (stmt_info)
5211 5896 : && ! STMT_VINFO_SLP_VECT_ONLY (stmt_info)
5212 5892 : && compare_step_with_zero (vinfo, stmt_info) > 0
5213 13785 : && vect_slp_prefer_store_lanes_p (vinfo, stmt_info, NULL_TREE,
5214 15946 : masked_p, group_size, i));
5215 7973 : if (want_store_lanes || force_single_lane)
5216 : i = 1;
5217 :
5218 : /* A fatal discovery fail doesn't always mean single-lane SLP
5219 : isn't a possibility, so try. */
5220 6283 : if (i == 0)
5221 : i = 1;
5222 :
5223 7973 : if (dump_enabled_p ())
5224 883 : dump_printf_loc (MSG_NOTE, vect_location,
5225 : "Splitting SLP group at stmt %u\n", i);
5226 :
5227 : /* Analyze the stored values and pinch them together with
5228 : a permute node so we can preserve the whole store group. */
5229 7973 : auto_vec<slp_tree> rhs_nodes;
5230 7973 : poly_uint64 max_nunits = 1;
5231 :
5232 7973 : unsigned int rhs_common_nlanes = 0;
5233 7973 : unsigned int start = 0, end = i;
5234 36019 : while (start < group_size)
5235 : {
5236 28307 : gcc_assert (end - start >= 1);
5237 28307 : vec<stmt_vec_info> substmts;
5238 28307 : substmts.create (end - start);
5239 88748 : for (unsigned j = start; j < end; ++j)
5240 60441 : substmts.quick_push (scalar_stmts[j]);
5241 28307 : max_nunits = 1;
5242 28307 : node = vect_build_slp_tree (vinfo, substmts, end - start,
5243 : &max_nunits,
5244 : matches, limit, &tree_size, bst_map);
5245 28307 : if (node)
5246 : {
5247 22518 : rhs_nodes.safe_push (node);
5248 22518 : vect_update_max_nunits (&max_nunits, node->max_nunits);
5249 22518 : if (start == 0)
5250 7718 : rhs_common_nlanes = SLP_TREE_LANES (node);
5251 14800 : else if (rhs_common_nlanes != SLP_TREE_LANES (node))
5252 1375 : rhs_common_nlanes = 0;
5253 22518 : start = end;
5254 22518 : if (want_store_lanes || force_single_lane)
5255 5087 : end = start + 1;
5256 : else
5257 : end = group_size;
5258 : }
5259 : else
5260 : {
5261 5789 : substmts.release ();
5262 5789 : if (end - start == 1)
5263 : {
5264 : /* Single-lane discovery failed. Free ressources. */
5265 281 : for (auto node : rhs_nodes)
5266 8 : vect_free_slp_tree (node);
5267 261 : scalar_stmts.release ();
5268 261 : if (dump_enabled_p ())
5269 39 : dump_printf_loc (MSG_NOTE, vect_location,
5270 : "SLP discovery failed\n");
5271 261 : return false;
5272 : }
5273 :
5274 : /* ??? It really happens that we soft-fail SLP
5275 : build at a mismatch but the matching part hard-fails
5276 : later. As we know we arrived here with a group
5277 : larger than one try a group of size one! */
5278 5528 : if (!matches[0])
5279 44 : end = start + 1;
5280 : else
5281 12067 : for (unsigned j = start; j < end; j++)
5282 12067 : if (!matches[j - start])
5283 : {
5284 : end = j;
5285 : break;
5286 : }
5287 : }
5288 : }
5289 :
5290 : /* Now re-assess whether we want store lanes in case the
5291 : discovery ended up producing all single-lane RHSs. */
5292 7712 : if (! want_store_lanes
5293 7712 : && rhs_common_nlanes == 1
5294 6655 : && ! STMT_VINFO_GATHER_SCATTER_P (stmt_info)
5295 6655 : && ! STMT_VINFO_STRIDED_P (stmt_info)
5296 4952 : && ! STMT_VINFO_SLP_VECT_ONLY (stmt_info)
5297 4949 : && compare_step_with_zero (vinfo, stmt_info) > 0
5298 12606 : && (vect_store_lanes_supported (SLP_TREE_VECTYPE (rhs_nodes[0]),
5299 : group_size, masked_p)
5300 : != IFN_LAST))
5301 : want_store_lanes = true;
5302 :
5303 : /* Now we assume we can build the root SLP node from all stores. */
5304 7712 : if (want_store_lanes)
5305 : {
5306 : /* For store-lanes feed the store node with all RHS nodes
5307 : in order. */
5308 0 : node = vect_create_new_slp_node (scalar_stmts,
5309 0 : SLP_TREE_CHILDREN
5310 : (rhs_nodes[0]).length ());
5311 0 : SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (rhs_nodes[0]);
5312 0 : node->max_nunits = max_nunits;
5313 0 : node->ldst_lanes = true;
5314 0 : SLP_TREE_CHILDREN (node)
5315 0 : .reserve_exact (SLP_TREE_CHILDREN (rhs_nodes[0]).length ()
5316 0 : + rhs_nodes.length () - 1);
5317 : /* First store value and possibly mask. */
5318 0 : SLP_TREE_CHILDREN (node)
5319 0 : .splice (SLP_TREE_CHILDREN (rhs_nodes[0]));
5320 : /* Rest of the store values. All mask nodes are the same,
5321 : this should be guaranteed by dataref group discovery. */
5322 0 : for (unsigned j = 1; j < rhs_nodes.length (); ++j)
5323 0 : SLP_TREE_CHILDREN (node)
5324 0 : .quick_push (SLP_TREE_CHILDREN (rhs_nodes[j])[0]);
5325 0 : for (slp_tree child : SLP_TREE_CHILDREN (node))
5326 0 : child->refcnt++;
5327 : }
5328 : else
5329 7712 : node = vect_build_slp_store_interleaving (rhs_nodes, scalar_stmts,
5330 : max_nunits);
5331 :
5332 30222 : while (!rhs_nodes.is_empty ())
5333 22510 : vect_free_slp_tree (rhs_nodes.pop ());
5334 :
5335 : /* Create a new SLP instance. */
5336 7712 : slp_instance new_instance = XNEW (class _slp_instance);
5337 7712 : SLP_INSTANCE_TREE (new_instance) = node;
5338 7712 : SLP_INSTANCE_LOADS (new_instance) = vNULL;
5339 7712 : SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
5340 7712 : SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
5341 7712 : SLP_INSTANCE_KIND (new_instance) = kind;
5342 7712 : new_instance->reduc_phis = NULL;
5343 7712 : new_instance->cost_vec = vNULL;
5344 7712 : new_instance->subgraph_entries = vNULL;
5345 :
5346 7712 : if (dump_enabled_p ())
5347 844 : dump_printf_loc (MSG_NOTE, vect_location,
5348 : "SLP size %u vs. limit %u.\n",
5349 : tree_size, max_tree_size);
5350 :
5351 7712 : vinfo->slp_instances.safe_push (new_instance);
5352 :
5353 : /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
5354 : the number of scalar stmts in the root in a few places.
5355 : Verify that assumption holds. */
5356 15424 : gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
5357 : .length () == group_size);
5358 :
5359 7712 : if (dump_enabled_p ())
5360 : {
5361 844 : dump_printf_loc (MSG_NOTE, vect_location,
5362 : "Final SLP tree for instance %p:\n",
5363 : (void *) new_instance);
5364 844 : vect_print_slp_graph (MSG_NOTE, vect_location,
5365 : SLP_INSTANCE_TREE (new_instance));
5366 : }
5367 7712 : return true;
5368 7973 : }
5369 : else
5370 : /* Free the allocated memory. */
5371 249722 : scalar_stmts.release ();
5372 :
5373 : /* Even though the first vector did not all match, we might be able to SLP
5374 : (some) of the remainder. FORNOW ignore this possibility. */
5375 : }
5376 : else
5377 : /* Free the allocated memory. */
5378 1 : scalar_stmts.release ();
5379 :
5380 : /* Failed to SLP. */
5381 249723 : if (dump_enabled_p ())
5382 42 : dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
5383 : return false;
5384 : }
5385 :
5386 : /* qsort comparator ordering SLP load nodes. */
5387 :
5388 : static int
5389 2634002 : vllp_cmp (const void *a_, const void *b_)
5390 : {
5391 2634002 : const slp_tree a = *(const slp_tree *)a_;
5392 2634002 : const slp_tree b = *(const slp_tree *)b_;
5393 2634002 : stmt_vec_info a0 = SLP_TREE_SCALAR_STMTS (a)[0];
5394 2634002 : stmt_vec_info b0 = SLP_TREE_SCALAR_STMTS (b)[0];
5395 2634002 : if (STMT_VINFO_GROUPED_ACCESS (a0)
5396 1536055 : && STMT_VINFO_GROUPED_ACCESS (b0)
5397 4108635 : && DR_GROUP_FIRST_ELEMENT (a0) == DR_GROUP_FIRST_ELEMENT (b0))
5398 : {
5399 : /* Same group, order after lanes used. */
5400 343047 : if (SLP_TREE_LANES (a) < SLP_TREE_LANES (b))
5401 : return 1;
5402 334264 : else if (SLP_TREE_LANES (a) > SLP_TREE_LANES (b))
5403 : return -1;
5404 : else
5405 : {
5406 : /* Try to order loads using the same lanes together, breaking
5407 : the tie with the lane number that first differs. */
5408 324730 : if (!SLP_TREE_LOAD_PERMUTATION (a).exists ()
5409 324730 : && !SLP_TREE_LOAD_PERMUTATION (b).exists ())
5410 : return 0;
5411 324730 : else if (SLP_TREE_LOAD_PERMUTATION (a).exists ()
5412 324730 : && !SLP_TREE_LOAD_PERMUTATION (b).exists ())
5413 : return 1;
5414 320687 : else if (!SLP_TREE_LOAD_PERMUTATION (a).exists ()
5415 320687 : && SLP_TREE_LOAD_PERMUTATION (b).exists ())
5416 : return -1;
5417 : else
5418 : {
5419 313301 : for (unsigned i = 0; i < SLP_TREE_LANES (a); ++i)
5420 313301 : if (SLP_TREE_LOAD_PERMUTATION (a)[i]
5421 313301 : != SLP_TREE_LOAD_PERMUTATION (b)[i])
5422 : {
5423 : /* In-order lane first, that's what the above case for
5424 : no permutation does. */
5425 311989 : if (SLP_TREE_LOAD_PERMUTATION (a)[i] == i)
5426 : return -1;
5427 191521 : else if (SLP_TREE_LOAD_PERMUTATION (b)[i] == i)
5428 : return 1;
5429 100787 : else if (SLP_TREE_LOAD_PERMUTATION (a)[i]
5430 100787 : < SLP_TREE_LOAD_PERMUTATION (b)[i])
5431 : return -1;
5432 : else
5433 : return 1;
5434 : }
5435 : return 0;
5436 : }
5437 : }
5438 : }
5439 : else /* Different groups or non-groups. */
5440 : {
5441 : /* Order groups as their first element to keep them together. */
5442 2290955 : if (STMT_VINFO_GROUPED_ACCESS (a0))
5443 2290955 : a0 = DR_GROUP_FIRST_ELEMENT (a0);
5444 2290955 : if (STMT_VINFO_GROUPED_ACCESS (b0))
5445 2290955 : b0 = DR_GROUP_FIRST_ELEMENT (b0);
5446 2290955 : if (a0 == b0)
5447 : return 0;
5448 : /* Tie using UID. */
5449 2290835 : else if (gimple_uid (STMT_VINFO_STMT (a0))
5450 2290835 : < gimple_uid (STMT_VINFO_STMT (b0)))
5451 : return -1;
5452 : else
5453 : {
5454 1017532 : gcc_assert (gimple_uid (STMT_VINFO_STMT (a0))
5455 : != gimple_uid (STMT_VINFO_STMT (b0)));
5456 : return 1;
5457 : }
5458 : }
5459 : }
5460 :
5461 : /* Return whether if the load permutation of NODE is consecutive starting
5462 : with value START_VAL in the first element. If START_VAL is not given
5463 : the first element's value is used. */
5464 :
5465 : bool
5466 619295 : vect_load_perm_consecutive_p (slp_tree node, unsigned start_val)
5467 : {
5468 619295 : load_permutation_t perm = SLP_TREE_LOAD_PERMUTATION (node);
5469 :
5470 619295 : if (!perm.exists () || !perm.length ())
5471 : return false;
5472 :
5473 619295 : if (start_val == UINT_MAX)
5474 79156 : start_val = perm[0];
5475 :
5476 1222836 : for (unsigned int i = 0; i < perm.length (); i++)
5477 626572 : if (perm[i] != start_val + (unsigned int) i)
5478 : return false;
5479 :
5480 : return true;
5481 : }
5482 :
/* Process the set of LOADS that are all from the same dataref group.
   Either mark the loads for a load-lanes scheme (single-lane nodes with
   consecutive permutations and target support) or lower unsupported
   load permutations into a chain of VEC_PERM_EXPR nodes implementing
   an even/odd interleaving scheme, replacing each load node's load
   permutation by a lane permutation on an unpermuted group load.  */

static void
vect_lower_load_permutations (loop_vec_info loop_vinfo,
			      scalar_stmts_to_slp_tree_map_t *bst_map,
			      const array_slice<slp_tree> &loads,
			      bool force_single_lane)
{
  /* We at this point want to lower without a fixed VF or vector
     size in mind which means we cannot actually compute whether we
     need three or more vectors for a load permutation yet.  So always
     lower.  */
  stmt_vec_info first
    = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (loads[0])[0]);
  unsigned group_lanes = DR_GROUP_SIZE (first);

  /* Verify if all load permutations can be implemented with a suitably
     large element load-lanes operation.  */
  unsigned ld_lanes_lanes = SLP_TREE_LANES (loads[0]);
  if (STMT_VINFO_STRIDED_P (first)
      || compare_step_with_zero (loop_vinfo, first) <= 0
      || exact_log2 (ld_lanes_lanes) == -1
      /* ??? For now only support the single-lane case as there is
	 missing support on the store-lane side and code generation
	 isn't up to the task yet.  */
      || ld_lanes_lanes != 1
      || vect_load_lanes_supported (SLP_TREE_VECTYPE (loads[0]),
				    group_lanes / ld_lanes_lanes,
				    false) == IFN_LAST)
    ld_lanes_lanes = 0;
  else
    /* Verify the loads access the same number of lanes aligned to
       ld_lanes_lanes.  ld_lanes_lanes == 0 below means "no load-lanes".  */
    for (slp_tree load : loads)
      {
	if (SLP_TREE_LANES (load) != ld_lanes_lanes)
	  {
	    ld_lanes_lanes = 0;
	    break;
	  }
	unsigned first = SLP_TREE_LOAD_PERMUTATION (load)[0];
	if (first % ld_lanes_lanes != 0)
	  {
	    ld_lanes_lanes = 0;
	    break;
	  }
	if (!vect_load_perm_consecutive_p (load))
	  {
	    ld_lanes_lanes = 0;
	    break;
	  }
      }

  /* Only a power-of-two number of lanes matches interleaving with N levels.
     The group_lanes == 3 case is handled by the fallback filling below.
     ??? An even number of lanes could be reduced to 1<<ceil_log2(N)-1 lanes
     at each step.  */
  if (ld_lanes_lanes == 0 && exact_log2 (group_lanes) == -1 && group_lanes != 3)
    return;

  for (slp_tree load : loads)
    {
      /* Leave masked or gather loads alone for now.  */
      if (!SLP_TREE_CHILDREN (load).is_empty ())
	continue;

      /* For single-element interleaving spanning multiple vectors avoid
	 lowering, we want to use VMAT_ELEMENTWISE later.  */
      if (ld_lanes_lanes == 0
	  && SLP_TREE_LANES (load) == 1
	  && !DR_GROUP_NEXT_ELEMENT (first)
	  && maybe_gt (group_lanes,
		       TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (load))))
	return;

      /* We want to pattern-match special cases here and keep those
	 alone.  Candidates are splats and load-lane.  */

      /* We need to lower only loads of less than half of the groups
	 lanes, including duplicate lanes.  Note this leaves nodes
	 with a non-1:1 load permutation around instead of canonicalizing
	 those into a load and a permute node.  Removing this early
	 check would do such canonicalization.  */
      if (SLP_TREE_LANES (load) >= (group_lanes + 1) / 2
	  && ld_lanes_lanes == 0)
	continue;

      /* Build the permute to get the original load permutation order.  */
      bool contiguous = vect_load_perm_consecutive_p (load);
      lane_permutation_t final_perm;
      final_perm.create (SLP_TREE_LANES (load));
      for (unsigned i = 0; i < SLP_TREE_LANES (load); ++i)
	final_perm.quick_push (
	  std::make_pair (0, SLP_TREE_LOAD_PERMUTATION (load)[i]));

      /* When the load permutation accesses a contiguous unpermuted,
	 power-of-two aligned and sized chunk leave the load alone.
	 We can likely (re-)load it more efficiently rather than
	 extracting it from the larger load.
	 ??? Long-term some of the lowering should move to where
	 the vector types involved are fixed.  */
      if (!force_single_lane
	  && ld_lanes_lanes == 0
	  && contiguous
	  && (SLP_TREE_LANES (load) > 1 || loads.size () == 1)
	  && pow2p_hwi (SLP_TREE_LANES (load))
	  && pow2p_hwi (group_lanes)
	  && SLP_TREE_LOAD_PERMUTATION (load)[0] % SLP_TREE_LANES (load) == 0
	  && group_lanes % SLP_TREE_LANES (load) == 0)
	{
	  final_perm.release ();
	  continue;
	}

      /* First build (and possibly re-use via bst_map) a load node for the
	 unpermuted group.  Gaps in the middle and on the end are
	 represented with NULL stmts.  */
      vec<stmt_vec_info> stmts;
      stmts.create (group_lanes);
      for (stmt_vec_info s = first; s; s = DR_GROUP_NEXT_ELEMENT (s))
	{
	  if (s != first)
	    for (unsigned i = 1; i < DR_GROUP_GAP (s); ++i)
	      stmts.quick_push (NULL);
	  stmts.quick_push (s);
	}
      for (unsigned i = 0; i < DR_GROUP_GAP (first); ++i)
	stmts.quick_push (NULL);
      poly_uint64 max_nunits = 1;
      bool *matches = XALLOCAVEC (bool, group_lanes);
      unsigned limit = 1;
      unsigned tree_size = 0;
      slp_tree l0 = vect_build_slp_tree (loop_vinfo, stmts,
					 group_lanes,
					 &max_nunits, matches, &limit,
					 &tree_size, bst_map);
      gcc_assert (!SLP_TREE_LOAD_PERMUTATION (l0).exists ());

      if (ld_lanes_lanes != 0)
	{
	  /* ??? If this is not in sync with what get_load_store_type
	     later decides the SLP representation is not good for other
	     store vectorization methods.  */
	  l0->ldst_lanes = true;
	  load->ldst_lanes = true;
	}

      /* Iteratively halve the group until the load's lanes cover at
	 least half of the remaining group.  */
      while (1)
	{
	  /* NOTE: intentionally shadows the outer group_lanes; from here
	     on we operate on the current (possibly reduced) group.  */
	  unsigned group_lanes = SLP_TREE_LANES (l0);
	  if (ld_lanes_lanes != 0
	      || SLP_TREE_LANES (load) >= (group_lanes + 1) / 2)
	    break;

	  /* Try to lower by reducing the group to half its size using an
	     interleaving scheme.  For this try to compute whether all
	     elements needed for this load are in even or odd elements of
	     an even/odd decomposition with N consecutive elements.
	     Thus { e, e, o, o, e, e, o, o } would be an even/odd
	     decomposition with N == 2.  */
	  /* ??? Only an even number of lanes can be handled this way, but
	     the fallback below could work for any number.  We have to make
	     sure to round up in that case.  */
	  gcc_assert ((group_lanes & 1) == 0 || group_lanes == 3);
	  unsigned even = 0, odd = 0;
	  if ((group_lanes & 1) == 0)
	    {
	      /* Start with all low bits set and clear (even) resp. keep
		 (odd) the bits of each used lane index.  */
	      even = (1 << ceil_log2 (group_lanes)) - 1;
	      odd = even;
	      for (auto l : final_perm)
		{
		  even &= ~l.second;
		  odd &= l.second;
		}
	    }

	  /* Now build an even or odd extraction from the unpermuted load.  */
	  lane_permutation_t perm;
	  perm.create ((group_lanes + 1) / 2);
	  unsigned even_level = even ? 1 << ctz_hwi (even) : 0;
	  unsigned odd_level = odd ? 1 << ctz_hwi (odd) : 0;
	  if (even_level
	      && group_lanes % (2 * even_level) == 0
	      /* ??? When code generating permutes we do not try to pun
		 to larger component modes so level != 1 isn't a natural
		 even/odd extract.  Prefer one if possible.  */
	      && (even_level == 1 || !odd_level || odd_level != 1))
	    {
	      /* { 0, 1, ... 4, 5 ..., } */
	      for (unsigned i = 0; i < group_lanes / 2 / even_level; ++i)
		for (unsigned j = 0; j < even_level; ++j)
		  perm.quick_push (std::make_pair (0, 2 * i * even_level + j));
	    }
	  else if (odd_level)
	    {
	      /* { ..., 2, 3, ... 6, 7 } */
	      gcc_assert (group_lanes % (2 * odd_level) == 0);
	      for (unsigned i = 0; i < group_lanes / 2 / odd_level; ++i)
		for (unsigned j = 0; j < odd_level; ++j)
		  perm.quick_push
		    (std::make_pair (0, (2 * i + 1) * odd_level + j));
	    }
	  else
	    {
	      /* As fallback extract all used lanes and fill to half the
		 group size by repeating the last element.
		 ??? This is quite a bad strategy for re-use - we could
		 brute force our way to find more optimal filling lanes to
		 maximize re-use when looking at all loads from the group.  */
	      auto_bitmap l;
	      for (auto p : final_perm)
		bitmap_set_bit (l, p.second);
	      unsigned i = 0;
	      bitmap_iterator bi;
	      EXECUTE_IF_SET_IN_BITMAP (l, 0, i, bi)
		perm.quick_push (std::make_pair (0, i));
	      while (perm.length () < (group_lanes + 1) / 2)
		perm.quick_push (perm.last ());
	    }

	  /* Update final_perm with the intermediate permute, remapping
	     each referenced lane to its position in PERM.  */
	  for (unsigned i = 0; i < final_perm.length (); ++i)
	    {
	      unsigned l = final_perm[i].second;
	      unsigned j;
	      for (j = 0; j < perm.length (); ++j)
		if (perm[j].second == l)
		  {
		    final_perm[i].second = j;
		    break;
		  }
	      gcc_assert (j < perm.length ());
	    }

	  /* And create scalar stmts.  */
	  vec<stmt_vec_info> perm_stmts;
	  perm_stmts.create (perm.length ());
	  for (unsigned i = 0; i < perm.length (); ++i)
	    perm_stmts.quick_push (SLP_TREE_SCALAR_STMTS (l0)[perm[i].second]);

	  slp_tree p = vect_create_new_slp_node (1, VEC_PERM_EXPR);
	  SLP_TREE_CHILDREN (p).quick_push (l0);
	  SLP_TREE_LANE_PERMUTATION (p) = perm;
	  SLP_TREE_VECTYPE (p) = SLP_TREE_VECTYPE (load);
	  SLP_TREE_LANES (p) = perm.length ();
	  SLP_TREE_REPRESENTATIVE (p) = SLP_TREE_REPRESENTATIVE (load);
	  /* ??? As we have scalar stmts for this intermediate permute we
	     could CSE it via bst_map but we do not want to pick up
	     another SLP node with a load permutation.  We instead should
	     have a "local" CSE map here.  */
	  SLP_TREE_SCALAR_STMTS (p) = perm_stmts;

	  /* We now have a node for (group_lanes + 1) / 2 lanes.  */
	  l0 = p;
	}

      /* And finally from the ordered reduction node create the
	 permute to shuffle the lanes into the original load-permutation
	 order.  We replace the original load node with this.  */
      SLP_TREE_CODE (load) = VEC_PERM_EXPR;
      SLP_TREE_LOAD_PERMUTATION (load).release ();
      SLP_TREE_LANE_PERMUTATION (load) = final_perm;
      SLP_TREE_CHILDREN (load).create (1);
      SLP_TREE_CHILDREN (load).quick_push (l0);
    }
}
5748 :
5749 : /* Transform SLP loads in the SLP graph created by SLP discovery to
5750 : group loads from the same group and lower load permutations that
5751 : are unlikely to be supported into a series of permutes.
5752 : In the degenerate case of having only single-lane SLP instances
5753 : this should result in a series of permute nodes emulating an
5754 : interleaving scheme. */
5755 :
5756 : static void
5757 470799 : vect_lower_load_permutations (loop_vec_info loop_vinfo,
5758 : scalar_stmts_to_slp_tree_map_t *bst_map,
5759 : bool force_single_lane)
5760 : {
5761 : /* Gather and sort loads across all instances. */
5762 470799 : hash_set<slp_tree> visited;
5763 470799 : auto_vec<slp_tree> loads;
5764 2165297 : for (auto inst : loop_vinfo->slp_instances)
5765 754820 : vect_gather_slp_loads (loads, SLP_INSTANCE_TREE (inst), visited);
5766 470799 : if (loads.is_empty ())
5767 89249 : return;
5768 381550 : loads.qsort (vllp_cmp);
5769 :
5770 : /* Now process each dataref group separately. */
5771 381550 : unsigned firsti = 0;
5772 714720 : for (unsigned i = 1; i < loads.length (); ++i)
5773 : {
5774 333170 : slp_tree first = loads[firsti];
5775 333170 : slp_tree next = loads[i];
5776 333170 : stmt_vec_info a0 = SLP_TREE_SCALAR_STMTS (first)[0];
5777 333170 : stmt_vec_info b0 = SLP_TREE_SCALAR_STMTS (next)[0];
5778 333170 : if (STMT_VINFO_GROUPED_ACCESS (a0)
5779 157187 : && STMT_VINFO_GROUPED_ACCESS (b0)
5780 477306 : && DR_GROUP_FIRST_ELEMENT (a0) == DR_GROUP_FIRST_ELEMENT (b0))
5781 62442 : continue;
5782 : /* Now we have one or multiple SLP loads of the same group from
5783 : firsti to i - 1. */
5784 270728 : if (STMT_VINFO_GROUPED_ACCESS (a0))
5785 94745 : vect_lower_load_permutations (loop_vinfo, bst_map,
5786 94745 : make_array_slice (&loads[firsti],
5787 : i - firsti),
5788 : force_single_lane);
5789 : firsti = i;
5790 : }
5791 763100 : if (firsti < loads.length ()
5792 763100 : && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (loads[firsti])[0]))
5793 65740 : vect_lower_load_permutations (loop_vinfo, bst_map,
5794 65740 : make_array_slice (&loads[firsti],
5795 65740 : loads.length () - firsti),
5796 : force_single_lane);
5797 470799 : }
5798 :
5799 : /* Check if there are stmts in the loop can be vectorized using SLP. Build SLP
5800 : trees of packed scalar stmts if SLP is possible. */
5801 :
5802 : opt_result
5803 1105777 : vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
5804 : bool force_single_lane)
5805 : {
5806 1105777 : loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5807 1105777 : unsigned int i;
5808 1105777 : stmt_vec_info first_element;
5809 1105777 : slp_instance instance;
5810 :
5811 1105777 : DUMP_VECT_SCOPE ("vect_analyze_slp");
5812 :
5813 1105777 : unsigned limit = max_tree_size;
5814 :
5815 1105777 : scalar_stmts_to_slp_tree_map_t *bst_map
5816 1105777 : = new scalar_stmts_to_slp_tree_map_t ();
5817 :
5818 : /* Find SLP sequences starting from groups of grouped stores. */
5819 3125594 : FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
5820 914309 : if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
5821 : slp_inst_kind_store, max_tree_size, &limit,
5822 : force_single_lane)
5823 914309 : && loop_vinfo)
5824 : {
5825 269 : release_scalar_stmts_to_slp_tree_map (bst_map);
5826 269 : return opt_result::failure_at (vect_location, "SLP build failed.\n");
5827 : }
5828 :
5829 : /* For loops also start SLP discovery from non-grouped stores. */
5830 1105508 : if (loop_vinfo)
5831 : {
5832 : data_reference_p dr;
5833 1620564 : FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
5834 1132085 : if (DR_IS_WRITE (dr))
5835 : {
5836 368441 : stmt_vec_info stmt_info = vinfo->lookup_dr (dr)->stmt;
5837 : /* Grouped stores are already handled above. */
5838 368441 : if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
5839 99342 : continue;
5840 269099 : vec<stmt_vec_info> stmts;
5841 269099 : vec<stmt_vec_info> roots = vNULL;
5842 269099 : vec<tree> remain = vNULL;
5843 269099 : stmts.create (1);
5844 269099 : stmts.quick_push (stmt_info);
5845 269099 : if (! vect_build_slp_instance (vinfo, slp_inst_kind_store,
5846 : stmts, roots, remain, max_tree_size,
5847 : &limit, bst_map, force_single_lane))
5848 : {
5849 6929 : release_scalar_stmts_to_slp_tree_map (bst_map);
5850 6929 : return opt_result::failure_at (vect_location,
5851 : "SLP build failed.\n");
5852 : }
5853 : }
5854 :
5855 : stmt_vec_info stmt_info;
5856 488519 : FOR_EACH_VEC_ELT (LOOP_VINFO_ALTERNATE_DEFS (loop_vinfo), i, stmt_info)
5857 : {
5858 20 : vec<stmt_vec_info> stmts;
5859 20 : vec<stmt_vec_info> roots = vNULL;
5860 20 : vec<tree> remain = vNULL;
5861 20 : stmts.create (1);
5862 20 : stmts.quick_push (stmt_info);
5863 20 : if (! vect_build_slp_instance (vinfo, slp_inst_kind_store,
5864 : stmts, roots, remain, max_tree_size,
5865 : &limit, bst_map, force_single_lane))
5866 : {
5867 0 : release_scalar_stmts_to_slp_tree_map (bst_map);
5868 0 : return opt_result::failure_at (vect_location,
5869 : "SLP build failed.\n");
5870 : }
5871 : }
5872 : }
5873 :
5874 1098579 : if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
5875 : {
5876 1829331 : for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
5877 : {
5878 1219231 : vect_location = bb_vinfo->roots[i].roots[0]->stmt;
5879 : /* Apply patterns. */
5880 3812183 : for (unsigned j = 0; j < bb_vinfo->roots[i].stmts.length (); ++j)
5881 5185904 : bb_vinfo->roots[i].stmts[j]
5882 2672028 : = vect_stmt_to_vectorize (bb_vinfo->roots[i].stmts[j]);
5883 1219231 : if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
5884 1219231 : bb_vinfo->roots[i].stmts,
5885 1219231 : bb_vinfo->roots[i].roots,
5886 1219231 : bb_vinfo->roots[i].remain,
5887 : max_tree_size, &limit, bst_map, false))
5888 : {
5889 127790 : bb_vinfo->roots[i].roots = vNULL;
5890 127790 : bb_vinfo->roots[i].remain = vNULL;
5891 : }
5892 1219231 : bb_vinfo->roots[i].stmts = vNULL;
5893 : }
5894 : }
5895 :
5896 1098579 : if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
5897 : {
5898 : /* Find SLP sequences starting from groups of reductions. */
5899 488479 : if (!vect_analyze_slp_reductions (loop_vinfo, max_tree_size, &limit,
5900 : bst_map, force_single_lane))
5901 : {
5902 3008 : release_scalar_stmts_to_slp_tree_map (bst_map);
5903 3008 : return opt_result::failure_at (vect_location, "SLP build failed.\n");
5904 : }
5905 :
5906 : /* Make sure to vectorize only-live stmts, usually inductions. */
5907 2187289 : for (edge e : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
5908 1416093 : for (auto gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi);
5909 675953 : gsi_next (&gsi))
5910 : {
5911 685217 : gphi *lc_phi = *gsi;
5912 685217 : tree def = gimple_phi_arg_def_from_edge (lc_phi, e);
5913 685217 : stmt_vec_info stmt_info;
5914 685217 : if (TREE_CODE (def) == SSA_NAME
5915 573196 : && !virtual_operand_p (def)
5916 297765 : && (stmt_info = loop_vinfo->lookup_def (def))
5917 266805 : && ((stmt_info = vect_stmt_to_vectorize (stmt_info)), true)
5918 266805 : && STMT_VINFO_RELEVANT (stmt_info) == vect_used_only_live
5919 207090 : && STMT_VINFO_LIVE_P (stmt_info)
5920 207090 : && !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))
5921 790880 : && STMT_VINFO_REDUC_IDX (stmt_info) == -1)
5922 : {
5923 105576 : vec<stmt_vec_info> stmts;
5924 105576 : vec<stmt_vec_info> roots = vNULL;
5925 105576 : vec<tree> remain = vNULL;
5926 105576 : stmts.create (1);
5927 105576 : stmts.quick_push (vect_stmt_to_vectorize (stmt_info));
5928 105576 : if (! vect_build_slp_instance (vinfo,
5929 : slp_inst_kind_reduc_group,
5930 : stmts, roots, remain,
5931 : max_tree_size, &limit,
5932 : bst_map, force_single_lane))
5933 : {
5934 9264 : release_scalar_stmts_to_slp_tree_map (bst_map);
5935 9264 : return opt_result::failure_at (vect_location,
5936 : "SLP build failed.\n");
5937 : }
5938 : }
5939 9264 : }
5940 :
5941 : /* Find SLP sequences starting from gconds. */
5942 1181640 : for (auto cond : LOOP_VINFO_LOOP_CONDS (loop_vinfo))
5943 : {
5944 277281 : auto cond_info = loop_vinfo->lookup_stmt (cond);
5945 :
5946 277281 : cond_info = vect_stmt_to_vectorize (cond_info);
5947 277281 : vec<stmt_vec_info> roots = vNULL;
5948 277281 : roots.safe_push (cond_info);
5949 277281 : gimple *stmt = STMT_VINFO_STMT (cond_info);
5950 277281 : tree args0 = gimple_cond_lhs (stmt);
5951 277281 : tree args1 = gimple_cond_rhs (stmt);
5952 :
5953 : /* These should be enforced by cond lowering, but if it failed
5954 : bail. */
5955 277281 : if (gimple_cond_code (stmt) != NE_EXPR
5956 276201 : || TREE_TYPE (args0) != boolean_type_node
5957 552916 : || !integer_zerop (args1))
5958 : {
5959 1646 : roots.release ();
5960 1646 : release_scalar_stmts_to_slp_tree_map (bst_map);
5961 1646 : return opt_result::failure_at (vect_location,
5962 : "SLP build failed.\n");
5963 : }
5964 :
5965 : /* An argument without a loop def will be codegened from vectorizing the
5966 : root gcond itself. As such we don't need to try to build an SLP tree
5967 : from them. It's highly likely that the resulting SLP tree here if both
5968 : arguments have a def will be incompatible, but we rely on it being split
5969 : later on. */
5970 275635 : auto varg = loop_vinfo->lookup_def (args0);
5971 275635 : vec<stmt_vec_info> stmts;
5972 275635 : vec<tree> remain = vNULL;
5973 275635 : stmts.create (1);
5974 275635 : stmts.quick_push (vect_stmt_to_vectorize (varg));
5975 :
5976 275635 : if (! vect_build_slp_instance (vinfo, slp_inst_kind_gcond,
5977 : stmts, roots, remain,
5978 : max_tree_size, &limit,
5979 : bst_map, force_single_lane))
5980 : {
5981 3762 : roots.release ();
5982 3762 : release_scalar_stmts_to_slp_tree_map (bst_map);
5983 3762 : return opt_result::failure_at (vect_location,
5984 : "SLP build failed.\n");
5985 : }
5986 : }
5987 : }
5988 :
5989 1080899 : hash_set<slp_tree> visited_patterns;
5990 1080899 : slp_tree_to_load_perm_map_t perm_cache;
5991 1080899 : slp_compat_nodes_map_t compat_cache;
5992 :
5993 : /* See if any patterns can be found in the SLP tree. */
5994 1080899 : bool pattern_found = false;
5995 3701708 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
5996 1539910 : pattern_found |= vect_match_slp_patterns (instance, vinfo,
5997 : &visited_patterns, &perm_cache,
5998 : &compat_cache);
5999 :
6000 : /* If any were found optimize permutations of loads. */
6001 1080899 : if (pattern_found)
6002 : {
6003 285 : hash_map<slp_tree, slp_tree> load_map;
6004 3421 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
6005 : {
6006 2851 : slp_tree root = SLP_INSTANCE_TREE (instance);
6007 2851 : optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
6008 : &load_map, root);
6009 : }
6010 285 : }
6011 :
6012 : /* Check whether we should force some SLP instances to use load/store-lanes
6013 : and do so by forcing SLP re-discovery with single lanes. We used
6014 : to cancel SLP when this applied to all instances in a loop but now
6015 : we decide this per SLP instance. It's important to do this only
6016 : after SLP pattern recognition. */
6017 1080899 : if (is_a <loop_vec_info> (vinfo))
6018 1225619 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
6019 754820 : if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
6020 289225 : && !SLP_INSTANCE_TREE (instance)->ldst_lanes)
6021 : {
6022 289225 : slp_tree slp_root = SLP_INSTANCE_TREE (instance);
6023 289225 : unsigned int group_size = SLP_TREE_LANES (slp_root);
6024 289225 : tree vectype = SLP_TREE_VECTYPE (slp_root);
6025 :
6026 289225 : stmt_vec_info rep_info = SLP_TREE_REPRESENTATIVE (slp_root);
6027 289225 : gimple *rep = STMT_VINFO_STMT (rep_info);
6028 289225 : bool masked = (is_gimple_call (rep)
6029 2556 : && gimple_call_internal_p (rep)
6030 291761 : && internal_fn_mask_index
6031 2536 : (gimple_call_internal_fn (rep)) != -1);
6032 289205 : if (!STMT_VINFO_GROUPED_ACCESS (rep_info)
6033 28845 : || slp_root->ldst_lanes
6034 318070 : || (vect_store_lanes_supported (vectype, group_size, masked)
6035 : == IFN_LAST))
6036 289225 : continue;
6037 :
6038 0 : auto_vec<slp_tree> loads;
6039 0 : hash_set<slp_tree> visited;
6040 0 : vect_gather_slp_loads (loads, slp_root, visited);
6041 :
6042 : /* Check whether any load in the SLP instance is possibly
6043 : permuted. */
6044 0 : bool loads_permuted = false;
6045 0 : slp_tree load_node;
6046 0 : unsigned j;
6047 0 : FOR_EACH_VEC_ELT (loads, j, load_node)
6048 : {
6049 0 : if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
6050 0 : continue;
6051 : unsigned k;
6052 : stmt_vec_info load_info;
6053 0 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), k, load_info)
6054 0 : if (SLP_TREE_LOAD_PERMUTATION (load_node)[k] != k)
6055 : {
6056 : loads_permuted = true;
6057 : break;
6058 : }
6059 : }
6060 :
6061 : /* If the loads and stores can use load/store-lanes force re-discovery
6062 : with single lanes. */
6063 0 : if (loads_permuted)
6064 : {
6065 0 : bool can_use_lanes = true;
6066 : bool prefer_load_lanes = false;
6067 0 : FOR_EACH_VEC_ELT (loads, j, load_node)
6068 0 : if (STMT_VINFO_GROUPED_ACCESS
6069 : (SLP_TREE_REPRESENTATIVE (load_node)))
6070 : {
6071 0 : stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
6072 : (SLP_TREE_REPRESENTATIVE (load_node));
6073 0 : rep = STMT_VINFO_STMT (stmt_vinfo);
6074 0 : masked = (is_gimple_call (rep)
6075 0 : && gimple_call_internal_p (rep)
6076 0 : && internal_fn_mask_index
6077 0 : (gimple_call_internal_fn (rep)));
6078 : /* Use SLP for strided accesses (or if we can't
6079 : load-lanes). */
6080 0 : if (STMT_VINFO_STRIDED_P (stmt_vinfo)
6081 0 : || compare_step_with_zero (vinfo, stmt_vinfo) <= 0
6082 0 : || vect_load_lanes_supported
6083 0 : (SLP_TREE_VECTYPE (load_node),
6084 0 : DR_GROUP_SIZE (stmt_vinfo), masked) == IFN_LAST
6085 : /* ??? During SLP re-discovery with a single lane
6086 : a masked grouped load will appear permuted and
6087 : discovery will fail. We have to rework this
6088 : on the discovery side - for now avoid ICEing. */
6089 0 : || masked)
6090 : {
6091 : can_use_lanes = false;
6092 : break;
6093 : }
6094 : /* Make sure that the target would prefer store-lanes
6095 : for at least one of the loads.
6096 :
6097 : ??? Perhaps we should instead require this for
6098 : all loads? */
6099 0 : prefer_load_lanes
6100 : = (prefer_load_lanes
6101 0 : || SLP_TREE_LANES (load_node) == group_size
6102 0 : || (vect_slp_prefer_store_lanes_p
6103 0 : (vinfo, stmt_vinfo,
6104 : SLP_TREE_VECTYPE (load_node), masked,
6105 : group_size, SLP_TREE_LANES (load_node))));
6106 : }
6107 :
6108 0 : if (can_use_lanes && prefer_load_lanes)
6109 : {
6110 0 : if (dump_enabled_p ())
6111 0 : dump_printf_loc (MSG_NOTE, vect_location,
6112 : "SLP instance %p can use load/store-lanes,"
6113 : " re-discovering with single-lanes\n",
6114 : (void *) instance);
6115 :
6116 0 : stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (slp_root);
6117 :
6118 0 : vect_free_slp_instance (instance);
6119 0 : limit = max_tree_size;
6120 0 : bool res = vect_analyze_slp_instance (vinfo, bst_map,
6121 : stmt_info,
6122 : slp_inst_kind_store,
6123 : max_tree_size, &limit,
6124 : true);
6125 0 : gcc_assert (res);
6126 0 : auto new_inst = LOOP_VINFO_SLP_INSTANCES (vinfo).pop ();
6127 0 : LOOP_VINFO_SLP_INSTANCES (vinfo)[i] = new_inst;
6128 : }
6129 : }
6130 0 : }
6131 :
6132 : /* When we end up with load permutations that we cannot possibly handle,
6133 : like those requiring three vector inputs, lower them using interleaving
6134 : like schemes. */
6135 1080899 : if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
6136 : {
6137 470799 : vect_lower_load_permutations (loop_vinfo, bst_map, force_single_lane);
6138 470799 : if (dump_enabled_p ())
6139 : {
6140 19939 : dump_printf_loc (MSG_NOTE, vect_location,
6141 : "SLP graph after lowering permutations:\n");
6142 19939 : hash_set<slp_tree> visited;
6143 88917 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
6144 29125 : vect_print_slp_graph (MSG_NOTE, vect_location,
6145 : SLP_INSTANCE_TREE (instance), visited);
6146 19939 : }
6147 : }
6148 :
6149 1080899 : release_scalar_stmts_to_slp_tree_map (bst_map);
6150 :
6151 1080899 : if (pattern_found && dump_enabled_p ())
6152 : {
6153 18 : dump_printf_loc (MSG_NOTE, vect_location,
6154 : "Pattern matched SLP tree\n");
6155 18 : hash_set<slp_tree> visited;
6156 90 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
6157 36 : vect_print_slp_graph (MSG_NOTE, vect_location,
6158 : SLP_INSTANCE_TREE (instance), visited);
6159 18 : }
6160 :
6161 1080899 : return opt_result::success ();
6162 1080899 : }
6163 :
6164 : /* Estimates the cost of inserting layout changes into the SLP graph.
6165 : It can also say that the insertion is impossible. */
6166 :
struct slpg_layout_cost
{
  slpg_layout_cost () = default;
  slpg_layout_cost (sreal, bool);

  /* Sentinel cost for a partition/layout combination that cannot be used.  */
  static slpg_layout_cost impossible () { return { sreal::max (), 0 }; }
  /* Return true unless this cost is the impossible () sentinel.  */
  bool is_possible () const { return depth != sreal::max (); }

  bool operator== (const slpg_layout_cost &) const;
  bool operator!= (const slpg_layout_cost &) const;

  bool is_better_than (const slpg_layout_cost &, bool) const;

  void add_parallel_cost (const slpg_layout_cost &);
  void add_serial_cost (const slpg_layout_cost &);
  void split (unsigned int);

  /* The longest sequence of layout changes needed during any traversal
     of the partition dag, weighted by execution frequency.

     This is the most important metric when optimizing for speed, since
     it helps to ensure that we keep the number of operations on
     critical paths to a minimum.  */
  sreal depth = 0;

  /* An estimate of the total number of operations needed.  It is weighted by
     execution frequency when optimizing for speed but not when optimizing for
     size.  In order to avoid double-counting, a node with a fanout of N will
     distribute 1/N of its total cost to each successor.

     This is the most important metric when optimizing for size, since
     it helps to keep the total number of operations to a minimum.  */
  sreal total = 0;
};
6201 :
6202 : /* Construct costs for a node with weight WEIGHT. A higher weight
6203 : indicates more frequent execution. IS_FOR_SIZE is true if we are
6204 : optimizing for size rather than speed. */
6205 :
6206 1296275 : slpg_layout_cost::slpg_layout_cost (sreal weight, bool is_for_size)
6207 1297143 : : depth (weight), total (is_for_size && weight > 0 ? 1 : weight)
6208 : {
6209 1296275 : }
6210 :
6211 : bool
6212 0 : slpg_layout_cost::operator== (const slpg_layout_cost &other) const
6213 : {
6214 0 : return depth == other.depth && total == other.total;
6215 : }
6216 :
6217 : bool
6218 0 : slpg_layout_cost::operator!= (const slpg_layout_cost &other) const
6219 : {
6220 0 : return !operator== (other);
6221 : }
6222 :
6223 : /* Return true if these costs are better than OTHER. IS_FOR_SIZE is
6224 : true if we are optimizing for size rather than speed. */
6225 :
6226 : bool
6227 320448 : slpg_layout_cost::is_better_than (const slpg_layout_cost &other,
6228 : bool is_for_size) const
6229 : {
6230 320448 : if (is_for_size)
6231 : {
6232 382 : if (total != other.total)
6233 159 : return total < other.total;
6234 223 : return depth < other.depth;
6235 : }
6236 : else
6237 : {
6238 320066 : if (depth != other.depth)
6239 136331 : return depth < other.depth;
6240 183735 : return total < other.total;
6241 : }
6242 : }
6243 :
6244 : /* Increase the costs to account for something with cost INPUT_COST
6245 : happening in parallel with the current costs. */
6246 :
6247 : void
6248 384014 : slpg_layout_cost::add_parallel_cost (const slpg_layout_cost &input_cost)
6249 : {
6250 384014 : depth = std::max (depth, input_cost.depth);
6251 384014 : total += input_cost.total;
6252 384014 : }
6253 :
6254 : /* Increase the costs to account for something with cost INPUT_COST
6255 : happening in series with the current costs. */
6256 :
6257 : void
6258 1554342 : slpg_layout_cost::add_serial_cost (const slpg_layout_cost &other)
6259 : {
6260 1554342 : depth += other.depth;
6261 1554342 : total += other.total;
6262 1554342 : }
6263 :
6264 : /* Split the total cost among TIMES successors or predecessors. */
6265 :
6266 : void
6267 1293208 : slpg_layout_cost::split (unsigned int times)
6268 : {
6269 1293208 : if (times > 1)
6270 566190 : total /= times;
6271 1293208 : }
6272 :
6273 : /* Information about one node in the SLP graph, for use during
6274 : vect_optimize_slp_pass. */
6275 :
struct slpg_vertex
{
  /* Wrap NODE_; every other field keeps its in-class default.  */
  slpg_vertex (slp_tree node_) : node (node_) {}

  /* The node itself.  */
  slp_tree node;

  /* Which partition the node belongs to, or -1 if none.  Nodes outside of
     partitions are flexible; they can have whichever layout consumers
     want them to have.  */
  int partition = -1;

  /* The number of nodes that directly use the result of this one
     (i.e. the number of nodes that count this one as a child).  */
  unsigned int out_degree = 0;

  /* The execution frequency of the node.  */
  sreal weight = 0;

  /* The total execution frequency of all nodes that directly use the
     result of this one.  */
  sreal out_weight = 0;
};
6299 :
6300 : /* Information about one partition of the SLP graph, for use during
6301 : vect_optimize_slp_pass. */
6302 :
struct slpg_partition_info
{
  /* The nodes in the partition occupy indices [NODE_BEGIN, NODE_END)
     of m_partitioned_nodes.  */
  unsigned int node_begin = 0;
  unsigned int node_end = 0;

  /* Which layout we've chosen to use for this partition, or -1 if
     we haven't picked one yet.  */
  int layout = -1;

  /* The number of predecessors and successors in the partition dag.
     The predecessors always have lower partition numbers and the
     successors always have higher partition numbers.

     Note that the directions of these edges are not necessarily the
     same as in the data flow graph.  For example, if an SCC has separate
     partitions for an inner loop and an outer loop, the inner loop's
     partition will have at least two incoming edges from the outer loop's
     partition: one for a live-in value and one for a live-out value.
     In data flow terms, one of these edges would also be from the outer loop
     to the inner loop, but the other would be in the opposite direction.  */
  unsigned int in_degree = 0;
  unsigned int out_degree = 0;
};
6328 :
6329 : /* Information about the costs of using a particular layout for a
6330 : particular partition. It can also say that the combination is
6331 : impossible. */
6332 :
struct slpg_partition_layout_costs
{
  /* Return true if this partition/layout combination is usable.  */
  bool is_possible () const { return internal_cost.is_possible (); }
  /* Flag this partition/layout combination as unusable.  */
  void mark_impossible () { internal_cost = slpg_layout_cost::impossible (); }

  /* The costs inherited from predecessor partitions.  */
  slpg_layout_cost in_cost;

  /* The inherent cost of the layout within the node itself.  For example,
     this is nonzero for a load if choosing a particular layout would require
     the load to permute the loaded elements.  It is nonzero for a
     VEC_PERM_EXPR if the permutation cannot be eliminated or converted
     to full-vector moves.  */
  slpg_layout_cost internal_cost;

  /* The costs inherited from successor partitions.  */
  slpg_layout_cost out_cost;
};
6351 :
6352 : /* This class tries to optimize the layout of vectors in order to avoid
6353 : unnecessary shuffling. At the moment, the set of possible layouts are
6354 : restricted to bijective permutations.
6355 :
6356 : The goal of the pass depends on whether we're optimizing for size or
6357 : for speed. When optimizing for size, the goal is to reduce the overall
6358 : number of layout changes (including layout changes implied by things
6359 : like load permutations). When optimizing for speed, the goal is to
6360 : reduce the maximum latency attributable to layout changes on any
6361 : non-cyclical path through the data flow graph.
6362 :
6363 : For example, when optimizing a loop nest for speed, we will prefer
6364 : to make layout changes outside of a loop rather than inside of a loop,
6365 : and will prefer to make layout changes in parallel rather than serially,
6366 : even if that increases the overall number of layout changes.
6367 :
6368 : The high-level procedure is:
6369 :
6370 : (1) Build a graph in which edges go from uses (parents) to definitions
6371 : (children).
6372 :
6373 : (2) Divide the graph into a dag of strongly-connected components (SCCs).
6374 :
6375 : (3) When optimizing for speed, partition the nodes in each SCC based
6376 : on their containing cfg loop. When optimizing for size, treat
6377 : each SCC as a single partition.
6378 :
6379 : This gives us a dag of partitions. The goal is now to assign a
6380 : layout to each partition.
6381 :
6382 : (4) Construct a set of vector layouts that are worth considering.
6383 : Record which nodes must keep their current layout.
6384 :
6385 : (5) Perform a forward walk over the partition dag (from loads to stores)
6386 : accumulating the "forward" cost of using each layout. When visiting
6387 : each partition, assign a tentative choice of layout to the partition
6388 : and use that choice when calculating the cost of using a different
6389 : layout in successor partitions.
6390 :
6391 : (6) Perform a backward walk over the partition dag (from stores to loads),
6392 : accumulating the "backward" cost of using each layout. When visiting
6393 : each partition, make a final choice of layout for that partition based
6394 : on the accumulated forward costs (from (5)) and backward costs
6395 : (from (6)).
6396 :
6397 : (7) Apply the chosen layouts to the SLP graph.
6398 :
6399 : For example, consider the SLP statements:
6400 :
6401 : S1: a_1 = load
6402 : loop:
6403 : S2: a_2 = PHI<a_1, a_3>
6404 : S3: b_1 = load
6405 : S4: a_3 = a_2 + b_1
6406 : exit:
6407 : S5: a_4 = PHI<a_3>
6408 : S6: store a_4
6409 :
6410 : S2 and S4 form an SCC and are part of the same loop. Every other
6411 : statement is in a singleton SCC. In this example there is a one-to-one
6412 : mapping between SCCs and partitions and the partition dag looks like this;
6413 :
6414 : S1 S3
6415 : \ /
6416 : S2+S4
6417 : |
6418 : S5
6419 : |
6420 : S6
6421 :
6422 : S2, S3 and S4 will have a higher execution frequency than the other
6423 : statements, so when optimizing for speed, the goal is to avoid any
6424 : layout changes:
6425 :
6426 : - within S3
6427 : - within S2+S4
6428 : - on the S3->S2+S4 edge
6429 :
6430 : For example, if S3 was originally a reversing load, the goal of the
6431 : pass is to make it an unreversed load and change the layout on the
6432 : S1->S2+S4 and S2+S4->S5 edges to compensate. (Changing the layout
6433 : on S1->S2+S4 and S5->S6 would also be acceptable.)
6434 :
6435 : The difference between SCCs and partitions becomes important if we
6436 : add an outer loop:
6437 :
6438 : S1: a_1 = ...
6439 : loop1:
6440 : S2: a_2 = PHI<a_1, a_6>
6441 : S3: b_1 = load
6442 : S4: a_3 = a_2 + b_1
6443 : loop2:
6444 : S5: a_4 = PHI<a_3, a_5>
6445 : S6: c_1 = load
6446 : S7: a_5 = a_4 + c_1
6447 : exit2:
6448 : S8: a_6 = PHI<a_5>
6449 : S9: store a_6
6450 : exit1:
6451 :
6452 : Here, S2, S4, S5, S7 and S8 form a single SCC. However, when optimizing
6453 : for speed, we usually do not want restrictions in the outer loop to "infect"
6454 : the decision for the inner loop. For example, if an outer-loop node
6455 : in the SCC contains a statement with a fixed layout, that should not
6456 : prevent the inner loop from using a different layout. Conversely,
6457 : the inner loop should not dictate a layout to the outer loop: if the
6458 : outer loop does a lot of computation, then it may not be efficient to
6459 : do all of that computation in the inner loop's preferred layout.
6460 :
6461 : So when optimizing for speed, we partition the SCC into S2+S4+S8 (outer)
6462 : and S5+S7 (inner). We also try to arrange partitions so that:
6463 :
6464 : - the partition for an outer loop comes before the partition for
6465 : an inner loop
6466 :
6467 : - if a sibling loop A dominates a sibling loop B, A's partition
6468 : comes before B's
6469 :
6470 : This gives the following partition dag for the example above:
6471 :
6472 : S1 S3
6473 : \ /
6474 : S2+S4+S8 S6
6475 : | \\ /
6476 : | S5+S7
6477 : |
6478 : S9
6479 :
6480 : There are two edges from S2+S4+S8 to S5+S7: one for the edge S4->S5 and
6481 : one for a reversal of the edge S7->S8.
6482 :
6483 : The backward walk picks a layout for S5+S7 before S2+S4+S8. The choice
6484 : for S2+S4+S8 therefore has to balance the cost of using the outer loop's
6485 : preferred layout against the cost of changing the layout on entry to the
6486 : inner loop (S4->S5) and on exit from the inner loop (S7->S8 reversed).
6487 :
6488 : Although this works well when optimizing for speed, it has the downside
6489 : when optimizing for size that the choice of layout for S5+S7 is completely
6490 : independent of S9, which lessens the chance of reducing the overall number
6491 : of permutations. We therefore do not partition SCCs when optimizing
6492 : for size.
6493 :
6494 : To give a concrete example of the difference between optimizing
6495 : for size and speed, consider:
6496 :
6497 : a[0] = (b[1] << c[3]) - d[1];
6498 : a[1] = (b[0] << c[2]) - d[0];
6499 : a[2] = (b[3] << c[1]) - d[3];
6500 : a[3] = (b[2] << c[0]) - d[2];
6501 :
6502 : There are three different layouts here: one for a, one for b and d,
6503 : and one for c. When optimizing for speed it is better to permute each
6504 : of b, c and d into the order required by a, since those permutations
6505 : happen in parallel. But when optimizing for size, it is better to:
6506 :
6507 : - permute c into the same order as b
6508 : - do the arithmetic
6509 : - permute the result into the order required by a
6510 :
6511 : This gives 2 permutations rather than 3. */
6512 :
class vect_optimize_slp_pass
{
public:
  /* Record the vec_info to optimize; all remaining state is built
     lazily by run ().  */
  vect_optimize_slp_pass (vec_info *vinfo) : m_vinfo (vinfo) {}
  /* Entry point: perform steps (1)-(7) described in the comment above.  */
  void run ();

private:
  /* Graph building.  */
  struct loop *containing_loop (slp_tree);
  bool is_cfg_latch_edge (graph_edge *);
  void build_vertices (hash_set<slp_tree> &, slp_tree);
  void build_vertices ();
  void build_graph ();

  /* Partitioning.  */
  void create_partitions ();
  template<typename T> void for_each_partition_edge (unsigned int, T);

  /* Layout selection.  */
  bool is_compatible_layout (slp_tree, unsigned int);
  bool is_compatible_layout (const slpg_partition_info &, unsigned int);
  int change_layout_cost (slp_tree, unsigned int, unsigned int);
  slpg_partition_layout_costs &partition_layout_costs (unsigned int,
						       unsigned int);
  void change_vec_perm_layout (slp_tree, lane_permutation_t &,
			       int, unsigned int);
  int internal_node_cost (slp_tree, int, unsigned int);
  void start_choosing_layouts ();
  bool legitimize ();

  /* Cost propagation.  */
  slpg_layout_cost edge_layout_cost (graph_edge *, unsigned int,
				     unsigned int, unsigned int);
  slpg_layout_cost total_in_cost (unsigned int);
  slpg_layout_cost forward_cost (graph_edge *, unsigned int, unsigned int);
  slpg_layout_cost backward_cost (graph_edge *, unsigned int, unsigned int);
  void forward_pass ();
  void backward_pass ();

  /* Rematerialization.  */
  slp_tree get_result_with_layout (slp_tree, unsigned int);
  void materialize ();

  /* Clean-up.  */
  void remove_redundant_permutations ();

  /* Masked load lanes discovery.  */
  void decide_masked_load_lanes ();

  void dump ();

  vec_info *m_vinfo;

  /* True if we should optimize the graph for size, false if we should
     optimize it for speed.  (It wouldn't be easy to make this decision
     more locally.)  */
  bool m_optimize_size;

  /* A graph of all SLP nodes, with edges leading from uses to definitions.
     In other words, a node's predecessors are its slp_tree parents and
     a node's successors are its slp_tree children.  */
  graph *m_slpg = nullptr;

  /* The vertices of M_SLPG, indexed by slp_tree::vertex.  */
  auto_vec<slpg_vertex> m_vertices;

  /* The list of all leaves of M_SLPG, such as external definitions, constants,
     and loads.  */
  auto_vec<int> m_leafs;

  /* This array has one entry for every vector layout that we're considering.
     Element 0 is null and indicates "no change".  Other entries describe
     permutations that are inherent in the current graph and that we would
     like to reverse if possible.

     For example, a permutation { 1, 2, 3, 0 } means that something has
     effectively been permuted in that way, such as a load group
     { a[1], a[2], a[3], a[0] } (viewed as a permutation of a[0:3]).
     We'd then like to apply the reverse permutation { 3, 0, 1, 2 }
     in order to put things "back" in order.  */
  auto_vec<vec<unsigned> > m_perms;

  /* A partitioning of the nodes for which a layout must be chosen.
     Each partition represents an <SCC, cfg loop> pair; that is,
     nodes in different SCCs belong to different partitions, and nodes
     within an SCC can be further partitioned according to a containing
     cfg loop.  Partition <SCC1, L1> comes before <SCC2, L2> if:

     - SCC1 != SCC2 and SCC1 is a predecessor of SCC2 in a forward walk
       from leaves (such as loads) to roots (such as stores).

     - SCC1 == SCC2 and L1's header strictly dominates L2's header.  */
  auto_vec<slpg_partition_info> m_partitions;

  /* The list of all nodes for which a layout must be chosen.  Nodes for
     partition P come before the nodes for partition P+1.  Nodes within a
     partition are in reverse postorder.  */
  auto_vec<unsigned int> m_partitioned_nodes;

  /* Index P * num-layouts + L contains the cost of using layout L
     for partition P.  */
  auto_vec<slpg_partition_layout_costs> m_partition_layout_costs;

  /* Index N * num-layouts + L, if nonnull, is a node that provides the
     original output of node N adjusted to have layout L.  */
  auto_vec<slp_tree> m_node_layouts;
};
6620 :
6621 : /* Fill the vertices and leafs vector with all nodes in the SLP graph.
6622 : Also record whether we should optimize anything for speed rather
6623 : than size. */
6624 :
6625 : void
6626 10690847 : vect_optimize_slp_pass::build_vertices (hash_set<slp_tree> &visited,
6627 : slp_tree node)
6628 : {
6629 10690847 : unsigned i;
6630 10690847 : slp_tree child;
6631 :
6632 10690847 : if (visited.add (node))
6633 10690847 : return;
6634 :
6635 9884527 : if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
6636 : {
6637 7783816 : basic_block bb = gimple_bb (vect_orig_stmt (rep)->stmt);
6638 6936210 : if (optimize_bb_for_speed_p (bb))
6639 6817800 : m_optimize_size = false;
6640 : }
6641 :
6642 9884527 : node->vertex = m_vertices.length ();
6643 9884527 : m_vertices.safe_push (slpg_vertex (node));
6644 :
6645 9884527 : bool leaf = true;
6646 9884527 : bool force_leaf = false;
6647 18524390 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6648 8639863 : if (child)
6649 : {
6650 7783545 : leaf = false;
6651 7783545 : build_vertices (visited, child);
6652 : }
6653 : else
6654 : force_leaf = true;
6655 : /* Since SLP discovery works along use-def edges all cycles have an
6656 : entry - but there's the exception of cycles where we do not handle
6657 : the entry explicitly (but with a NULL SLP node), like some reductions
6658 : and inductions. Force those SLP PHIs to act as leafs to make them
6659 : backwards reachable. */
6660 9884527 : if (leaf || force_leaf)
6661 4878512 : m_leafs.safe_push (node->vertex);
6662 : }
6663 :
6664 : /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
6665 :
6666 : void
6667 1352588 : vect_optimize_slp_pass::build_vertices ()
6668 : {
6669 1352588 : hash_set<slp_tree> visited;
6670 1352588 : unsigned i;
6671 1352588 : slp_instance instance;
6672 1352588 : m_vertices.truncate (0);
6673 1352588 : m_leafs.truncate (0);
6674 6965066 : FOR_EACH_VEC_ELT (m_vinfo->slp_instances, i, instance)
6675 2907302 : build_vertices (visited, SLP_INSTANCE_TREE (instance));
6676 1352588 : }
6677 :
/* Apply (reverse) bijective PERM to VEC.  */
6679 :
6680 : template <class T>
6681 : static void
6682 207113 : vect_slp_permute (vec<unsigned> perm,
6683 : vec<T> &vec, bool reverse)
6684 : {
6685 207113 : auto_vec<T, 64> saved;
6686 207113 : saved.create (vec.length ());
6687 672043 : for (unsigned i = 0; i < vec.length (); ++i)
6688 464930 : saved.quick_push (vec[i]);
6689 :
6690 207113 : if (reverse)
6691 : {
6692 1333885 : for (unsigned i = 0; i < vec.length (); ++i)
6693 463706 : vec[perm[i]] = saved[i];
6694 670275 : for (unsigned i = 0; i < vec.length (); ++i)
6695 820309 : gcc_assert (vec[perm[i]] == saved[i]);
6696 : }
6697 : else
6698 : {
6699 3536 : for (unsigned i = 0; i < vec.length (); ++i)
6700 1224 : vec[i] = saved[perm[i]];
6701 208337 : for (unsigned i = 0; i < vec.length (); ++i)
6702 1836 : gcc_assert (vec[i] == saved[perm[i]]);
6703 : }
6704 207113 : }
6705 :
6706 : /* Return the cfg loop that contains NODE. */
6707 :
6708 : struct loop *
6709 3868912 : vect_optimize_slp_pass::containing_loop (slp_tree node)
6710 : {
6711 3868912 : stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
6712 3868912 : if (!rep)
6713 5300 : return ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father;
6714 4300594 : return gimple_bb (vect_orig_stmt (rep)->stmt)->loop_father;
6715 : }
6716 :
6717 : /* Return true if UD (an edge from a use to a definition) is associated
6718 : with a loop latch edge in the cfg. */
6719 :
6720 : bool
6721 7783545 : vect_optimize_slp_pass::is_cfg_latch_edge (graph_edge *ud)
6722 : {
6723 7783545 : slp_tree use = m_vertices[ud->src].node;
6724 7783545 : slp_tree def = m_vertices[ud->dest].node;
6725 7783545 : if ((SLP_TREE_DEF_TYPE (use) != vect_internal_def
6726 7783545 : || SLP_TREE_PERMUTE_P (use))
6727 7473426 : || SLP_TREE_DEF_TYPE (def) != vect_internal_def)
6728 : return false;
6729 :
6730 4538132 : stmt_vec_info use_rep = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (use));
6731 4538132 : return (is_a<gphi *> (use_rep->stmt)
6732 372244 : && bb_loop_header_p (gimple_bb (use_rep->stmt))
6733 4748180 : && containing_loop (def) == containing_loop (use));
6734 : }
6735 :
6736 : /* Build the graph. Mark edges that correspond to cfg loop latch edges with
6737 : a nonnull data field. */
6738 :
6739 : void
6740 1352588 : vect_optimize_slp_pass::build_graph ()
6741 : {
6742 1352588 : m_optimize_size = true;
6743 1352588 : build_vertices ();
6744 :
6745 2705176 : m_slpg = new_graph (m_vertices.length ());
6746 13942291 : for (slpg_vertex &v : m_vertices)
6747 29573200 : for (slp_tree child : SLP_TREE_CHILDREN (v.node))
6748 8639863 : if (child)
6749 : {
6750 7783545 : graph_edge *ud = add_edge (m_slpg, v.node->vertex, child->vertex);
6751 7783545 : if (is_cfg_latch_edge (ud))
6752 201472 : ud->data = this;
6753 : }
6754 1352588 : }
6755 :
6756 : /* Return true if E corresponds to a loop latch edge in the cfg. */
6757 :
6758 : static bool
6759 3992236 : skip_cfg_latch_edges (graph_edge *e)
6760 : {
6761 3992236 : return e->data;
6762 : }
6763 :
6764 : /* Create the node partitions. */
6765 :
void
vect_optimize_slp_pass::create_partitions ()
{
  /* Calculate a postorder of the graph, ignoring edges that correspond
     to natural latch edges in the cfg.  Reading the vector from the end
     to the beginning gives the reverse postorder.  */
  auto_vec<int> initial_rpo;
  graphds_dfs (m_slpg, &m_leafs[0], m_leafs.length (), &initial_rpo,
	       false, NULL, skip_cfg_latch_edges);
  gcc_assert (initial_rpo.length () == m_vertices.length ());

  /* Calculate the strongly connected components of the graph.  */
  auto_vec<int> scc_grouping;
  unsigned int num_sccs = graphds_scc (m_slpg, NULL, NULL, &scc_grouping);

  /* Create a new index order in which all nodes from the same SCC are
     consecutive.  Use scc_pos to record the index of the first node in
     each SCC.  */
  auto_vec<unsigned int> scc_pos (num_sccs);
  int last_component = -1;
  unsigned int node_count = 0;
  for (unsigned int node_i : scc_grouping)
    {
      /* graphds_scc groups nodes by component, so a component change
	 marks the start of the next SCC.  */
      if (last_component != m_slpg->vertices[node_i].component)
	{
	  last_component = m_slpg->vertices[node_i].component;
	  gcc_assert (last_component == int (scc_pos.length ()));
	  scc_pos.quick_push (node_count);
	}
      node_count += 1;
    }
  gcc_assert (node_count == initial_rpo.length ()
	      && last_component + 1 == int (num_sccs));

  /* Use m_partitioned_nodes to group nodes into SCC order, with the nodes
     inside each SCC following the RPO we calculated above.  The fact that
     we ignored natural latch edges when calculating the RPO should ensure
     that, for natural loop nests:

     - the first node that we encounter in a cfg loop is the loop header phi
     - the loop header phis are in dominance order

     Arranging for this is an optimization (see below) rather than a
     correctness issue.  Unnatural loops with a tangled mess of backedges
     will still work correctly, but might give poorer results.

     Also update scc_pos so that it gives 1 + the index of the last node
     in the SCC.  */
  m_partitioned_nodes.safe_grow (node_count);
  for (unsigned int old_i = initial_rpo.length (); old_i-- > 0;)
    {
      unsigned int node_i = initial_rpo[old_i];
      unsigned int new_i = scc_pos[m_slpg->vertices[node_i].component]++;
      m_partitioned_nodes[new_i] = node_i;
    }

  /* When optimizing for speed, partition each SCC based on the containing
     cfg loop.  The order we constructed above should ensure that, for natural
     cfg loops, we'll create sub-SCC partitions for outer loops before
     the corresponding sub-SCC partitions for inner loops.  Similarly,
     when one sibling loop A dominates another sibling loop B, we should
     create a sub-SCC partition for A before a sub-SCC partition for B.

     As above, nothing depends for correctness on whether this achieves
     a natural nesting, but we should get better results when it does.  */
  m_partitions.reserve (m_vertices.length ());
  unsigned int next_partition_i = 0;
  hash_map<struct loop *, int> loop_partitions;
  unsigned int rpo_begin = 0;
  unsigned int num_partitioned_nodes = 0;
  for (unsigned int rpo_end : scc_pos)
    {
      loop_partitions.empty ();
      unsigned int partition_i = next_partition_i;
      for (unsigned int rpo_i = rpo_begin; rpo_i < rpo_end; ++rpo_i)
	{
	  /* Handle externals and constants optimistically throughout.
	     But treat existing vectors as fixed since we do not handle
	     permuting them.  */
	  unsigned int node_i = m_partitioned_nodes[rpo_i];
	  auto &vertex = m_vertices[node_i];
	  if ((SLP_TREE_DEF_TYPE (vertex.node) == vect_external_def
	       && !SLP_TREE_VEC_DEFS (vertex.node).exists ())
	      || SLP_TREE_DEF_TYPE (vertex.node) == vect_constant_def)
	    vertex.partition = -1;
	  else
	    {
	      bool existed;
	      if (m_optimize_size)
		/* When optimizing for size, create at most one partition
		   per SCC: the partition already exists iff one was created
		   earlier within this SCC.  */
		existed = next_partition_i > partition_i;
	      else
		{
		  struct loop *loop = containing_loop (vertex.node);
		  auto &entry = loop_partitions.get_or_insert (loop, &existed);
		  if (!existed)
		    entry = next_partition_i;
		  partition_i = entry;
		}
	      if (!existed)
		{
		  m_partitions.quick_push (slpg_partition_info ());
		  next_partition_i += 1;
		}
	      vertex.partition = partition_i;
	      num_partitioned_nodes += 1;
	      /* node_end temporarily counts the number of nodes in the
		 partition; the loop below converts it to an index.  */
	      m_partitions[partition_i].node_end += 1;
	    }
	}
      rpo_begin = rpo_end;
    }

  /* Assign ranges of consecutive node indices to each partition,
     in partition order.  Start with node_end being the same as
     node_begin so that the next loop can use it as a counter.  */
  unsigned int node_begin = 0;
  for (auto &partition : m_partitions)
    {
      partition.node_begin = node_begin;
      node_begin += partition.node_end;
      partition.node_end = partition.node_begin;
    }
  gcc_assert (node_begin == num_partitioned_nodes);

  /* Finally build the list of nodes in partition order.  */
  m_partitioned_nodes.truncate (num_partitioned_nodes);
  for (unsigned int node_i = 0; node_i < m_vertices.length (); ++node_i)
    {
      int partition_i = m_vertices[node_i].partition;
      if (partition_i >= 0)
	{
	  unsigned int order_i = m_partitions[partition_i].node_end++;
	  m_partitioned_nodes[order_i] = node_i;
	}
    }
}
6901 :
6902 : /* Look for edges from earlier partitions into node NODE_I and edges from
6903 : node NODE_I into later partitions. Call:
6904 :
6905 : FN (ud, other_node_i)
6906 :
6907 : for each such use-to-def edge ud, where other_node_i is the node at the
6908 : other end of the edge. */
6909 :
6910 : template<typename T>
6911 : void
6912 3913556 : vect_optimize_slp_pass::for_each_partition_edge (unsigned int node_i, T fn)
6913 : {
6914 3913556 : int partition_i = m_vertices[node_i].partition;
6915 3913556 : for (graph_edge *pred = m_slpg->vertices[node_i].pred;
6916 6792827 : pred; pred = pred->pred_next)
6917 : {
6918 2879271 : int src_partition_i = m_vertices[pred->src].partition;
6919 2879271 : if (src_partition_i >= 0 && src_partition_i != partition_i)
6920 2559336 : fn (pred, pred->src);
6921 : }
6922 3913556 : for (graph_edge *succ = m_slpg->vertices[node_i].succ;
6923 8415691 : succ; succ = succ->succ_next)
6924 : {
6925 4502135 : int dest_partition_i = m_vertices[succ->dest].partition;
6926 4502135 : if (dest_partition_i >= 0 && dest_partition_i != partition_i)
6927 2587243 : fn (succ, succ->dest);
6928 : }
6929 3913556 : }
6930 :
6931 : /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
6932 : that NODE would operate on. This test is independent of NODE's actual
6933 : operation. */
6934 :
6935 : bool
6936 1762004 : vect_optimize_slp_pass::is_compatible_layout (slp_tree node,
6937 : unsigned int layout_i)
6938 : {
6939 1762004 : if (layout_i == 0)
6940 : return true;
6941 :
6942 1007222 : if (SLP_TREE_LANES (node) != m_perms[layout_i].length ())
6943 14802 : return false;
6944 :
6945 : return true;
6946 : }
6947 :
6948 : /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
6949 : that NODE would operate on for each NODE in PARTITION.
6950 : This test is independent of NODE's actual operations. */
6951 :
6952 : bool
6953 17733 : vect_optimize_slp_pass::is_compatible_layout (const slpg_partition_info
6954 : &partition,
6955 : unsigned int layout_i)
6956 : {
6957 35738 : for (unsigned int order_i = partition.node_begin;
6958 35738 : order_i < partition.node_end; ++order_i)
6959 : {
6960 18071 : unsigned int node_i = m_partitioned_nodes[order_i];
6961 18071 : auto &vertex = m_vertices[node_i];
6962 :
6963 : /* The layout is incompatible if it is individually incompatible
6964 : with any node in the partition. */
6965 18071 : if (!is_compatible_layout (vertex.node, layout_i))
6966 : return false;
6967 : }
6968 : return true;
6969 : }
6970 :
/* Return the cost (in arbitrary units) of going from layout FROM_LAYOUT_I
6972 : to layout TO_LAYOUT_I for a node like NODE. Return -1 if either of the
6973 : layouts is incompatible with NODE or if the change is not possible for
6974 : some other reason.
6975 :
6976 : The properties taken from NODE include the number of lanes and the
6977 : vector type. The actual operation doesn't matter. */
6978 :
int
vect_optimize_slp_pass::change_layout_cost (slp_tree node,
					    unsigned int from_layout_i,
					    unsigned int to_layout_i)
{
  /* Both layouts must fit NODE's lane count.  */
  if (!is_compatible_layout (node, from_layout_i)
      || !is_compatible_layout (node, to_layout_i))
    return -1;

  /* No change, no cost.  */
  if (from_layout_i == to_layout_i)
    return 0;

  /* Model the change as a single-input VEC_PERM_EXPR on NODE and ask
     vectorizable_slp_permutation_1 how many vector permutations it
     would need.  Start from FROM_LAYOUT_I's lane order (or the identity
     for layout 0) ...  */
  auto_vec<slp_tree, 1> children (1);
  children.quick_push (node);
  auto_lane_permutation_t perm (SLP_TREE_LANES (node));
  if (from_layout_i > 0)
    for (unsigned int i : m_perms[from_layout_i])
      perm.quick_push ({ 0, i });
  else
    for (unsigned int i = 0; i < SLP_TREE_LANES (node); ++i)
      perm.quick_push ({ 0, i });
  /* ... then apply the reverse of TO_LAYOUT_I so the output has that
     layout.  */
  if (to_layout_i > 0)
    vect_slp_permute (m_perms[to_layout_i], perm, true);
  auto count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, perm,
					       children, false);
  if (count >= 0)
    return MAX (count, 1);

  /* ??? In principle we could try changing via layout 0, giving two
     layout changes rather than 1.  Doing that would require
     corresponding support in get_result_with_layout.  */
  return -1;
}
7012 :
7013 : /* Return the costs of assigning layout LAYOUT_I to partition PARTITION_I. */
7014 :
7015 : inline slpg_partition_layout_costs &
7016 1078769 : vect_optimize_slp_pass::partition_layout_costs (unsigned int partition_i,
7017 : unsigned int layout_i)
7018 : {
7019 2157538 : return m_partition_layout_costs[partition_i * m_perms.length () + layout_i];
7020 : }
7021 :
7022 : /* Change PERM in one of two ways:
7023 :
7024 : - if IN_LAYOUT_I < 0, accept input operand I in the layout that has been
7025 : chosen for child I of NODE.
7026 :
7027 : - if IN_LAYOUT >= 0, accept all inputs operands with that layout.
7028 :
7029 : In both cases, arrange for the output to have layout OUT_LAYOUT_I */
7030 :
void
vect_optimize_slp_pass::
change_vec_perm_layout (slp_tree node, lane_permutation_t &perm,
			int in_layout_i, unsigned int out_layout_i)
{
  /* Remap each input lane index according to the layout of the input
     that supplies it.  */
  for (auto &entry : perm)
    {
      int this_in_layout_i = in_layout_i;
      if (this_in_layout_i < 0)
	{
	  /* Use the layout chosen for the partition of the child that
	     provides this entry.  */
	  slp_tree in_node = SLP_TREE_CHILDREN (node)[entry.first];
	  unsigned int in_partition_i = m_vertices[in_node->vertex].partition;
	  /* Partition -1 is used for constants and externals (see
	     create_partitions); their lanes are left untouched.  */
	  if (in_partition_i == -1u)
	    continue;
	  this_in_layout_i = m_partitions[in_partition_i].layout;
	}
      if (this_in_layout_i > 0)
	entry.second = m_perms[this_in_layout_i][entry.second];
    }
  /* Reverse-apply OUT_LAYOUT_I so that the output ends up in that
     layout.  */
  if (out_layout_i > 0)
    vect_slp_permute (m_perms[out_layout_i], perm, true);
}
7053 :
7054 : /* Check whether the target allows NODE to be rearranged so that the node's
7055 : output has layout OUT_LAYOUT_I. Return the cost of the change if so,
7056 : in the same arbitrary units as for change_layout_cost. Return -1 otherwise.
7057 :
7058 : If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I < 0, also check whether
7059 : NODE can adapt to the layout changes that have (perhaps provisionally)
7060 : been chosen for NODE's children, so that no extra permutations are
7061 : needed on either the input or the output of NODE.
7062 :
7063 : If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I >= 0, instead assume
7064 : that all inputs will be forced into layout IN_LAYOUT_I beforehand.
7065 :
7066 : IN_LAYOUT_I has no meaning for other types of node.
7067 :
7068 : Keeping the node as-is is always valid. If the target doesn't appear
7069 : to support the node as-is, but might realistically support other layouts,
7070 : then layout 0 instead has the cost of a worst-case permutation. On the
7071 : one hand, this ensures that every node has at least one valid layout,
7072 : avoiding what would otherwise be an awkward special case. On the other,
7073 : it still encourages the pass to change an invalid pre-existing layout
7074 : choice into a valid one. */
7075 :
int
vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i,
					    unsigned int out_layout_i)
{
  /* Worst-case cost of a single permutation, used when the node cannot
     be handled as-is but might support other layouts.  */
  const int fallback_cost = 1;

  if (SLP_TREE_PERMUTE_P (node))
    {
      auto_lane_permutation_t tmp_perm;
      tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));

      /* Check that the child nodes support the chosen layout.  Checking
	 the first child is enough, since any second child would have the
	 same shape.  */
      auto first_child = SLP_TREE_CHILDREN (node)[0];
      if (in_layout_i > 0
	  && !is_compatible_layout (first_child, in_layout_i))
	return -1;

      /* Rewrite the permutation for the given input/output layouts and
	 ask how many vector permutations the result needs.  */
      change_vec_perm_layout (node, tmp_perm, in_layout_i, out_layout_i);
      int count = vectorizable_slp_permutation_1 (m_vinfo, nullptr,
						  node, tmp_perm,
						  SLP_TREE_CHILDREN (node),
						  false);
      if (count < 0)
	{
	  if (in_layout_i == 0 && out_layout_i == 0)
	    {
	      /* Use the fallback cost if the node could in principle support
		 some nonzero layout for both the inputs and the outputs.
		 Otherwise assume that the node will be rejected later
		 and rebuilt from scalars.  */
	      if (SLP_TREE_LANES (node) == SLP_TREE_LANES (first_child))
		return fallback_cost;
	      return 0;
	    }
	  return -1;
	}

      /* We currently have no way of telling whether the new layout is cheaper
	 or more expensive than the old one.  But at least in principle,
	 it should be worth making zero permutations (whole-vector shuffles)
	 cheaper than real permutations, in case the pass is able to remove
	 the latter.  */
      return count == 0 ? 0 : 1;
    }

  /* Loads with a load permutation: check whether the (re-laid-out)
     permuted load can be code-generated.  */
  stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
  if (rep
      && STMT_VINFO_DATA_REF (rep)
      && DR_IS_READ (STMT_VINFO_DATA_REF (rep))
      && SLP_TREE_LOAD_PERMUTATION (node).exists ())
    {
      auto_load_permutation_t tmp_perm;
      tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
      /* Reverse-apply OUT_LAYOUT_I so the load produces that layout.  */
      if (out_layout_i > 0)
	vect_slp_permute (m_perms[out_layout_i], tmp_perm, true);

      poly_uint64 vf = 1;
      if (auto loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo))
	vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
      unsigned int n_perms;
      /* Dry-run (cost-only) query; N_PERMS receives the number of
	 vector permutations needed.  */
      if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL,
					   nullptr, vf, true, false, &n_perms))
	{
	  /* Shadows the outer REP with the same value.  */
	  auto rep = SLP_TREE_REPRESENTATIVE (node);
	  if (out_layout_i == 0)
	    {
	      /* Use the fallback cost if the load is an N-to-N permutation.
		 Otherwise assume that the node will be rejected later
		 and rebuilt from scalars.  */
	      if (STMT_VINFO_GROUPED_ACCESS (rep)
		  && (DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (rep))
		      == SLP_TREE_LANES (node)))
		return fallback_cost;
	      return 0;
	    }
	  return -1;
	}

      /* See the comment above the corresponding VEC_PERM_EXPR handling.  */
      return n_perms == 0 ? 0 : 1;
    }

  /* Other nodes have no layout-dependent cost of their own.  */
  return 0;
}
7162 :
7163 : /* Decide which element layouts we should consider using. Calculate the
7164 : weights associated with inserting layout changes on partition edges.
7165 : Also mark partitions that cannot change layout, by setting their
7166 : layout to zero. */
7167 :
void
vect_optimize_slp_pass::start_choosing_layouts ()
{
  /* Used to assign unique permutation indices.  */
  using perm_hash = unbounded_hashmap_traits<
    vec_free_hash_base<int_hash_base<unsigned>>,
    int_hash<int, -1, -2>
  >;
  hash_map<vec<unsigned>, int, perm_hash> layout_ids;

  /* Layout 0 is "no change".  */
  m_perms.safe_push (vNULL);

  /* Create layouts from existing permutations.  */
  auto_load_permutation_t tmp_perm;
  for (unsigned int node_i : m_partitioned_nodes)
    {
      /* Leafs also double as entries to the reverse graph.  Allow the
	 layout of those to be changed.  */
      auto &vertex = m_vertices[node_i];
      auto &partition = m_partitions[vertex.partition];
      if (!m_slpg->vertices[node_i].succ)
	partition.layout = 0;

      /* Loads and VEC_PERM_EXPRs are the only things generating permutes.  */
      slp_tree node = vertex.node;
      stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
      slp_tree child;
      unsigned HOST_WIDE_INT imin, imax = 0;
      bool any_permute = false;
      tmp_perm.truncate (0);
      if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
	{
	  /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node
	     unpermuted, record a layout that reverses this permutation.

	     We would need more work to cope with loads that are internally
	     permuted and also have inputs (such as masks for
	     IFN_MASK_LOADs).  */
	  gcc_assert (partition.layout == 0 && !m_slpg->vertices[node_i].succ);
	  if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
	    {
	      partition.layout = -1;
	      continue;
	    }
	  dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
	  /* Start IMIN above any possible index so that the MIN loop
	     below computes the true minimum.  */
	  imin = DR_GROUP_SIZE (dr_stmt) + 1;
	  tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
	}
      else if (SLP_TREE_PERMUTE_P (node)
	       && SLP_TREE_CHILDREN (node).length () == 1
	       && (child = SLP_TREE_CHILDREN (node)[0])
	       && (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (child))
		   .is_constant (&imin)))
	{
	  /* If the child has the same vector size as this node,
	     reversing the permutation can make the permutation a no-op.
	     In other cases it can change a true permutation into a
	     full-vector extract.  */
	  tmp_perm.reserve (SLP_TREE_LANES (node));
	  for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
	    tmp_perm.quick_push (SLP_TREE_LANE_PERMUTATION (node)[j].second);
	}
      else
	continue;

      /* Compute the index span [IMIN, IMAX] of TMP_PERM and whether it
	 is anything other than a contiguous unpermuted run.  */
      for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
	{
	  unsigned idx = tmp_perm[j];
	  imin = MIN (imin, idx);
	  imax = MAX (imax, idx);
	  if (idx - tmp_perm[0] != j)
	    any_permute = true;
	}
      /* If the span doesn't match we'd disrupt VF computation, avoid
	 that for now.  */
      if (imax - imin + 1 != SLP_TREE_LANES (node))
	continue;
      /* If there's no permute no need to split one out.  In this case
	 we can consider turning a load into a permuted load, if that
	 turns out to be cheaper than alternatives.  */
      if (!any_permute)
	{
	  partition.layout = -1;
	  continue;
	}

      /* For now only handle true permutes, like
	 vect_attempt_slp_rearrange_stmts did.  This allows us to be lazy
	 when permuting constants and invariants keeping the permute
	 bijective.  */
      auto_sbitmap load_index (SLP_TREE_LANES (node));
      bitmap_clear (load_index);
      for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
	bitmap_set_bit (load_index, tmp_perm[j] - imin);
      unsigned j;
      for (j = 0; j < SLP_TREE_LANES (node); ++j)
	if (!bitmap_bit_p (load_index, j))
	  break;
      if (j != SLP_TREE_LANES (node))
	continue;

      /* Record the (rebased) permutation as a candidate layout.  */
      vec<unsigned> perm = vNULL;
      perm.safe_grow (SLP_TREE_LANES (node), true);
      for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
	perm[j] = tmp_perm[j] - imin;

      if (int (m_perms.length ()) >= param_vect_max_layout_candidates)
	{
	  /* Continue to use existing layouts, but don't add any more.  */
	  int *entry = layout_ids.get (perm);
	  partition.layout = entry ? *entry : 0;
	  perm.release ();
	}
      else
	{
	  bool existed;
	  int &layout_i = layout_ids.get_or_insert (perm, &existed);
	  if (existed)
	    perm.release ();
	  else
	    {
	      layout_i = m_perms.length ();
	      m_perms.safe_push (perm);
	    }
	  partition.layout = layout_i;
	}
    }

  /* Initially assume that every layout is possible and has zero cost
     in every partition.  */
  m_partition_layout_costs.safe_grow_cleared (m_partitions.length ()
					      * m_perms.length ());

  /* We have to mark outgoing permutations facing non-associating-reduction
     graph entries that are not represented as to be materialized.
     slp_inst_kind_bb_reduc currently only covers associatable reductions.  */
  for (slp_instance instance : m_vinfo->slp_instances)
    if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
      {
	unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
	m_partitions[m_vertices[node_i].partition].layout = 0;
      }
    else if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_reduc_chain)
      {
	stmt_vec_info stmt_info
	  = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (instance));
	vect_reduc_info reduc_info
	  = info_for_reduction (as_a <loop_vec_info> (m_vinfo),
				SLP_INSTANCE_TREE (instance));
	/* Fold-left reductions are performed in lane order, so their
	   layout must stay fixed.  */
	if (needs_fold_left_reduction_p (TREE_TYPE
					   (gimple_get_lhs (stmt_info->stmt)),
					 VECT_REDUC_INFO_CODE (reduc_info)))
	  {
	    unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
	    m_partitions[m_vertices[node_i].partition].layout = 0;
	  }
      }

  /* Check which layouts each node and partition can handle.  Calculate the
     weights associated with inserting layout changes on edges.  */
  for (unsigned int node_i : m_partitioned_nodes)
    {
      auto &vertex = m_vertices[node_i];
      auto &partition = m_partitions[vertex.partition];
      slp_tree node = vertex.node;

      if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
	{
	  vertex.weight = vect_slp_node_weight (node);

	  /* We do not handle stores with a permutation, so all
	     incoming permutations must have been materialized.

	     We also don't handle masked grouped loads, which lack a
	     permutation vector.  In this case the memory locations
	     form an implicit second input to the loads, on top of the
	     explicit mask input, and the memory input's layout cannot
	     be changed.

	     On the other hand, we do support permuting gather loads and
	     masked gather loads, where each scalar load is independent
	     of the others.  This can be useful if the address/index input
	     benefits from permutation.  */
	  if (STMT_VINFO_DATA_REF (rep)
	      && STMT_VINFO_GROUPED_ACCESS (rep)
	      && !SLP_TREE_LOAD_PERMUTATION (node).exists ())
	    partition.layout = 0;

	  /* We cannot change the layout of an operation that is
	     not independent on lanes.  Note this is an explicit
	     negative list since that's much shorter than the respective
	     positive one but it's critical to keep maintaining it.  */
	  if (is_gimple_call (STMT_VINFO_STMT (rep)))
	    switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
	      {
	      case CFN_COMPLEX_ADD_ROT90:
	      case CFN_COMPLEX_ADD_ROT270:
	      case CFN_COMPLEX_MUL:
	      case CFN_COMPLEX_MUL_CONJ:
	      case CFN_VEC_ADDSUB:
	      case CFN_VEC_FMADDSUB:
	      case CFN_VEC_FMSUBADD:
		partition.layout = 0;
	      default:;
	      }
	}

      auto process_edge = [&](graph_edge *ud, unsigned int other_node_i)
	{
	  auto &other_vertex = m_vertices[other_node_i];

	  /* Count the number of edges from earlier partitions and the number
	     of edges to later partitions.  */
	  if (other_vertex.partition < vertex.partition)
	    partition.in_degree += 1;
	  else
	    partition.out_degree += 1;

	  /* If the current node uses the result of OTHER_NODE_I, accumulate
	     the effects of that.  */
	  if (ud->src == int (node_i))
	    {
	      other_vertex.out_weight += vertex.weight;
	      other_vertex.out_degree += 1;
	    }
	};
      for_each_partition_edge (node_i, process_edge);
    }
}
7398 :
7399 : /* Return the incoming costs for node NODE_I, assuming that each input keeps
7400 : its current (provisional) choice of layout. The inputs do not necessarily
7401 : have the same layout as each other. */
7402 :
7403 : slpg_layout_cost
7404 3180 : vect_optimize_slp_pass::total_in_cost (unsigned int node_i)
7405 : {
7406 3180 : auto &vertex = m_vertices[node_i];
7407 3180 : slpg_layout_cost cost;
7408 11625 : auto add_cost = [&](graph_edge *, unsigned int other_node_i)
7409 : {
7410 8445 : auto &other_vertex = m_vertices[other_node_i];
7411 8445 : if (other_vertex.partition < vertex.partition)
7412 : {
7413 5352 : auto &other_partition = m_partitions[other_vertex.partition];
7414 10704 : auto &other_costs = partition_layout_costs (other_vertex.partition,
7415 5352 : other_partition.layout);
7416 5352 : slpg_layout_cost this_cost = other_costs.in_cost;
7417 5352 : this_cost.add_serial_cost (other_costs.internal_cost);
7418 5352 : this_cost.split (other_partition.out_degree);
7419 5352 : cost.add_parallel_cost (this_cost);
7420 : }
7421 11625 : };
7422 3180 : for_each_partition_edge (node_i, add_cost);
7423 3180 : return cost;
7424 : }
7425 :
7426 : /* Return the cost of switching between layout LAYOUT1_I (at node NODE1_I)
7427 : and layout LAYOUT2_I on cross-partition use-to-def edge UD. Return
7428 : slpg_layout_cost::impossible () if the change isn't possible. */
7429 :
slpg_layout_cost
vect_optimize_slp_pass::
edge_layout_cost (graph_edge *ud, unsigned int node1_i, unsigned int layout1_i,
		  unsigned int layout2_i)
{
  /* UD goes from use (src) to definition (dest); sort out which of the
     two layouts belongs to which end.  */
  auto &def_vertex = m_vertices[ud->dest];
  auto &use_vertex = m_vertices[ud->src];
  auto def_layout_i = ud->dest == int (node1_i) ? layout1_i : layout2_i;
  auto use_layout_i = ud->dest == int (node1_i) ? layout2_i : layout1_i;
  auto factor = change_layout_cost (def_vertex.node, def_layout_i,
				    use_layout_i);
  if (factor < 0)
    return slpg_layout_cost::impossible ();

  /* We have a choice of putting the layout change at the site of the
     definition or at the site of the use.  Prefer the former when
     optimizing for size or when the execution frequency of the
     definition is no greater than the combined execution frequencies of
     the uses.  When putting the layout change at the site of the definition,
     divvy up the cost among all consumers.  */
  if (m_optimize_size || def_vertex.weight <= def_vertex.out_weight)
    {
      slpg_layout_cost cost = { def_vertex.weight * factor, m_optimize_size };
      cost.split (def_vertex.out_degree);
      return cost;
    }
  return { use_vertex.weight * factor, m_optimize_size };
}
7458 :
/* UD represents a use-def link between FROM_NODE_I and a node in a later
   partition; FROM_NODE_I could be the definition node or the use node.
   The node at the other end of the link wants to use layout TO_LAYOUT_I.
   Return the cost of any necessary fix-ups on edge UD, or return
   slpg_layout_cost::impossible () if the change isn't possible.

   At this point, FROM_NODE_I's partition has chosen the cheapest
   layout based on the information available so far, but this choice
   is only provisional.  */

slpg_layout_cost
vect_optimize_slp_pass::forward_cost (graph_edge *ud, unsigned int from_node_i,
				      unsigned int to_layout_i)
{
  auto &from_vertex = m_vertices[from_node_i];
  unsigned int from_partition_i = from_vertex.partition;
  slpg_partition_info &from_partition = m_partitions[from_partition_i];
  gcc_assert (from_partition.layout >= 0);

  /* First calculate the cost on the assumption that FROM_PARTITION sticks
     with its current layout preference.  */
  slpg_layout_cost cost = slpg_layout_cost::impossible ();
  auto edge_cost = edge_layout_cost (ud, from_node_i,
				     from_partition.layout, to_layout_i);
  if (edge_cost.is_possible ())
    {
      /* COST = (incoming cost of FROM_PARTITION) + (its internal cost),
	 divided among its consumers, plus the layout change on UD.  */
      auto &from_costs = partition_layout_costs (from_partition_i,
						 from_partition.layout);
      cost = from_costs.in_cost;
      cost.add_serial_cost (from_costs.internal_cost);
      cost.split (from_partition.out_degree);
      cost.add_serial_cost (edge_cost);
    }
  else if (from_partition.layout == 0)
    /* We must allow the source partition to have layout 0 as a fallback,
       in case all other options turn out to be impossible.  COST is
       still impossible () at this point, so the caller will reject
       TO_LAYOUT_I for this edge.  */
    return cost;

  /* Take the minimum of that cost and the cost that applies if
     FROM_PARTITION instead switches to TO_LAYOUT_I.  */
  auto &direct_layout_costs = partition_layout_costs (from_partition_i,
						      to_layout_i);
  if (direct_layout_costs.is_possible ())
    {
      slpg_layout_cost direct_cost = direct_layout_costs.in_cost;
      direct_cost.add_serial_cost (direct_layout_costs.internal_cost);
      direct_cost.split (from_partition.out_degree);
      if (!cost.is_possible ()
	  || direct_cost.is_better_than (cost, m_optimize_size))
	cost = direct_cost;
    }

  return cost;
}
7513 :
7514 : /* UD represents a use-def link between TO_NODE_I and a node in an earlier
7515 : partition; TO_NODE_I could be the definition node or the use node.
7516 : The node at the other end of the link wants to use layout FROM_LAYOUT_I;
7517 : return the cost of any necessary fix-ups on edge UD, or
7518 : slpg_layout_cost::impossible () if the choice cannot be made.
7519 :
7520 : At this point, TO_NODE_I's partition has a fixed choice of layout. */
7521 :
7522 : slpg_layout_cost
7523 182752 : vect_optimize_slp_pass::backward_cost (graph_edge *ud, unsigned int to_node_i,
7524 : unsigned int from_layout_i)
7525 : {
7526 182752 : auto &to_vertex = m_vertices[to_node_i];
7527 182752 : unsigned int to_partition_i = to_vertex.partition;
7528 182752 : slpg_partition_info &to_partition = m_partitions[to_partition_i];
7529 182752 : gcc_assert (to_partition.layout >= 0);
7530 :
7531 : /* If TO_NODE_I is a VEC_PERM_EXPR consumer, see whether it can be
7532 : adjusted for this input having layout FROM_LAYOUT_I. Assume that
7533 : any other inputs keep their current choice of layout. */
7534 182752 : auto &to_costs = partition_layout_costs (to_partition_i,
7535 : to_partition.layout);
7536 182752 : if (ud->src == int (to_node_i)
7537 182598 : && SLP_TREE_PERMUTE_P (to_vertex.node))
7538 : {
7539 9498 : auto &from_partition = m_partitions[m_vertices[ud->dest].partition];
7540 9498 : auto old_layout = from_partition.layout;
7541 9498 : from_partition.layout = from_layout_i;
7542 18996 : int factor = internal_node_cost (to_vertex.node, -1,
7543 9498 : to_partition.layout);
7544 9498 : from_partition.layout = old_layout;
7545 9498 : if (factor >= 0)
7546 : {
7547 8872 : slpg_layout_cost cost = to_costs.out_cost;
7548 17744 : cost.add_serial_cost ({ to_vertex.weight * factor,
7549 8872 : m_optimize_size });
7550 8872 : cost.split (to_partition.in_degree);
7551 8872 : return cost;
7552 : }
7553 : }
7554 :
7555 : /* Compute the cost if we insert any necessary layout change on edge UD. */
7556 173880 : auto edge_cost = edge_layout_cost (ud, to_node_i,
7557 173880 : to_partition.layout, from_layout_i);
7558 173880 : if (edge_cost.is_possible ())
7559 : {
7560 173880 : slpg_layout_cost cost = to_costs.out_cost;
7561 173880 : cost.add_serial_cost (to_costs.internal_cost);
7562 173880 : cost.split (to_partition.in_degree);
7563 173880 : cost.add_serial_cost (edge_cost);
7564 173880 : return cost;
7565 : }
7566 :
7567 0 : return slpg_layout_cost::impossible ();
7568 : }
7569 :
/* Make a forward pass through the partitions, accumulating input costs.
   Make a tentative (provisional) choice of layout for each partition,
   ensuring that this choice still allows later partitions to keep
   their original layout.  */

void
vect_optimize_slp_pass::forward_pass ()
{
  for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
       ++partition_i)
    {
      auto &partition = m_partitions[partition_i];

      /* If the partition consists of a single VEC_PERM_EXPR, precompute
	 the incoming cost that would apply if every predecessor partition
	 keeps its current layout.  This is used within the loop below.  */
      slpg_layout_cost in_cost;
      slp_tree single_node = nullptr;
      if (partition.node_end == partition.node_begin + 1)
	{
	  unsigned int node_i = m_partitioned_nodes[partition.node_begin];
	  single_node = m_vertices[node_i].node;
	  if (SLP_TREE_PERMUTE_P (single_node))
	    in_cost = total_in_cost (node_i);
	}

      /* Go through the possible layouts.  Decide which ones are valid
	 for this partition and record which of the valid layouts has
	 the lowest cost.  */
      unsigned int min_layout_i = 0;
      slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
      for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
	{
	  auto &layout_costs = partition_layout_costs (partition_i, layout_i);
	  if (!layout_costs.is_possible ())
	    continue;

	  /* If the recorded layout is already 0 then the layout cannot
	     change.  */
	  if (partition.layout == 0 && layout_i != 0)
	    {
	      layout_costs.mark_impossible ();
	      continue;
	    }

	  /* IS_POSSIBLE tracks whether LAYOUT_I remains viable for the
	     whole partition; the lambda below may clear it.  */
	  bool is_possible = true;
	  for (unsigned int order_i = partition.node_begin;
	       order_i < partition.node_end; ++order_i)
	    {
	      unsigned int node_i = m_partitioned_nodes[order_i];
	      auto &vertex = m_vertices[node_i];

	      /* Reject the layout if it is individually incompatible
		 with any node in the partition.  */
	      if (!is_compatible_layout (vertex.node, layout_i))
		{
		  is_possible = false;
		  break;
		}

	      auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
		{
		  auto &other_vertex = m_vertices[other_node_i];
		  if (other_vertex.partition < vertex.partition)
		    {
		      /* Accumulate the incoming costs from earlier
			 partitions, plus the cost of any layout changes
			 on UD itself.  */
		      auto cost = forward_cost (ud, other_node_i, layout_i);
		      if (!cost.is_possible ())
			is_possible = false;
		      else
			layout_costs.in_cost.add_parallel_cost (cost);
		    }
		  else
		    /* Reject the layout if it would make layout 0 impossible
		       for later partitions.  This amounts to testing that the
		       target supports reversing the layout change on edges
		       to later partitions.

		       In principle, it might be possible to push a layout
		       change all the way down a graph, so that it never
		       needs to be reversed and so that the target doesn't
		       need to support the reverse operation.  But it would
		       be awkward to bail out if we hit a partition that
		       does not support the new layout, especially since
		       we are not dealing with a lattice.  */
		    is_possible &= edge_layout_cost (ud, other_node_i, 0,
						     layout_i).is_possible ();
		};
	      for_each_partition_edge (node_i, add_cost);

	      /* Accumulate the cost of using LAYOUT_I within NODE,
		 both for the inputs and the outputs.  */
	      int factor = internal_node_cost (vertex.node, layout_i,
					       layout_i);
	      if (factor < 0)
		{
		  is_possible = false;
		  break;
		}
	      else if (factor)
		layout_costs.internal_cost.add_serial_cost
		  ({ vertex.weight * factor, m_optimize_size });
	    }
	  if (!is_possible)
	    {
	      layout_costs.mark_impossible ();
	      continue;
	    }

	  /* Combine the incoming and partition-internal costs.  */
	  slpg_layout_cost combined_cost = layout_costs.in_cost;
	  combined_cost.add_serial_cost (layout_costs.internal_cost);

	  /* If this partition consists of a single VEC_PERM_EXPR, see
	     if the VEC_PERM_EXPR can be changed to support output layout
	     LAYOUT_I while keeping all the provisional choices of input
	     layout.  */
	  if (single_node && SLP_TREE_PERMUTE_P (single_node))
	    {
	      int factor = internal_node_cost (single_node, -1, layout_i);
	      if (factor >= 0)
		{
		  auto weight = m_vertices[single_node->vertex].weight;
		  slpg_layout_cost internal_cost
		    = { weight * factor, m_optimize_size };

		  /* Use the precomputed IN_COST (all predecessors keeping
		     their layouts) if that alternative is cheaper.  */
		  slpg_layout_cost alt_cost = in_cost;
		  alt_cost.add_serial_cost (internal_cost);
		  if (alt_cost.is_better_than (combined_cost, m_optimize_size))
		    {
		      combined_cost = alt_cost;
		      layout_costs.in_cost = in_cost;
		      layout_costs.internal_cost = internal_cost;
		    }
		}
	    }

	  /* Record the layout with the lowest cost.  Prefer layout 0 in
	     the event of a tie between it and another layout.  */
	  if (!min_layout_cost.is_possible ()
	      || combined_cost.is_better_than (min_layout_cost,
					       m_optimize_size))
	    {
	      min_layout_i = layout_i;
	      min_layout_cost = combined_cost;
	    }
	}

      /* This loop's handling of earlier partitions should ensure that
	 choosing the original layout for the current partition is no
	 less valid than it was in the original graph, even with the
	 provisional layout choices for those earlier partitions.  */
      gcc_assert (min_layout_cost.is_possible ());
      partition.layout = min_layout_i;
    }
}
7728 :
/* Make a backward pass through the partitions, accumulating output costs.
   Make a final choice of layout for each partition.  */

void
vect_optimize_slp_pass::backward_pass ()
{
  /* Visit partitions in reverse order so that all successor partitions
     already have their final layouts when we cost this one.  */
  for (unsigned int partition_i = m_partitions.length (); partition_i-- > 0;)
    {
      auto &partition = m_partitions[partition_i];

      unsigned int min_layout_i = 0;
      slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
      for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
	{
	  auto &layout_costs = partition_layout_costs (partition_i, layout_i);
	  if (!layout_costs.is_possible ())
	    continue;

	  /* Accumulate the costs from successor partitions.  */
	  bool is_possible = true;
	  for (unsigned int order_i = partition.node_begin;
	       order_i < partition.node_end; ++order_i)
	    {
	      unsigned int node_i = m_partitioned_nodes[order_i];
	      auto &vertex = m_vertices[node_i];
	      auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
		{
		  auto &other_vertex = m_vertices[other_node_i];
		  auto &other_partition = m_partitions[other_vertex.partition];
		  if (other_vertex.partition > vertex.partition)
		    {
		      /* Accumulate the incoming costs from later
			 partitions, plus the cost of any layout changes
			 on UD itself.  */
		      auto cost = backward_cost (ud, other_node_i, layout_i);
		      if (!cost.is_possible ())
			is_possible = false;
		      else
			layout_costs.out_cost.add_parallel_cost (cost);
		    }
		  else
		    /* Make sure that earlier partitions can (if necessary
		       or beneficial) keep the layout that they chose in
		       the forward pass.  This ensures that there is at
		       least one valid choice of layout.  */
		    is_possible &= edge_layout_cost (ud, other_node_i,
						     other_partition.layout,
						     layout_i).is_possible ();
		};
	      for_each_partition_edge (node_i, add_cost);
	    }
	  if (!is_possible)
	    {
	      layout_costs.mark_impossible ();
	      continue;
	    }

	  /* Locally combine the costs from the forward and backward passes.
	     (This combined cost is not passed on, since that would lead
	     to double counting.)  */
	  slpg_layout_cost combined_cost = layout_costs.in_cost;
	  combined_cost.add_serial_cost (layout_costs.internal_cost);
	  combined_cost.add_serial_cost (layout_costs.out_cost);

	  /* Record the layout with the lowest cost.  Prefer layout 0 in
	     the event of a tie between it and another layout.  */
	  if (!min_layout_cost.is_possible ()
	      || combined_cost.is_better_than (min_layout_cost,
					       m_optimize_size))
	    {
	      min_layout_i = layout_i;
	      min_layout_cost = combined_cost;
	    }
	}

      /* The forward pass guaranteed at least one viable layout.  */
      gcc_assert (min_layout_cost.is_possible ());
      partition.layout = min_layout_i;
    }
}
7808 :
/* Return a node that applies layout TO_LAYOUT_I to the original form of NODE.
   NODE already has the layout that was selected for its partition.
   Results are memoized in M_NODE_LAYOUTS, keyed by (vertex, layout).  */

slp_tree
vect_optimize_slp_pass::get_result_with_layout (slp_tree node,
						unsigned int to_layout_i)
{
  /* Reuse a previously-created result for this (node, layout) pair.  */
  unsigned int result_i = node->vertex * m_perms.length () + to_layout_i;
  slp_tree result = m_node_layouts[result_i];
  if (result)
    return result;

  if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
      || (SLP_TREE_DEF_TYPE (node) == vect_external_def
	  /* We can't permute vector defs in place.  */
	  && SLP_TREE_VEC_DEFS (node).is_empty ()))
    {
      /* If the vector is uniform or unchanged, there's nothing to do.  */
      if (to_layout_i == 0 || vect_slp_tree_uniform_p (node))
	result = node;
      else
	{
	  /* Otherwise permute a copy of the scalar operands; the
	     original NODE is left untouched.  */
	  auto scalar_ops = SLP_TREE_SCALAR_OPS (node).copy ();
	  result = vect_create_new_slp_node (scalar_ops);
	  vect_slp_permute (m_perms[to_layout_i], scalar_ops, true);
	}
    }
  else
    {
      unsigned int partition_i = m_vertices[node->vertex].partition;
      unsigned int from_layout_i = m_partitions[partition_i].layout;
      if (from_layout_i == to_layout_i)
	return node;

      /* If NODE is itself a VEC_PERM_EXPR, try to create a parallel
	 permutation instead of a serial one.  Leave the new permutation
	 in TMP_PERM on success.  */
      auto_lane_permutation_t tmp_perm;
      unsigned int num_inputs = 1;
      if (SLP_TREE_PERMUTE_P (node))
	{
	  tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
	  if (from_layout_i != 0)
	    vect_slp_permute (m_perms[from_layout_i], tmp_perm, false);
	  if (to_layout_i != 0)
	    vect_slp_permute (m_perms[to_layout_i], tmp_perm, true);
	  /* Only keep TMP_PERM if the target can implement it.  */
	  if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
					      tmp_perm,
					      SLP_TREE_CHILDREN (node),
					      false) >= 0)
	    num_inputs = SLP_TREE_CHILDREN (node).length ();
	  else
	    tmp_perm.truncate (0);
	}

      if (dump_enabled_p ())
	{
	  if (tmp_perm.length () > 0)
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "duplicating permutation node %p with"
			     " layout %d\n",
			     (void *) node, to_layout_i);
	  else
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "inserting permutation node in place of %p\n",
			     (void *) node);
	}

      unsigned int num_lanes = SLP_TREE_LANES (node);
      result = vect_create_new_slp_node (num_inputs, VEC_PERM_EXPR);
      if (SLP_TREE_SCALAR_STMTS (node).length ())
	{
	  /* Copy NODE's scalar statements and express them in the
	     requested layout.  */
	  auto &stmts = SLP_TREE_SCALAR_STMTS (result);
	  stmts.safe_splice (SLP_TREE_SCALAR_STMTS (node));
	  if (from_layout_i != 0)
	    vect_slp_permute (m_perms[from_layout_i], stmts, false);
	  if (to_layout_i != 0)
	    vect_slp_permute (m_perms[to_layout_i], stmts, true);
	}
      SLP_TREE_REPRESENTATIVE (result) = SLP_TREE_REPRESENTATIVE (node);
      SLP_TREE_LANES (result) = num_lanes;
      SLP_TREE_VECTYPE (result) = SLP_TREE_VECTYPE (node);
      result->vertex = -1;

      auto &lane_perm = SLP_TREE_LANE_PERMUTATION (result);
      if (tmp_perm.length ())
	{
	  /* Parallel case: RESULT replaces NODE's permutation and takes
	     over its children.  */
	  lane_perm.safe_splice (tmp_perm);
	  SLP_TREE_CHILDREN (result).safe_splice (SLP_TREE_CHILDREN (node));
	}
      else
	{
	  /* Serial case: RESULT is a layout-change permute with NODE as
	     its single child.  */
	  lane_perm.create (num_lanes);
	  for (unsigned j = 0; j < num_lanes; ++j)
	    lane_perm.quick_push ({ 0, j });
	  if (from_layout_i != 0)
	    vect_slp_permute (m_perms[from_layout_i], lane_perm, false);
	  if (to_layout_i != 0)
	    vect_slp_permute (m_perms[to_layout_i], lane_perm, true);
	  SLP_TREE_CHILDREN (result).safe_push (node);
	}
      for (slp_tree child : SLP_TREE_CHILDREN (result))
	child->refcnt++;
    }
  m_node_layouts[result_i] = result;
  return result;
}
7916 :
/* Apply the chosen vector layouts to the SLP graph.  */

void
vect_optimize_slp_pass::materialize ()
{
  /* We no longer need the costs, so avoid having two O(N * P) arrays
     live at the same time.  */
  m_partition_layout_costs.release ();
  m_node_layouts.safe_grow_cleared (m_vertices.length () * m_perms.length ());

  /* FULLY_FOLDED records VEC_PERM_EXPR nodes that absorbed their input
     layouts directly; those need no child rewriting below.  */
  auto_sbitmap fully_folded (m_vertices.length ());
  bitmap_clear (fully_folded);
  for (unsigned int node_i : m_partitioned_nodes)
    {
      auto &vertex = m_vertices[node_i];
      slp_tree node = vertex.node;
      int layout_i = m_partitions[vertex.partition].layout;
      gcc_assert (layout_i >= 0);

      /* Rearrange the scalar statements to match the chosen layout.  */
      if (layout_i > 0)
	vect_slp_permute (m_perms[layout_i],
			  SLP_TREE_SCALAR_STMTS (node), true);

      /* Update load and lane permutations.  */
      if (SLP_TREE_PERMUTE_P (node))
	{
	  /* First try to absorb the input vector layouts.  If that fails,
	     force the inputs to have layout LAYOUT_I too.  We checked that
	     that was possible before deciding to use nonzero output layouts.
	     (Note that at this stage we don't really have any guarantee that
	     the target supports the original VEC_PERM_EXPR.)  */
	  auto &perm = SLP_TREE_LANE_PERMUTATION (node);
	  auto_lane_permutation_t tmp_perm;
	  tmp_perm.safe_splice (perm);
	  change_vec_perm_layout (node, tmp_perm, -1, layout_i);
	  if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
					      tmp_perm,
					      SLP_TREE_CHILDREN (node),
					      false) >= 0)
	    {
	      if (dump_enabled_p ()
		  && !std::equal (tmp_perm.begin (), tmp_perm.end (),
				  perm.begin ()))
		dump_printf_loc (MSG_NOTE, vect_location,
				 "absorbing input layouts into %p\n",
				 (void *) node);
	      std::copy (tmp_perm.begin (), tmp_perm.end (), perm.begin ());
	      bitmap_set_bit (fully_folded, node_i);
	    }
	  else
	    {
	      /* Not MSG_MISSED because it would make no sense to users.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "failed to absorb input layouts into %p\n",
				 (void *) node);
	      change_vec_perm_layout (nullptr, perm, layout_i, layout_i);
	    }
	}
      else
	{
	  gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
	  auto &load_perm = SLP_TREE_LOAD_PERMUTATION (node);
	  if (layout_i > 0)
	    /* ???  When we handle non-bijective permutes the idea
	       is that we can force the load-permutation to be
	       { min, min + 1, min + 2, ... max }.  But then the
	       scalar defs might no longer match the lane content
	       which means wrong-code with live lane vectorization.
	       So we possibly have to have NULL entries for those.  */
	    vect_slp_permute (m_perms[layout_i], load_perm, true);
	}
    }

  /* Do this before any nodes disappear, since it involves a walk
     over the leaves.  */
  remove_redundant_permutations ();

  /* Replace each child with a correctly laid-out version.  */
  for (unsigned int node_i : m_partitioned_nodes)
    {
      /* Skip nodes that have already been handled above.  */
      if (bitmap_bit_p (fully_folded, node_i))
	continue;

      auto &vertex = m_vertices[node_i];
      int in_layout_i = m_partitions[vertex.partition].layout;
      gcc_assert (in_layout_i >= 0);

      unsigned j;
      slp_tree child;
      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (vertex.node), j, child)
	{
	  if (!child)
	    continue;

	  slp_tree new_child = get_result_with_layout (child, in_layout_i);
	  if (new_child != child)
	    {
	      /* Transfer ownership from CHILD to NEW_CHILD.  */
	      vect_free_slp_tree (child);
	      SLP_TREE_CHILDREN (vertex.node)[j] = new_child;
	      new_child->refcnt += 1;
	    }
	}
    }
}
8024 :
/* Elide load permutations that are not necessary.  Such permutations might
   be pre-existing, rather than created by the layout optimizations.  */

void
vect_optimize_slp_pass::remove_redundant_permutations ()
{
  for (unsigned int node_i : m_leafs)
    {
      slp_tree node = m_vertices[node_i].node;
      if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
	continue;

      /* In basic block vectorization we allow any subchain of an interleaving
	 chain.
	 FORNOW: not in loop SLP because of realignment complications.  */
      if (is_a <bb_vec_info> (m_vinfo))
	{
	  /* Check whether the loads form a contiguous subchain of the
	     interleaving chain: each load must be followed in the group
	     by the next load, with a gap of exactly 1.  */
	  bool subchain_p = true;
	  stmt_vec_info next_load_info = NULL;
	  stmt_vec_info load_info;
	  unsigned j;
	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
	    {
	      if (j != 0
		  && (next_load_info != load_info
		      || ! load_info
		      || DR_GROUP_GAP (load_info) != 1))
		{
		  subchain_p = false;
		  break;
		}
	      next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
	    }
	  if (subchain_p)
	    {
	      SLP_TREE_LOAD_PERMUTATION (node).release ();
	      continue;
	    }
	}
      else
	{
	  loop_vec_info loop_vinfo = as_a<loop_vec_info> (m_vinfo);
	  bool this_load_permuted = !vect_load_perm_consecutive_p (node, 0);
	  /* When this isn't a grouped access we know it's single element
	     and contiguous.  */
	  if (!STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (node)[0]))
	    {
	      if (!this_load_permuted
		  && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
		      || SLP_TREE_LANES (node) == 1))
		SLP_TREE_LOAD_PERMUTATION (node).release ();
	      continue;
	    }
	  stmt_vec_info first_stmt_info
	    = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
	  if (!this_load_permuted
	      /* The load requires permutation when unrolling exposes
		 a gap either because the group is larger than the SLP
		 group-size or because there is a gap between the groups.  */
	      && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
		  || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
		      && DR_GROUP_GAP (first_stmt_info) == 0)))
	    {
	      SLP_TREE_LOAD_PERMUTATION (node).release ();
	      continue;
	    }
	}
    }
}
8094 :
/* Print the partition graph and layout information to the dump file.  */

void
vect_optimize_slp_pass::dump ()
{
  dump_printf_loc (MSG_NOTE, vect_location,
		   "SLP optimize permutations:\n");
  /* Layout 0 is the identity layout, so start at 1.  */
  for (unsigned int layout_i = 1; layout_i < m_perms.length (); ++layout_i)
    {
      dump_printf_loc (MSG_NOTE, vect_location, " %d: { ", layout_i);
      const char *sep = "";
      for (unsigned int idx : m_perms[layout_i])
	{
	  dump_printf (MSG_NOTE, "%s%d", sep, idx);
	  sep = ", ";
	}
      dump_printf (MSG_NOTE, " }\n");
    }
  dump_printf_loc (MSG_NOTE, vect_location,
		   "SLP optimize partitions:\n");
  for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
       ++partition_i)
    {
      auto &partition = m_partitions[partition_i];
      dump_printf_loc (MSG_NOTE, vect_location, " -------------\n");
      dump_printf_loc (MSG_NOTE, vect_location,
		       " partition %d (layout %d):\n",
		       partition_i, partition.layout);
      dump_printf_loc (MSG_NOTE, vect_location, " nodes:\n");
      for (unsigned int order_i = partition.node_begin;
	   order_i < partition.node_end; ++order_i)
	{
	  auto &vertex = m_vertices[m_partitioned_nodes[order_i]];
	  dump_printf_loc (MSG_NOTE, vect_location, " - %p:\n",
			   (void *) vertex.node);
	  dump_printf_loc (MSG_NOTE, vect_location,
			   " weight: %f\n",
			   vertex.weight.to_double ());
	  if (vertex.out_degree)
	    dump_printf_loc (MSG_NOTE, vect_location,
			     " out weight: %f (degree %d)\n",
			     vertex.out_weight.to_double (),
			     vertex.out_degree);
	  if (SLP_TREE_PERMUTE_P (vertex.node))
	    dump_printf_loc (MSG_NOTE, vect_location,
			     " op: VEC_PERM_EXPR\n");
	  else if (auto rep = SLP_TREE_REPRESENTATIVE (vertex.node))
	    dump_printf_loc (MSG_NOTE, vect_location,
			     " op template: %G", rep->stmt);
	}
      dump_printf_loc (MSG_NOTE, vect_location, " edges:\n");
      for (unsigned int order_i = partition.node_begin;
	   order_i < partition.node_end; ++order_i)
	{
	  unsigned int node_i = m_partitioned_nodes[order_i];
	  auto &vertex = m_vertices[node_i];
	  /* Print each cross-partition edge once, oriented from the
	     earlier partition to the later one.  */
	  auto print_edge = [&](graph_edge *, unsigned int other_node_i)
	    {
	      auto &other_vertex = m_vertices[other_node_i];
	      if (other_vertex.partition < vertex.partition)
		dump_printf_loc (MSG_NOTE, vect_location,
				 " - %p [%d] --> %p\n",
				 (void *) other_vertex.node,
				 other_vertex.partition,
				 (void *) vertex.node);
	      else
		dump_printf_loc (MSG_NOTE, vect_location,
				 " - %p --> [%d] %p\n",
				 (void *) vertex.node,
				 other_vertex.partition,
				 (void *) other_vertex.node);
	    };
	  for_each_partition_edge (node_i, print_edge);
	}

      /* Show the in/internal/out cost breakdown for every layout that
	 is still possible for this partition; "(*)" marks the layout
	 that was chosen.  */
      for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
	{
	  auto &layout_costs = partition_layout_costs (partition_i, layout_i);
	  if (layout_costs.is_possible ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       " layout %d:%s\n", layout_i,
			       partition.layout == int (layout_i)
			       ? " (*)" : "");
	      slpg_layout_cost combined_cost = layout_costs.in_cost;
	      combined_cost.add_serial_cost (layout_costs.internal_cost);
	      combined_cost.add_serial_cost (layout_costs.out_cost);
#define TEMPLATE "{depth: %f, total: %f}"
	      dump_printf_loc (MSG_NOTE, vect_location,
			       " " TEMPLATE "\n",
			       layout_costs.in_cost.depth.to_double (),
			       layout_costs.in_cost.total.to_double ());
	      dump_printf_loc (MSG_NOTE, vect_location,
			       " + " TEMPLATE "\n",
			       layout_costs.internal_cost.depth.to_double (),
			       layout_costs.internal_cost.total.to_double ());
	      dump_printf_loc (MSG_NOTE, vect_location,
			       " + " TEMPLATE "\n",
			       layout_costs.out_cost.depth.to_double (),
			       layout_costs.out_cost.total.to_double ());
	      dump_printf_loc (MSG_NOTE, vect_location,
			       " = " TEMPLATE "\n",
			       combined_cost.depth.to_double (),
			       combined_cost.total.to_double ());
#undef TEMPLATE
	    }
	  else
	    dump_printf_loc (MSG_NOTE, vect_location,
			     " layout %d: rejected\n", layout_i);
	}
    }
}
8207 :
/* Masked load lanes discovery.  Look for grouped IFN_MASK_LOADs with a
   uniform mask whose consumers are all single-lane permutes, and mark
   them (and their consumers) to use load-lanes, eliding the splat
   VEC_PERM that represented the uniform mask.  */

void
vect_optimize_slp_pass::decide_masked_load_lanes ()
{
  for (auto v : m_vertices)
    {
      slp_tree node = v.node;
      if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
	  || SLP_TREE_PERMUTE_P (node))
	continue;
      stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
      if (! STMT_VINFO_GROUPED_ACCESS (stmt_info)
	  /* The mask has to be uniform.  */
	  || STMT_VINFO_SLP_VECT_ONLY (stmt_info)
	  || ! is_a <gcall *> (STMT_VINFO_STMT (stmt_info))
	  || ! gimple_call_internal_p (STMT_VINFO_STMT (stmt_info),
				       IFN_MASK_LOAD))
	continue;
      /* The target must support masked load-lanes for this group and
	 the access must be a simple positive-step, non-strided one.  */
      stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
      if (STMT_VINFO_STRIDED_P (stmt_info)
	  || compare_step_with_zero (m_vinfo, stmt_info) <= 0
	  || vect_load_lanes_supported (SLP_TREE_VECTYPE (node),
					DR_GROUP_SIZE (stmt_info),
					true) == IFN_LAST)
	continue;

      /* Uniform masks need to be suitably represented.  */
      slp_tree mask = SLP_TREE_CHILDREN (node)[0];
      if (!SLP_TREE_PERMUTE_P (mask)
	  || SLP_TREE_CHILDREN (mask).length () != 1)
	continue;
      /* All lanes of the permute must select lane 0 of input 0,
	 i.e. the permute is a splat.  */
      bool match = true;
      for (auto perm : SLP_TREE_LANE_PERMUTATION (mask))
	if (perm.first != 0 || perm.second != 0)
	  {
	    match = false;
	    break;
	  }
      if (!match)
	continue;

      /* Now see if the consumer side matches.  */
      for (graph_edge *pred = m_slpg->vertices[node->vertex].pred;
	   pred; pred = pred->pred_next)
	{
	  slp_tree pred_node = m_vertices[pred->src].node;
	  /* All consumers should be a permute with a single outgoing lane.  */
	  if (!SLP_TREE_PERMUTE_P (pred_node)
	      || SLP_TREE_LANES (pred_node) != 1)
	    {
	      match = false;
	      break;
	    }
	  gcc_assert (SLP_TREE_CHILDREN (pred_node).length () == 1);
	}
      if (!match)
	continue;
      /* Now we can mark the nodes as to use load lanes.  */
      node->ldst_lanes = true;
      for (graph_edge *pred = m_slpg->vertices[node->vertex].pred;
	   pred; pred = pred->pred_next)
	m_vertices[pred->src].node->ldst_lanes = true;
      /* The catch is we have to massage the mask.  We have arranged
	 analyzed uniform masks to be represented by a splat VEC_PERM
	 which we can now simply elide as we cannot easily re-do SLP
	 discovery here.  */
      slp_tree new_mask = SLP_TREE_CHILDREN (mask)[0];
      SLP_TREE_REF_COUNT (new_mask)++;
      SLP_TREE_CHILDREN (node)[0] = new_mask;
      vect_free_slp_tree (mask);
    }
}
8281 :
8282 : /* Perform legitimizing attempts. This is intended to improve the
8283 : situation when layout 0 is not valid which is a situation the cost
8284 : based propagation does not handle well.
8285 : Return true if further layout optimization is possible, false if
8286 : the layout configuration should be considered final. */
8287 :
bool
vect_optimize_slp_pass::legitimize ()
{
  /* Perform a very simple legitimizing attempt by attempting to choose
     a single layout for all partitions that will make all permutations
     a noop.  That should also be the optimal layout choice in case
     layout zero is legitimate.
     ??? Disconnected components of the SLP graph could have distinct
     single layouts. */
  int single_layout_i = -1;
  /* Index up to which partitions were visited before a layout
     preference was known; they are re-checked for compatibility in a
     second pass below.  */
  unsigned deferred_up_to = -1U;
  for (unsigned partition_i = 0; partition_i < m_partitions.length ();
       ++partition_i)
    {
      auto &partition = m_partitions[partition_i];
      if (single_layout_i == -1)
	{
	  single_layout_i = partition.layout;
	  deferred_up_to = partition_i;
	}
      else if (partition.layout == single_layout_i || partition.layout == -1)
	;
      else
	/* Conflicting layout preferences - give up on a single layout.  */
	single_layout_i = 0;
      /* Layout zero is what the cost based propagation handles; signal
	 further optimization is possible.  */
      if (single_layout_i == 0)
	return true;

      if (single_layout_i != -1
	  && !is_compatible_layout (partition, single_layout_i))
	return true;
    }

  if (single_layout_i <= 0)
    return true;

  /* Re-check the partitions skipped before the layout was known.  */
  for (unsigned partition_i = 0; partition_i < deferred_up_to; ++partition_i)
    if (!is_compatible_layout (m_partitions[partition_i],
			       single_layout_i))
      return true;

  /* Commit the single layout to all partitions; the configuration is
     final (return false).  */
  for (unsigned partition_i = 0; partition_i < m_partitions.length ();
       ++partition_i)
    {
      auto &partition = m_partitions[partition_i];
      partition.layout = single_layout_i;
    }

  return false;
}
8337 :
8338 : /* Main entry point for the SLP graph optimization pass. */
8339 :
8340 : void
8341 676294 : vect_optimize_slp_pass::run ()
8342 : {
8343 676294 : build_graph ();
8344 676294 : create_partitions ();
8345 676294 : start_choosing_layouts ();
8346 676294 : if (m_perms.length () > 1)
8347 : {
8348 10609 : if (legitimize ())
8349 : {
8350 5654 : forward_pass ();
8351 5654 : backward_pass ();
8352 : }
8353 10609 : if (dump_enabled_p ())
8354 674 : dump ();
8355 10609 : materialize ();
8356 42853 : while (!m_perms.is_empty ())
8357 21635 : m_perms.pop ().release ();
8358 : }
8359 : else
8360 665685 : remove_redundant_permutations ();
8361 676294 : free_graph (m_slpg);
8362 676294 : build_graph ();
8363 676294 : decide_masked_load_lanes ();
8364 676294 : free_graph (m_slpg);
8365 676294 : }
8366 :
8367 : /* Apply CSE to NODE and its children using BST_MAP. */
8368 :
static void
vect_cse_slp_nodes (scalar_stmts_to_slp_tree_map_t *bst_map, slp_tree& node)
{
  /* Whether we entered NODE into BST_MAP and have to fill in the
     leader slot after recursion.  */
  bool put_p = false;
  if (SLP_TREE_DEF_TYPE (node) == vect_internal_def
      /* Besides some VEC_PERM_EXPR, two-operator nodes also
	 lack scalar stmts and thus CSE doesn't work via bst_map.  Ideally
	 we'd have sth that works for all internal and external nodes. */
      && !SLP_TREE_SCALAR_STMTS (node).is_empty ())
    {
      slp_tree *leader = bst_map->get (SLP_TREE_SCALAR_STMTS (node));
      if (leader)
	{
	  /* We've visited this node already. */
	  if (!*leader || *leader == node)
	    return;

	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "re-using SLP tree %p for %p\n",
			     (void *)*leader, (void *)node);
	  /* Replace NODE in its parent by the leader, transferring a
	     reference from NODE to the leader.  */
	  vect_free_slp_tree (node);
	  (*leader)->refcnt += 1;
	  node = *leader;
	  return;
	}

      /* Avoid creating a cycle by populating the map only after recursion. */
      bst_map->put (SLP_TREE_SCALAR_STMTS (node).copy (), nullptr);
      node->refcnt += 1;
      put_p = true;
      /* And recurse. */
    }

  for (slp_tree &child : SLP_TREE_CHILDREN (node))
    if (child)
      vect_cse_slp_nodes (bst_map, child);

  /* Now record the node for CSE in other siblings. */
  if (put_p)
    *bst_map->get (SLP_TREE_SCALAR_STMTS (node)) = node;
}
8411 :
8412 : /* Optimize the SLP graph of VINFO. */
8413 :
8414 : void
8415 1020077 : vect_optimize_slp (vec_info *vinfo)
8416 : {
8417 1020077 : if (vinfo->slp_instances.is_empty ())
8418 : return;
8419 676294 : vect_optimize_slp_pass (vinfo).run ();
8420 :
8421 : /* Apply CSE again to nodes after permute optimization. */
8422 676294 : scalar_stmts_to_slp_tree_map_t *bst_map
8423 676294 : = new scalar_stmts_to_slp_tree_map_t ();
8424 :
8425 3482533 : for (auto inst : vinfo->slp_instances)
8426 1453651 : vect_cse_slp_nodes (bst_map, SLP_INSTANCE_TREE (inst));
8427 :
8428 676294 : release_scalar_stmts_to_slp_tree_map (bst_map);
8429 : }
8430 :
8431 : /* Gather loads reachable from the individual SLP graph entries. */
8432 :
8433 : void
8434 1020077 : vect_gather_slp_loads (vec_info *vinfo)
8435 : {
8436 1020077 : unsigned i;
8437 1020077 : slp_instance instance;
8438 2473728 : FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
8439 : {
8440 1453651 : hash_set<slp_tree> visited;
8441 1453651 : vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
8442 : SLP_INSTANCE_TREE (instance), visited);
8443 1453651 : }
8444 1020077 : }
8445 :
8446 : /* For NODE update VF based on the number of lanes and the vector types
8447 : used. */
8448 :
static void
vect_update_slp_vf_for_node (slp_tree node, poly_uint64 &vf,
			     hash_set<slp_tree> &visited)
{
  /* Only internal nodes carry max_nunits; constants and externals are
     accounted for via their internal users, see below.  */
  if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;
  /* Each node contributes to VF at most once.  */
  if (visited.add (node))
    return;

  for (slp_tree child : SLP_TREE_CHILDREN (node))
    vect_update_slp_vf_for_node (child, vf, visited);

  /* We do not visit SLP nodes for constants or externals - those neither
     have a vector type set yet (vectorizable_* does this) nor do they
     have max_nunits set.  Instead we rely on internal nodes max_nunit
     to cover constant/external operands.
     Note that when we stop using fixed size vectors externs and constants
     shouldn't influence the (minimum) vectorization factor, instead
     vectorizable_* should honor the vectorization factor when trying to
     assign vector types to constants and externals and cause iteration
     to a higher vectorization factor when required. */
  poly_uint64 node_vf
    = calculate_unrolling_factor (node->max_nunits, SLP_TREE_LANES (node));
  vf = force_common_multiple (vf, node_vf);

  /* For permute nodes that are fed from externs or constants we have to
     consider their number of lanes as well.  Likewise for store-lanes. */
  if (SLP_TREE_PERMUTE_P (node) || node->ldst_lanes)
    for (slp_tree child : SLP_TREE_CHILDREN (node))
      if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
	{
	  poly_uint64 child_vf
	    = calculate_unrolling_factor (node->max_nunits,
					  SLP_TREE_LANES (child));
	  vf = force_common_multiple (vf, child_vf);
	}
}
8487 : /* For each possible SLP instance decide whether to SLP it and calculate overall
8488 : unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
8489 : least one instance. */
8490 :
bool
vect_make_slp_decision (loop_vec_info loop_vinfo)
{
  unsigned int i;
  poly_uint64 unrolling_factor = 1;
  const vec<slp_instance> &slp_instances
    = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  slp_instance instance;
  int decided_to_slp = 0;

  DUMP_VECT_SCOPE ("vect_make_slp_decision");

  hash_set<slp_tree> visited;
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      slp_tree root = SLP_INSTANCE_TREE (instance);

      /* All unroll factors have the form:

	   GET_MODE_SIZE (vinfo->vector_mode) * X

	 for some rational X, so they must have a common multiple. */
      vect_update_slp_vf_for_node (root, unrolling_factor, visited);

      /* If all instances ended up with vector(1) T roots make sure to
	 not vectorize.  RVV for example relies on loop vectorization
	 when some instances are essentially kept scalar.  See PR121048. */
      if (SLP_TREE_VECTYPE (root)
	  && known_gt (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (root)), 1U))
	decided_to_slp++;
    }

  /* The loop vectorization factor is the common multiple of all
     per-node unrolling factors accumulated above.  */
  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = unrolling_factor;

  if (decided_to_slp && dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
		       "Decided to SLP %d instances. Unrolling factor ",
		       decided_to_slp);
      dump_dec (MSG_NOTE, unrolling_factor);
      dump_printf (MSG_NOTE, "\n");
    }

  return (decided_to_slp > 0);
}
8537 : /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
8538 :
_bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
  : vec_info (vec_info::bb, shared),
    roots (vNULL)
{
  /* The region we are operating on.  bbs[0] is the entry, excluding
     its PHI nodes.  In the future we might want to track an explicit
     entry edge to cover bbs[0] PHI nodes and have a region entry
     insert location. */
  bbs = _bbs.address ();
  nbbs = _bbs.length ();

  for (unsigned i = 0; i < nbbs; ++i)
    {
      /* PHIs of the entry block are not part of the region.  */
      if (i != 0)
	for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
	     gsi_next (&si))
	  {
	    gphi *phi = si.phi ();
	    /* UID zero marks the stmt as belonging to the region,
	       see the matching reset in the destructor.  */
	    gimple_set_uid (phi, 0);
	    add_stmt (phi);
	  }
      for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
	   !gsi_end_p (gsi); gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  gimple_set_uid (stmt, 0);
	  /* Debug stmts get a region UID but no stmt_vec_info.  */
	  if (is_gimple_debug (stmt))
	    continue;
	  add_stmt (stmt);
	}
    }
}
8572 :
8573 : /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
8574 : stmts in the basic block. */
8575 :
_bb_vec_info::~_bb_vec_info ()
{
  /* Reset region marker. */
  for (unsigned i = 0; i < nbbs; ++i)
    {
      /* Entry block PHIs were never marked, see the constructor.  */
      if (i != 0)
	for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
	     gsi_next (&si))
	  {
	    gphi *phi = si.phi ();
	    gimple_set_uid (phi, -1);
	  }
      for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
	   !gsi_end_p (gsi); gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  gimple_set_uid (stmt, -1);
	}
    }

  /* Release the vectors held by the recorded SLP roots.  */
  for (unsigned i = 0; i < roots.length (); ++i)
    {
      roots[i].stmts.release ();
      roots[i].roots.release ();
      roots[i].remain.release ();
    }
  roots.release ();
}
8604 :
8605 : /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
8606 : given then that child nodes have already been processed, and that
8607 : their def types currently match their SLP node's def type. */
8608 :
static bool
vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
				    slp_instance node_instance,
				    stmt_vector_for_cost *cost_vec)
{
  /* Handle purely internal nodes. */
  if (SLP_TREE_PERMUTE_P (node))
    {
      if (!vectorizable_slp_permutation (vinfo, NULL, node, cost_vec))
	return false;

      /* Live lanes of the permute additionally need to be handled by
	 vectorizable_live_operation.  */
      stmt_vec_info slp_stmt_info;
      unsigned int i;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
	{
	  if (slp_stmt_info
	      && STMT_VINFO_LIVE_P (slp_stmt_info)
	      && !vectorizable_live_operation (vinfo, slp_stmt_info, node,
					       node_instance, i,
					       false, cost_vec))
	    return false;
	}
      SLP_TREE_TYPE (node) = permute_info_type;
      return true;
    }

  /* All other internal nodes go through regular stmt analysis.  */
  return vect_analyze_stmt (vinfo, node, node_instance, cost_vec);
}
8637 :
/* qsort comparator sorting ints in ascending order.  Use a three-way
   comparison instead of plain subtraction: A - B overflows (undefined
   behavior) when the operands have opposite signs of large magnitude,
   e.g. INT_MIN and a positive value.  */

static int
sort_ints (const void *a_, const void *b_)
{
  int a = *(const int *)a_;
  int b = *(const int *)b_;
  return (a > b) - (a < b);
}
8645 :
8646 : /* Verify if we can externalize a set of internal defs. */
8647 :
static bool
vect_slp_can_convert_to_external (const vec<stmt_vec_info> &stmts)
{
  /* Constant generation uses get_later_stmt which can only handle
     defs from the same BB or a set of defs that can be ordered
     with a dominance query. */
  basic_block bb = NULL;
  bool all_same = true;
  auto_vec<int> bbs;
  bbs.reserve_exact (stmts.length ());
  for (stmt_vec_info stmt : stmts)
    {
      /* A gap in the lanes cannot be externalized.  */
      if (!stmt)
	return false;
      else if (!bb)
	bb = gimple_bb (stmt->stmt);
      else if (gimple_bb (stmt->stmt) != bb)
	all_same = false;
      bbs.quick_push (gimple_bb (stmt->stmt)->index);
    }
  /* All defs in the same BB can always be ordered.  */
  if (all_same)
    return true;

  /* Produce a vector of unique BB indexes for the defs. */
  bbs.qsort (sort_ints);
  unsigned i, j;
  for (i = 1, j = 1; i < bbs.length (); ++i)
    if (bbs[i] != bbs[j-1])
      bbs[j++] = bbs[i];
  gcc_assert (j >= 2);
  bbs.truncate (j);

  /* For exactly two BBs any dominance relation in either direction
     provides a total order.  */
  if (bbs.length () == 2)
    return (dominated_by_p (CDI_DOMINATORS,
			    BASIC_BLOCK_FOR_FN (cfun, bbs[0]),
			    BASIC_BLOCK_FOR_FN (cfun, bbs[1]))
	    || dominated_by_p (CDI_DOMINATORS,
			       BASIC_BLOCK_FOR_FN (cfun, bbs[1]),
			       BASIC_BLOCK_FOR_FN (cfun, bbs[0])));

  /* ??? For more than two BBs we can sort the vector and verify the
     result is a total order.  But we can't use vec::qsort with a
     compare function using a dominance query since there's no way to
     signal failure and any fallback for an unordered pair would
     fail qsort_chk later.
     For now simply hope that ordering after BB index provides the
     best candidate total order.  If required we can implement our
     own mergesort or export an entry without checking. */
  for (unsigned i = 1; i < bbs.length (); ++i)
    if (!dominated_by_p (CDI_DOMINATORS,
			 BASIC_BLOCK_FOR_FN (cfun, bbs[i]),
			 BASIC_BLOCK_FOR_FN (cfun, bbs[i-1])))
      return false;

  return true;
}
8704 :
8705 : /* Try to build NODE from scalars, returning true on success.
8706 : NODE_INSTANCE is the SLP instance that contains NODE. */
8707 :
static bool
vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
			      slp_instance node_instance)
{
  stmt_vec_info stmt_info;
  unsigned int i;

  /* Only BB vectorization non-root nodes with scalar stmts and an
     orderable set of defs can be externalized.  */
  if (!is_a <bb_vec_info> (vinfo)
      || node == SLP_INSTANCE_TREE (node_instance)
      || !SLP_TREE_SCALAR_STMTS (node).exists ()
      || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node))
      /* Force the mask use to be built from scalars instead. */
      || VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node))
      || !vect_slp_can_convert_to_external (SLP_TREE_SCALAR_STMTS (node)))
    return false;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "Building vector operands of %p from scalars instead\n",
		     (void *) node);

  /* Don't remove and free the child nodes here, since they could be
     referenced by other structures.  The analysis and scheduling phases
     (need to) ignore child nodes of anything that isn't vect_internal_def. */
  unsigned int group_size = SLP_TREE_LANES (node);
  SLP_TREE_DEF_TYPE (node) = vect_external_def;
  /* Invariants get their vector type from the uses. */
  SLP_TREE_VECTYPE (node) = NULL_TREE;
  SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
  SLP_TREE_LOAD_PERMUTATION (node).release ();
  /* Record the defs of the scalar stmts as the nodes scalar operands.  */
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
      SLP_TREE_SCALAR_OPS (node)[i] = lhs;
    }
  return true;
}
8745 :
8746 : /* Return true if all elements of the slice are the same. */
8747 : bool
8748 479014 : vect_scalar_ops_slice::all_same_p () const
8749 : {
8750 526602 : for (unsigned int i = 1; i < length; ++i)
8751 444462 : if (!operand_equal_p (op (0), op (i)))
8752 : return false;
8753 : return true;
8754 : }
8755 :
8756 : hashval_t
8757 404172 : vect_scalar_ops_slice_hash::hash (const value_type &s)
8758 : {
8759 404172 : hashval_t hash = 0;
8760 1555086 : for (unsigned i = 0; i < s.length; ++i)
8761 1150914 : hash = iterative_hash_expr (s.op (i), hash);
8762 404172 : return hash;
8763 : }
8764 :
8765 : bool
8766 221773 : vect_scalar_ops_slice_hash::equal (const value_type &s1,
8767 : const compare_type &s2)
8768 : {
8769 221773 : if (s1.length != s2.length)
8770 : return false;
8771 385983 : for (unsigned i = 0; i < s1.length; ++i)
8772 336033 : if (!operand_equal_p (s1.op (i), s2.op (i)))
8773 : return false;
8774 : return true;
8775 : }
8776 :
8777 : /* Compute the prologue cost for invariant or constant operands represented
8778 : by NODE. */
8779 :
static void
vect_prologue_cost_for_slp (vec_info *vinfo, slp_tree node,
			    stmt_vector_for_cost *cost_vec)
{
  /* There's a special case of an existing vector, that costs nothing. */
  if (SLP_TREE_SCALAR_OPS (node).length () == 0
      && !SLP_TREE_VEC_DEFS (node).is_empty ())
    return;
  /* Without looking at the actual initializer a vector of
     constants can be implemented as load from the constant pool.
     When all elements are the same we can use a splat. */
  tree vectype = SLP_TREE_VECTYPE (node);
  unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
  unsigned HOST_WIDE_INT const_nunits;
  unsigned nelt_limit;
  unsigned nvectors = vect_get_num_copies (vinfo, node);
  auto ops = &SLP_TREE_SCALAR_OPS (node);
  /* Start indexes into OPS of the vectors we will cost.  */
  auto_vec<unsigned int> starts (nvectors);
  if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
      && ! multiple_p (const_nunits, group_size))
    {
      nelt_limit = const_nunits;
      hash_set<vect_scalar_ops_slice_hash> vector_ops;
      /* Cost each distinct vector-sized slice of the scalar ops only
	 once; equal vectors are expected to be CSEd.  */
      for (unsigned int i = 0; i < nvectors; ++i)
	if (!vector_ops.add ({ ops, i * nelt_limit, nelt_limit }))
	  starts.quick_push (i * nelt_limit);
    }
  else
    {
      /* If either the vector has variable length or the vectors
	 are composed of repeated whole groups we only need to
	 cost construction once.  All vectors will be the same. */
      nelt_limit = group_size;
      starts.quick_push (0);
    }
  /* ??? We're just tracking whether vectors in a single node are the same.
     Ideally we'd do something more global. */
  bool passed = false;
  for (unsigned int start : starts)
    {
      vect_cost_for_stmt kind;
      if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
	kind = vector_load;
      else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
	kind = scalar_to_vec;
      else
	kind = vec_construct;
      /* The target cost hook has no idea which part of the SLP node
	 we are costing so avoid passing it down more than once.  Pass
	 it to the first vec_construct or scalar_to_vec part since for those
	 the x86 backend tries to account for GPR to XMM register moves. */
      record_stmt_cost (cost_vec, 1, kind, nullptr,
			(kind != vector_load && !passed) ? node : nullptr,
			vectype, 0, vect_prologue);
      if (kind != vector_load)
	passed = true;
    }
}
8838 :
8839 : /* Analyze statements contained in SLP tree NODE after recursively analyzing
8840 : the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
8841 :
8842 : Return true if the operations are supported. */
8843 :
static bool
vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
				  slp_instance node_instance,
				  hash_set<slp_tree> &visited_set,
				  vec<slp_tree> &visited_vec,
				  stmt_vector_for_cost *cost_vec)
{
  int i, j;
  slp_tree child;

  /* Assume we can code-generate all invariants. */
  if (!node
      || SLP_TREE_DEF_TYPE (node) == vect_constant_def
      || SLP_TREE_DEF_TYPE (node) == vect_external_def)
    return true;

  if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Failed cyclic SLP reference in %p\n", (void *) node);
      return false;
    }
  gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);

  /* If we already analyzed the exact same set of scalar stmts we're done.
     We share the generated vector stmts for those. */
  if (visited_set.add (node))
    return true;
  visited_vec.safe_push (node);

  bool res = true;
  /* Remember the visited/cost state so it can be rolled back when
     analysis of this subtree fails.  */
  unsigned visited_rec_start = visited_vec.length ();
  unsigned cost_vec_rec_start = cost_vec->length ();
  bool seen_non_constant_child = false;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    {
      res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
					      visited_set, visited_vec,
					      cost_vec);
      if (!res)
	break;
      if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
	seen_non_constant_child = true;
    }
  /* We're having difficulties scheduling nodes with just constant
     operands and no scalar stmts since we then cannot compute a stmt
     insertion place. */
  if (res
      && !seen_non_constant_child
      && SLP_TREE_SCALAR_STMTS (node).is_empty ())
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Cannot vectorize all-constant op node %p\n",
			 (void *) node);
      res = false;
    }

  if (res)
    res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
					      cost_vec);
  /* If analysis failed we have to pop all recursive visited nodes
     plus ourselves. */
  if (!res)
    {
      while (visited_vec.length () >= visited_rec_start)
	visited_set.remove (visited_vec.pop ());
      cost_vec->truncate (cost_vec_rec_start);
    }

  /* When the node can be vectorized cost invariant nodes it references.
     This is not done in DFS order to allow the referring node
     vectorizable_* calls to nail down the invariant nodes vector type
     and possibly unshare it if it needs a different vector type than
     other referrers. */
  if (res)
    FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
      if (child
	  && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
	      || SLP_TREE_DEF_TYPE (child) == vect_external_def)
	  /* Perform usual caching, note code-generation still
	     code-gens these nodes multiple times but we expect
	     to CSE them later. */
	  && !visited_set.add (child))
	{
	  visited_vec.safe_push (child);
	  /* ??? After auditing more code paths make a "default"
	     and push the vector type from NODE to all children
	     if it is not already set. */
	  /* Compute the number of vectors to be generated. */
	  tree vector_type = SLP_TREE_VECTYPE (child);
	  if (!vector_type)
	    {
	      /* Masked loads can have an undefined (default SSA definition)
		 else operand.  We do not need to cost it. */
	      vec<tree> ops = SLP_TREE_SCALAR_OPS (child);
	      if (SLP_TREE_TYPE (node) == load_vec_info_type
		  && ((ops.length ()
		       && TREE_CODE (ops[0]) == SSA_NAME
		       && SSA_NAME_IS_DEFAULT_DEF (ops[0])
		       && VAR_P (SSA_NAME_VAR (ops[0])))
		      || SLP_TREE_DEF_TYPE (child) == vect_constant_def))
		continue;

	      /* For shifts with a scalar argument we don't need
		 to cost or code-generate anything.
		 ??? Represent this more explicitly. */
	      gcc_assert (SLP_TREE_TYPE (node) == shift_vec_info_type
			  && j == 1);
	      continue;
	    }

	  /* And cost them. */
	  vect_prologue_cost_for_slp (vinfo, child, cost_vec);
	}

  /* If this node or any of its children can't be vectorized, try pruning
     the tree here rather than felling the whole thing. */
  if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
    {
      /* We'll need to revisit this for invariant costing and number
	 of vectorized stmt setting. */
      res = true;
    }

  return res;
}
8972 :
8973 : /* Mark lanes of NODE that are live outside of the basic-block vectorized
8974 : region and that can be vectorized using vectorizable_live_operation
8975 : with STMT_VINFO_LIVE_P. Not handled live operations will cause the
8976 : scalar code computing it to be retained. */
8977 :
static void
vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
			     slp_instance instance,
			     stmt_vector_for_cost *cost_vec,
			     hash_set<stmt_vec_info> &svisited,
			     hash_set<slp_tree> &visited)
{
  if (visited.add (node))
    return;

  unsigned i;
  stmt_vec_info stmt_info;
  /* Conservative insertion point for lane extracts, see below.  */
  stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      if (!stmt_info || svisited.contains (stmt_info))
	continue;
      stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
      if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
	  && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
	/* Only the pattern root stmt computes the original scalar value. */
	continue;
      if (!PURE_SLP_STMT (orig_stmt_info))
	/* Iff the stmt is not part of the vector coverage because it or
	   uses of it are used by SLP graph leafs as extern input there is
	   no point in trying to live code-generate from a vector stmt as
	   the scalar stmt will survive anyway. */
	continue;
      bool mark_visited = true;
      gimple *orig_stmt = orig_stmt_info->stmt;
      ssa_op_iter op_iter;
      def_operand_p def_p;
      FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
	{
	  /* We have to verify whether we can insert the lane extract
	     before all uses.  The following is a conservative approximation.
	     We cannot put this into vectorizable_live_operation because
	     iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
	     doesn't work.
	     Note that while the fact that we emit code for loads at the
	     first load should make this a non-problem leafs we construct
	     from scalars are vectorized after the last scalar def.
	     ??? If we'd actually compute the insert location during
	     analysis we could use sth less conservative than the last
	     scalar stmt in the node for the dominance check. */
	  /* ??? What remains is "live" uses in vector CTORs in the same
	     SLP graph which is where those uses can end up code-generated
	     right after their definition instead of close to their original
	     use.  But that would restrict us to code-generate lane-extracts
	     from the latest stmt in a node.  So we compensate for this
	     during code-generation, simply not replacing uses for those
	     hopefully rare cases. */
	  imm_use_iterator use_iter;
	  gimple *use_stmt;
	  stmt_vec_info use_stmt_info;

	  bool live_p = false;
	  bool can_insert = true;
	  /* A def is live when it has a non-debug use outside of the
	     pure-SLP covered stmts of this region.  */
	  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
	    if (!is_gimple_debug (use_stmt)
		&& (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
		    || !PURE_SLP_STMT (use_stmt_info)))
	      {
		live_p = true;
		if (!vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
		  {
		    if (dump_enabled_p ())
		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				       "Cannot determine insertion place for "
				       "lane extract\n");
		    can_insert = false;
		    break;
		  }
	      }
	  if (live_p && can_insert)
	    {
	      /* Only record a live stmt when we can replace all uses.  We
		 record from which SLP tree we vectorize the uses, so we'll
		 cost once and can deal with the case that not all SLP nodes
		 may be suitable for code-generation of all live uses.
		 ??? But we never split up the work between multiple SLP
		 nodes. */
	      STMT_VINFO_LIVE_P (stmt_info) = true;
	      if (!vectorizable_live_operation (bb_vinfo, stmt_info, node,
						instance, i, false, cost_vec))
		{
		  STMT_VINFO_LIVE_P (stmt_info) = false;
		  mark_visited = false;
		}
	    }
	}
      if (mark_visited)
	svisited.add (stmt_info);
    }

  /* Recurse to internal children.  */
  slp_tree child;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
      vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance, cost_vec,
				   svisited, visited);
}
9079 :
9080 : /* Traverse all slp instances of BB_VINFO, and mark lanes of every node that
9081 : are live outside of the basic-block vectorized region and that can be
9082 : vectorized using vectorizable_live_operation with STMT_VINFO_LIVE_P. */
9083 :
9084 : static void
9085 234604 : vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo)
9086 : {
9087 234604 : if (bb_vinfo->slp_instances.is_empty ())
9088 0 : return;
9089 :
9090 234604 : hash_set<slp_tree> visited;
9091 234604 : hash_set<stmt_vec_info> svisited;
9092 1380822 : for (slp_instance instance : bb_vinfo->slp_instances)
9093 : {
9094 677010 : if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9095 28847 : STMT_VINFO_LIVE_P (SLP_INSTANCE_ROOT_STMTS (instance)[0]) = true;
9096 677010 : vect_location = instance->location ();
9097 677010 : vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
9098 : instance, &instance->cost_vec,
9099 : svisited, visited);
9100 : }
9101 234604 : }
9102 :
9103 : /* Determine whether we can vectorize the reduction epilogue for INSTANCE. */
9104 :
9105 : static bool
9106 74335 : vectorizable_bb_reduc_epilogue (slp_instance instance,
9107 : stmt_vector_for_cost *cost_vec)
9108 : {
9109 74335 : gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
9110 74335 : enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
9111 74335 : if (reduc_code == MINUS_EXPR)
9112 0 : reduc_code = PLUS_EXPR;
9113 74335 : internal_fn reduc_fn;
9114 74335 : tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
9115 74335 : if (!vectype
9116 74323 : || !reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
9117 74323 : || reduc_fn == IFN_LAST
9118 74323 : || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
9119 109421 : || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
9120 35086 : TREE_TYPE (vectype)))
9121 : {
9122 49569 : if (dump_enabled_p ())
9123 277 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9124 : "not vectorized: basic block reduction epilogue "
9125 : "operation unsupported.\n");
9126 49569 : return false;
9127 : }
9128 :
9129 : /* There's no way to cost a horizontal vector reduction via REDUC_FN so
9130 : cost log2 vector operations plus shuffles and one extraction. */
9131 24766 : unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
9132 24766 : record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
9133 : vectype, 0, vect_body);
9134 24766 : record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
9135 : vectype, 0, vect_body);
9136 24766 : record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
9137 : vectype, 0, vect_body);
9138 :
9139 : /* Since we replace all stmts of a possibly longer scalar reduction
9140 : chain account for the extra scalar stmts for that. */
9141 24766 : if (!instance->remain_defs.is_empty ())
9142 20102 : record_stmt_cost (cost_vec, instance->remain_defs.length (), scalar_stmt,
9143 10051 : instance->root_stmts[0], 0, vect_body);
9144 : return true;
9145 : }
9146 :
9147 : /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
9148 : and recurse to children. */
9149 :
9150 : static void
9151 189392 : vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
9152 : hash_set<slp_tree> &visited)
9153 : {
9154 189392 : if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
9155 189392 : || visited.add (node))
9156 83380 : return;
9157 :
9158 : stmt_vec_info stmt;
9159 : unsigned i;
9160 359663 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
9161 253651 : if (stmt)
9162 258974 : roots.remove (vect_orig_stmt (stmt));
9163 :
9164 : slp_tree child;
9165 234476 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9166 128464 : if (child)
9167 127074 : vect_slp_prune_covered_roots (child, roots, visited);
9168 : }
9169 :
9170 : /* Analyze statements in SLP instances of VINFO. Return true if the
9171 : operations are supported. */
9172 :
bool
vect_slp_analyze_operations (vec_info *vinfo)
{
  slp_instance instance;
  int i;

  DUMP_VECT_SCOPE ("vect_slp_analyze_operations");

  /* Nodes are shared between instances; VISITED ensures each is
     analyzed only once.  Note the loop index I is only advanced when
     the current instance is kept — removal shifts the next instance
     into slot I.  */
  hash_set<slp_tree> visited;
  for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
    {
      /* Nodes newly visited for this instance; used to roll back the
	 analysis state if the instance turns out unsupported.  */
      auto_vec<slp_tree> visited_vec;
      stmt_vector_for_cost cost_vec;
      cost_vec.create (2);
      if (is_a <bb_vec_info> (vinfo))
	vect_location = instance->location ();
      if (!vect_slp_analyze_node_operations (vinfo,
					     SLP_INSTANCE_TREE (instance),
					     instance, visited, visited_vec,
					     &cost_vec)
	  /* CTOR instances require vectorized defs for the SLP tree root.  */
	  || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
	      && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
		  != vect_internal_def
		  /* Make sure we vectorized with the expected type.  */
		  || !useless_type_conversion_p
			(TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
					(instance->root_stmts[0]->stmt))),
			 TREE_TYPE (SLP_TREE_VECTYPE
					(SLP_INSTANCE_TREE (instance))))))
	  /* Check we can vectorize the reduction.  */
	  || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
	      && !vectorizable_bb_reduc_epilogue (instance, &cost_vec))
	  /* Check we can vectorize the gcond.  */
	  || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_gcond
	      && !vectorizable_early_exit (as_a <loop_vec_info> (vinfo),
					   SLP_INSTANCE_ROOT_STMTS (instance)[0],
					   NULL,
					   SLP_INSTANCE_TREE (instance),
					   &cost_vec)))
	{
	  /* The instance is unsupported.  Pick a representative stmt
	     for diagnostics: prefer a root stmt, then the first scalar
	     stmt, falling back to the node representative.  */
	  cost_vec.release ();
	  slp_tree node = SLP_INSTANCE_TREE (instance);
	  stmt_vec_info stmt_info;
	  if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
	    stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
	  else if (!SLP_TREE_SCALAR_STMTS (node).is_empty ()
		   && SLP_TREE_SCALAR_STMTS (node)[0])
	    stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
	  else
	    stmt_info = SLP_TREE_REPRESENTATIVE (node);
	  /* For loop vectorization a single unsupported instance fails
	     the whole analysis; for BB vectorization we just drop it.  */
	  if (is_a <loop_vec_info> (vinfo))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "unsupported SLP instance starting from: %G",
				 stmt_info->stmt);
	      return false;
	    }
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "removing SLP instance operations starting from: %G",
			     stmt_info->stmt);
	  /* Roll back analysis state of all nodes first visited for
	     this instance so other instances can re-analyze them.  */
	  while (!visited_vec.is_empty ())
	    {
	      slp_tree node = visited_vec.pop ();
	      SLP_TREE_TYPE (node) = undef_vec_info_type;
	      if (node->data)
		{
		  delete node->data;
		  node->data = nullptr;
		}
	      visited.remove (node);
	    }
	  vect_free_slp_instance (instance);
	  vinfo->slp_instances.ordered_remove (i);
	}
      else
	{
	  i++;
	  if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
	    {
	      add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
	      cost_vec.release ();
	    }
	  else
	    /* For BB vectorization remember the SLP graph entry
	       cost for later.  */
	    instance->cost_vec = cost_vec;
	}
    }

  /* Now look for SLP instances with a root that are covered by other
     instances and remove them.  */
  hash_set<stmt_vec_info> roots;
  for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
    if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
      roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
  if (!roots.is_empty ())
    {
      /* Remove from ROOTS all root stmts covered as a lane of some
	 instance; what remains are the roots to keep.  */
      visited.empty ();
      for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
	vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
				      visited);
      for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
	if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
	    && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
	  {
	    stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
	    if (dump_enabled_p ())
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "removing SLP instance operations starting "
			       "from: %G", root->stmt);
	    vect_free_slp_instance (instance);
	    vinfo->slp_instances.ordered_remove (i);
	  }
	else
	  ++i;
    }

  /* Analysis succeeded if any instance survived.  */
  return !vinfo->slp_instances.is_empty ();
}
9295 :
9296 : /* Get the SLP instance leader from INSTANCE_LEADER thereby transitively
9297 : closing the eventual chain. */
9298 :
9299 : static slp_instance
9300 742532 : get_ultimate_leader (slp_instance instance,
9301 : hash_map<slp_instance, slp_instance> &instance_leader)
9302 : {
9303 742532 : auto_vec<slp_instance *, 8> chain;
9304 742532 : slp_instance *tem;
9305 820011 : while (*(tem = instance_leader.get (instance)) != instance)
9306 : {
9307 77479 : chain.safe_push (tem);
9308 77479 : instance = *tem;
9309 : }
9310 820011 : while (!chain.is_empty ())
9311 77479 : *chain.pop () = instance;
9312 742532 : return instance;
9313 742532 : }
9314 :
9315 : namespace {
9316 : /* Subroutine of vect_bb_partition_graph_r. Map KEY to INSTANCE in
9317 : KEY_TO_INSTANCE, making INSTANCE the leader of any previous mapping
9318 : for KEY. Return true if KEY was already in KEY_TO_INSTANCE.
9319 :
9320 : INSTANCE_LEADER is as for get_ultimate_leader. */
9321 :
9322 : template<typename T>
9323 : bool
9324 3288225 : vect_map_to_instance (slp_instance instance, T key,
9325 : hash_map<T, slp_instance> &key_to_instance,
9326 : hash_map<slp_instance, slp_instance> &instance_leader)
9327 : {
9328 : bool existed_p;
9329 3288225 : slp_instance &key_instance = key_to_instance.get_or_insert (key, &existed_p);
9330 3288225 : if (!existed_p)
9331 : ;
9332 174586 : else if (key_instance != instance)
9333 : {
9334 : /* If we're running into a previously marked key make us the
9335 : leader of the current ultimate leader. This keeps the
9336 : leader chain acyclic and works even when the current instance
9337 : connects two previously independent graph parts. */
9338 65522 : slp_instance key_leader
9339 65522 : = get_ultimate_leader (key_instance, instance_leader);
9340 65522 : if (key_leader != instance)
9341 19457 : instance_leader.put (key_leader, instance);
9342 : }
9343 3288225 : key_instance = instance;
9344 3288225 : return existed_p;
9345 : }
9346 : }
9347 :
9348 : /* Worker of vect_bb_partition_graph, recurse on NODE. */
9349 :
9350 : static void
9351 909711 : vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
9352 : slp_instance instance, slp_tree node,
9353 : hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
9354 : hash_map<slp_tree, slp_instance> &node_to_instance,
9355 : hash_map<slp_instance, slp_instance> &instance_leader)
9356 : {
9357 909711 : stmt_vec_info stmt_info;
9358 909711 : unsigned i;
9359 :
9360 3288225 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
9361 2378514 : if (stmt_info)
9362 2378514 : vect_map_to_instance (instance, stmt_info, stmt_to_instance,
9363 : instance_leader);
9364 :
9365 909711 : if (vect_map_to_instance (instance, node, node_to_instance,
9366 : instance_leader))
9367 909711 : return;
9368 :
9369 : slp_tree child;
9370 1745829 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9371 877809 : if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
9372 232701 : vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
9373 : node_to_instance, instance_leader);
9374 : }
9375 :
9376 : /* Partition the SLP graph into pieces that can be costed independently. */
9377 :
9378 : static void
9379 234604 : vect_bb_partition_graph (bb_vec_info bb_vinfo)
9380 : {
9381 234604 : DUMP_VECT_SCOPE ("vect_bb_partition_graph");
9382 :
9383 : /* First walk the SLP graph assigning each involved scalar stmt a
9384 : corresponding SLP graph entry and upon visiting a previously
9385 : marked stmt, make the stmts leader the current SLP graph entry. */
9386 234604 : hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
9387 234604 : hash_map<slp_tree, slp_instance> node_to_instance;
9388 234604 : hash_map<slp_instance, slp_instance> instance_leader;
9389 234604 : slp_instance instance;
9390 911614 : for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
9391 : {
9392 677010 : instance_leader.put (instance, instance);
9393 677010 : vect_bb_partition_graph_r (bb_vinfo,
9394 : instance, SLP_INSTANCE_TREE (instance),
9395 : stmt_to_instance, node_to_instance,
9396 : instance_leader);
9397 : }
9398 :
9399 : /* Then collect entries to each independent subgraph. */
9400 1146218 : for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
9401 : {
9402 677010 : slp_instance leader = get_ultimate_leader (instance, instance_leader);
9403 677010 : leader->subgraph_entries.safe_push (instance);
9404 677010 : if (dump_enabled_p ()
9405 677010 : && leader != instance)
9406 69 : dump_printf_loc (MSG_NOTE, vect_location,
9407 : "instance %p is leader of %p\n",
9408 : (void *) leader, (void *) instance);
9409 : }
9410 234604 : }
9411 :
9412 : /* Compute the scalar cost of the SLP node NODE and its children
9413 : and return it. Do not account defs that are marked in LIFE and
9414 : update LIFE according to uses of NODE. */
9415 :
static void
vect_bb_slp_scalar_cost (bb_vec_info vinfo,
			 vec<stmt_vec_info> &worklist,
			 stmt_vector_for_cost *cost_vec,
			 hash_set<stmt_vec_info> &visited)
{
  /* DFS over the scalar use-def graph seeded by WORKLIST, recording
     into COST_VEC the cost of each scalar stmt that vectorization
     would remove.  VISITED guards against processing a def twice.  */
  while (!worklist.is_empty ())
    {
      stmt_vec_info stmt = worklist.pop ();
      /* Stmts not fully covered by the SLP graph stay scalar and are
	 not a saving.  */
      if (!PURE_SLP_STMT (stmt))
	continue;

      /* When the stmt is live but not actually vectorized we have
	 to keep the feeding scalar defs.  */
      if (!STMT_VINFO_LIVE_P (vect_stmt_to_vectorize (stmt)))
	{
	  /* Check whether any def of STMT has a use outside of the
	     pure-SLP covered stmts; such a stmt has to stay.  */
	  bool live_p = false;
	  ssa_op_iter op_iter;
	  def_operand_p def_p;
	  FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt->stmt, op_iter, SSA_OP_DEF)
	    {
	      imm_use_iterator use_iter;
	      gimple *use_stmt;
	      FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
		if (!is_gimple_debug (use_stmt))
		  {
		    stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
		    if (!use_stmt_info || !PURE_SLP_STMT (use_stmt_info))
		      {
			if (dump_enabled_p ())
			  {
			    dump_printf_loc (MSG_NOTE, vect_location,
					     "stmt considered live: %G",
					     stmt->stmt);
			    dump_printf_loc (MSG_NOTE, vect_location,
					     "because of use in: %G",
					     use_stmt);
			  }
			live_p = true;
		      }
		  }
	    }
	  /* A live stmt stays scalar, so it is not a saving — but we
	     still walk its operands below?  No: skip it entirely.  */
	  if (live_p)
	    continue;
	}

      /* The following assert verifies that vect_bb_partition_graph
	 partitions the SLP graph in a way that each scalar stmt of
	 the coverage of the SLP graph belongs to exactly one subgraph.
	 ??? This is currently not guaranteed since the function
	 works purely on SLP_TREE_SCALAR_STMTS, resulting in the assert
	 tripping or scalar stmts costed multiple times, making vectorization
	 more profitable than it really is.  */
      /* gcc_checking_assert (!gimple_visited_p (stmt->stmt)); */

      /* No-op conversions cost nothing on the scalar side.  */
      if (vect_nop_conversion_p (stmt))
	;
      /* For single-argument PHIs assume coalescing which means zero
	 cost for the scalar and the vector PHIs.  This avoids
	 artificially favoring the vector path (but may pessimize it
	 in some cases).  */
      else if (is_a <gphi *> (stmt->stmt)
	       && gimple_phi_num_args (as_a <gphi *> (stmt->stmt)) == 1)
	;
      else
	{
	  vect_cost_for_stmt kind;
	  if (STMT_VINFO_DATA_REF (stmt))
	    {
	      data_reference_p dr = STMT_VINFO_DATA_REF (stmt);
	      tree base = get_base_address (DR_REF (dr));
	      /* When the scalar access is to a non-global not
		 address-taken decl that is not BLKmode assume we can
		 access it with a single non-load/store instruction.  */
	      if (DECL_P (base)
		  && !is_global_var (base)
		  && !TREE_ADDRESSABLE (base)
		  && DECL_MODE (base) != BLKmode)
		kind = scalar_stmt;
	      else if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt)))
		kind = scalar_load;
	      else
		kind = scalar_store;
	    }
	  else
	    kind = scalar_stmt;
	  /* Cost each scalar stmt only once.  */
	  gimple_set_visited (stmt->stmt, true);
	  record_stmt_cost (cost_vec, 1, kind, stmt, NULL_TREE, 0, vect_body);
	}

      /* Now walk relevant parts of the SSA use-def graph.  */
      slp_oprnds child_ops (stmt);
      for (unsigned i = 0; i < child_ops.num_slp_children; ++i)
	{
	  tree op = child_ops.get_op_for_slp_child (stmt, i);
	  stmt_vec_info def = vinfo->lookup_def (op);
	  if (def && !visited.add (def))
	    worklist.safe_push (def);
	}
    }
}
9518 :
9519 :
9520 : /* Comparator for the loop-index sorted cost vectors. */
9521 :
9522 : static int
9523 17328373 : li_cost_vec_cmp (const void *a_, const void *b_)
9524 : {
9525 17328373 : auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
9526 17328373 : auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
9527 17328373 : if (a->first < b->first)
9528 : return -1;
9529 16588493 : else if (a->first == b->first)
9530 15947238 : return 0;
9531 : return 1;
9532 : }
9533 :
9534 : /* Check if vectorization of the basic block is profitable for the
9535 : subgraph denoted by SLP_INSTANCES. */
9536 :
static bool
vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
				    vec<slp_instance> slp_instances,
				    loop_p orig_loop)
{
  slp_instance instance;
  int i;
  unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
  unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
      hash_set<slp_tree> visited;
      FOR_EACH_VEC_ELT (slp_instances, i, instance)
	vect_print_slp_graph (MSG_NOTE, vect_location,
			      SLP_INSTANCE_TREE (instance), visited);
    }

  /* Then DFS walk scalar stmts, performing costing and handling
     still live scalar stmts via the previously computed vector coverage.  */
  stmt_vector_for_cost scalar_costs = vNULL;
  stmt_vector_for_cost vector_costs = vNULL;
  hash_set<slp_tree> visited;
  hash_set<stmt_vec_info> svisited;
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      /* Seed the scalar-cost walk with the (orig) scalar stmts of the
	 instance root; root stmts themselves are costed directly.  */
      auto_vec<stmt_vec_info> worklist;
      if (SLP_INSTANCE_ROOT_STMTS (instance).exists ())
	record_stmt_cost (&scalar_costs,
			  SLP_INSTANCE_ROOT_STMTS (instance).length (),
			  scalar_stmt,
			  SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
      for (auto stmt : SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance)))
	{
	  stmt = vect_orig_stmt (stmt);
	  if (!svisited.add (stmt))
	    worklist.safe_push (stmt);
	}
      vect_bb_slp_scalar_cost (bb_vinfo, worklist, &scalar_costs, svisited);
      /* Take ownership of the per-instance vector costs computed
	 during analysis.  */
      vector_costs.safe_splice (instance->cost_vec);
      instance->cost_vec.release ();
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");

  /* When costing non-loop vectorization we need to consider each covered
     loop independently and make sure vectorization is profitable.  For
     now we assume a loop may be not entered or executed an arbitrary
     number of iterations (??? static information can provide more
     precise info here) which means we can simply cost each containing
     loops stmts separately.  */

  /* First produce cost vectors sorted by loop index.  */
  auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
    li_scalar_costs (scalar_costs.length ());
  auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
    li_vector_costs (vector_costs.length ());
  stmt_info_for_cost *cost;
  FOR_EACH_VEC_ELT (scalar_costs, i, cost)
    {
      unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
      li_scalar_costs.quick_push (std::make_pair (l, cost));
    }
  /* Use a random used loop as fallback in case the first vector_costs
     entry does not have a stmt_info associated with it.  */
  unsigned l = li_scalar_costs[0].first;
  FOR_EACH_VEC_ELT (vector_costs, i, cost)
    {
      /* We inherit from the previous COST, invariants, externals and
	 extracts immediately follow the cost for the related stmt.  */
      if (cost->stmt_info)
	l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
      li_vector_costs.quick_push (std::make_pair (l, cost));
    }
  li_scalar_costs.qsort (li_cost_vec_cmp);
  li_vector_costs.qsort (li_cost_vec_cmp);

  /* Now cost the portions individually.  Merge-walk the two sorted
     vectors, comparing the scalar and vector cost of each loop
     part.  */
  unsigned vi = 0;
  unsigned si = 0;
  bool profitable = true;
  while (si < li_scalar_costs.length ()
	 && vi < li_vector_costs.length ())
    {
      unsigned sl = li_scalar_costs[si].first;
      unsigned vl = li_vector_costs[vi].first;
      if (sl != vl)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Scalar %d and vector %d loop part do not "
			     "match up, skipping scalar part\n", sl, vl);
	  /* Skip the scalar part, assuming zero cost on the vector side.  */
	  do
	    {
	      si++;
	    }
	  while (si < li_scalar_costs.length ()
		 && li_scalar_costs[si].first == sl);
	  continue;
	}

      /* Accumulate the scalar cost of this loop part.  */
      class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
      do
	{
	  add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
	  si++;
	}
      while (si < li_scalar_costs.length ()
	     && li_scalar_costs[si].first == sl);
      scalar_target_cost_data->finish_cost (nullptr);
      scalar_cost = scalar_target_cost_data->body_cost ();

      /* Complete the target-specific vector cost calculation.  */
      class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
      do
	{
	  add_stmt_cost (vect_target_cost_data, li_vector_costs[vi].second);
	  vi++;
	}
      while (vi < li_vector_costs.length ()
	     && li_vector_costs[vi].first == vl);
      vect_target_cost_data->finish_cost (scalar_target_cost_data);
      vec_prologue_cost = vect_target_cost_data->prologue_cost ();
      vec_inside_cost = vect_target_cost_data->body_cost ();
      vec_epilogue_cost = vect_target_cost_data->epilogue_cost ();
      delete scalar_target_cost_data;
      delete vect_target_cost_data;

      vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;

      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "Cost model analysis for part in loop %d:\n", sl);
	  dump_printf (MSG_NOTE, " Vector cost: %d\n",
		       vec_inside_cost + vec_outside_cost);
	  dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost);
	}

      /* Vectorization is profitable if its cost is more than the cost of scalar
	 version.  Note that we err on the vector side for equal cost because
	 the cost estimate is otherwise quite pessimistic (constant uses are
	 free on the scalar side but cost a load on the vector side for
	 example).  */
      if (vec_outside_cost + vec_inside_cost > scalar_cost)
	{
	  profitable = false;
	  break;
	}
    }
  /* Left-over vector parts without matching scalar cost count as pure
     overhead.  */
  if (profitable && vi < li_vector_costs.length ())
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Excess vector cost for part in loop %d:\n",
			 li_vector_costs[vi].first);
      profitable = false;
    }

  /* Unset visited flag.  This is delayed when the subgraph is profitable
     and we process the loop for remaining unvectorized if-converted code.  */
  if (!orig_loop || !profitable)
    FOR_EACH_VEC_ELT (scalar_costs, i, cost)
      gimple_set_visited (cost->stmt_info->stmt, false);

  scalar_costs.release ();
  vector_costs.release ();

  return profitable;
}
9710 :
9711 : /* qsort comparator for lane defs. */
9712 :
9713 : static int
9714 40 : vld_cmp (const void *a_, const void *b_)
9715 : {
9716 40 : auto *a = (const std::pair<unsigned, tree> *)a_;
9717 40 : auto *b = (const std::pair<unsigned, tree> *)b_;
9718 40 : return a->first - b->first;
9719 : }
9720 :
9721 : /* Return true if USE_STMT is a vector lane insert into VEC and set
9722 : *THIS_LANE to the lane number that is set. */
9723 :
static bool
vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
{
  /* USE_STMT must be an assignment of the form
       lhs = BIT_INSERT_EXPR <vec, scalar, bitpos>;  */
  gassign *use_ass = dyn_cast <gassign *> (use_stmt);
  if (!use_ass
      || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
      /* When VEC is given, the insert has to operate on exactly that
	 vector.  When VEC is NULL_TREE, accept any vector operand but
	 latch it into VEC (comma expression, evaluating to false so
	 the || chain continues) for the checks below.  */
      || (vec
	  ? gimple_assign_rhs1 (use_ass) != vec
	  : ((vec = gimple_assign_rhs1 (use_ass)), false))
      /* The inserted scalar must match the vector element type.  */
      || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
				     TREE_TYPE (gimple_assign_rhs2 (use_ass)))
      /* The bit position must be an exact multiple of the element
	 size; the multiple is the lane number returned in *THIS_LANE.  */
      || !constant_multiple_p
	   (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
	    tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
	    this_lane))
    return false;
  return true;
}
9742 :
9743 : /* Find any vectorizable constructors and add them to the grouped_store
9744 : array. */
9745 :
9746 : static void
9747 2190696 : vect_slp_check_for_roots (bb_vec_info bb_vinfo)
9748 : {
9749 17515250 : for (unsigned i = 0; i < bb_vinfo->nbbs; ++i)
9750 30649108 : for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
9751 134374416 : !gsi_end_p (gsi); gsi_next (&gsi))
9752 : {
9753 119049862 : gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
9754 : /* This can be used to start SLP discovery for early breaks for BB early breaks
9755 : when we get that far. */
9756 119049862 : if (!assign)
9757 178649264 : continue;
9758 :
9759 30641742 : tree rhs = gimple_assign_rhs1 (assign);
9760 30641742 : enum tree_code code = gimple_assign_rhs_code (assign);
9761 30641742 : use_operand_p use_p;
9762 30641742 : gimple *use_stmt;
9763 30641742 : if (code == CONSTRUCTOR)
9764 : {
9765 1566051 : if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
9766 63519 : || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
9767 92549 : CONSTRUCTOR_NELTS (rhs))
9768 42885 : || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
9769 1608932 : || uniform_vector_p (rhs))
9770 1553229 : continue;
9771 :
9772 : unsigned j;
9773 : tree val;
9774 63555 : FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
9775 50733 : if (TREE_CODE (val) != SSA_NAME
9776 50733 : || !bb_vinfo->lookup_def (val))
9777 : break;
9778 31676 : if (j != CONSTRUCTOR_NELTS (rhs))
9779 3016 : continue;
9780 :
9781 12822 : vec<stmt_vec_info> roots = vNULL;
9782 12822 : roots.safe_push (bb_vinfo->lookup_stmt (assign));
9783 12822 : vec<stmt_vec_info> stmts;
9784 12822 : stmts.create (CONSTRUCTOR_NELTS (rhs));
9785 71720 : FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
9786 46076 : stmts.quick_push
9787 46076 : (vect_stmt_to_vectorize (bb_vinfo->lookup_def (val)));
9788 12822 : bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
9789 12822 : stmts, roots));
9790 : }
9791 29075691 : else if (code == BIT_INSERT_EXPR
9792 929 : && VECTOR_TYPE_P (TREE_TYPE (rhs))
9793 611 : && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
9794 611 : && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
9795 608 : && integer_zerop (gimple_assign_rhs3 (assign))
9796 341 : && useless_type_conversion_p
9797 341 : (TREE_TYPE (TREE_TYPE (rhs)),
9798 341 : TREE_TYPE (gimple_assign_rhs2 (assign)))
9799 29076313 : && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
9800 : {
9801 : /* We start to match on insert to lane zero but since the
9802 : inserts need not be ordered we'd have to search both
9803 : the def and the use chains. */
9804 215 : tree vectype = TREE_TYPE (rhs);
9805 215 : unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
9806 215 : auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
9807 215 : auto_sbitmap lanes (nlanes);
9808 215 : bitmap_clear (lanes);
9809 215 : bitmap_set_bit (lanes, 0);
9810 215 : tree def = gimple_assign_lhs (assign);
9811 215 : lane_defs.quick_push
9812 215 : (std::make_pair (0, gimple_assign_rhs2 (assign)));
9813 215 : unsigned lanes_found = 1;
9814 : /* Start with the use chains, the last stmt will be the root. */
9815 215 : stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
9816 215 : vec<stmt_vec_info> roots = vNULL;
9817 215 : roots.safe_push (last);
9818 217 : do
9819 : {
9820 217 : use_operand_p use_p;
9821 217 : gimple *use_stmt;
9822 217 : if (!single_imm_use (def, &use_p, &use_stmt))
9823 : break;
9824 211 : unsigned this_lane;
9825 211 : if (!bb_vinfo->lookup_stmt (use_stmt)
9826 211 : || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
9827 233 : || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
9828 : break;
9829 22 : if (bitmap_bit_p (lanes, this_lane))
9830 : break;
9831 2 : lanes_found++;
9832 2 : bitmap_set_bit (lanes, this_lane);
9833 2 : gassign *use_ass = as_a <gassign *> (use_stmt);
9834 2 : lane_defs.quick_push (std::make_pair
9835 2 : (this_lane, gimple_assign_rhs2 (use_ass)));
9836 2 : last = bb_vinfo->lookup_stmt (use_ass);
9837 2 : roots.safe_push (last);
9838 2 : def = gimple_assign_lhs (use_ass);
9839 : }
9840 2 : while (lanes_found < nlanes);
9841 215 : if (roots.length () > 1)
9842 2 : std::swap(roots[0], roots[roots.length () - 1]);
9843 215 : if (lanes_found < nlanes)
9844 : {
9845 : /* Now search the def chain. */
9846 215 : def = gimple_assign_rhs1 (assign);
9847 217 : do
9848 : {
9849 217 : if (TREE_CODE (def) != SSA_NAME
9850 217 : || !has_single_use (def))
9851 : break;
9852 56 : gimple *def_stmt = SSA_NAME_DEF_STMT (def);
9853 56 : unsigned this_lane;
9854 56 : if (!bb_vinfo->lookup_stmt (def_stmt)
9855 37 : || !vect_slp_is_lane_insert (def_stmt,
9856 : NULL_TREE, &this_lane)
9857 80 : || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
9858 : break;
9859 24 : if (bitmap_bit_p (lanes, this_lane))
9860 : break;
9861 4 : lanes_found++;
9862 4 : bitmap_set_bit (lanes, this_lane);
9863 8 : lane_defs.quick_push (std::make_pair
9864 4 : (this_lane,
9865 4 : gimple_assign_rhs2 (def_stmt)));
9866 4 : roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
9867 4 : def = gimple_assign_rhs1 (def_stmt);
9868 : }
9869 4 : while (lanes_found < nlanes);
9870 : }
9871 215 : if (lanes_found == nlanes)
9872 : {
9873 : /* Sort lane_defs after the lane index and register the root. */
9874 2 : lane_defs.qsort (vld_cmp);
9875 2 : vec<stmt_vec_info> stmts;
9876 2 : stmts.create (nlanes);
9877 10 : for (unsigned i = 0; i < nlanes; ++i)
9878 8 : stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
9879 2 : bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
9880 2 : stmts, roots));
9881 : }
9882 : else
9883 213 : roots.release ();
9884 215 : }
9885 29075476 : else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
9886 28095926 : && (associative_tree_code (code) || code == MINUS_EXPR)
9887 : /* ??? This pessimizes a two-element reduction. PR54400.
9888 : ??? In-order reduction could be handled if we only
9889 : traverse one operand chain in vect_slp_linearize_chain. */
9890 32969010 : && !needs_fold_left_reduction_p (TREE_TYPE (rhs), code)
9891 : /* Ops with constants at the tail can be stripped here. */
9892 5741956 : && TREE_CODE (rhs) == SSA_NAME
9893 5686394 : && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
9894 : /* Should be the chain end. */
9895 31330112 : && (!single_imm_use (gimple_assign_lhs (assign),
9896 : &use_p, &use_stmt)
9897 1743661 : || !is_gimple_assign (use_stmt)
9898 1193890 : || (gimple_assign_rhs_code (use_stmt) != code
9899 884705 : && ((code != PLUS_EXPR && code != MINUS_EXPR)
9900 498929 : || (gimple_assign_rhs_code (use_stmt)
9901 498929 : != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
9902 : {
9903 : /* We start the match at the end of a possible association
9904 : chain. */
9905 1848422 : auto_vec<chain_op_t> chain;
9906 1848422 : auto_vec<std::pair<tree_code, gimple *> > worklist;
9907 1848422 : auto_vec<gimple *> chain_stmts;
9908 1848422 : gimple *code_stmt = NULL, *alt_code_stmt = NULL;
9909 1848422 : if (code == MINUS_EXPR)
9910 297670 : code = PLUS_EXPR;
9911 1848422 : internal_fn reduc_fn;
9912 2128217 : if (!reduction_fn_for_scalar_code (code, &reduc_fn)
9913 1848422 : || reduc_fn == IFN_LAST)
9914 279795 : continue;
9915 1568627 : vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
9916 : /* ??? */
9917 : code_stmt, alt_code_stmt, &chain_stmts,
9918 : false);
9919 3137254 : if (chain.length () > 1)
9920 : {
9921 : /* Sort the chain according to def_type and operation. */
9922 1568627 : chain.sort (dt_sort_cmp, bb_vinfo);
9923 : /* ??? Now we'd want to strip externals and constants
9924 : but record those to be handled in the epilogue. */
9925 : /* ??? For now do not allow mixing ops or externs/constants. */
9926 1568627 : bool invalid = false;
9927 1568627 : unsigned remain_cnt = 0;
9928 1568627 : unsigned last_idx = 0;
9929 4742725 : for (unsigned i = 0; i < chain.length (); ++i)
9930 : {
9931 3471768 : if (chain[i].code != code)
9932 : {
9933 : invalid = true;
9934 : break;
9935 : }
9936 3174098 : if (chain[i].dt != vect_internal_def
9937 : /* Avoid stmts where the def is not the LHS, like
9938 : ASMs. */
9939 6137650 : || (gimple_get_lhs (bb_vinfo->lookup_def
9940 2963552 : (chain[i].op)->stmt)
9941 2963552 : != chain[i].op))
9942 213490 : remain_cnt++;
9943 : else
9944 : last_idx = i;
9945 : }
9946 : /* Make sure to have an even number of lanes as we later do
9947 : all-or-nothing discovery, not trying to split further. */
9948 1568627 : if ((chain.length () - remain_cnt) & 1)
9949 168627 : remain_cnt++;
9950 1568627 : if (!invalid && chain.length () - remain_cnt > 1)
9951 : {
9952 1206407 : vec<stmt_vec_info> stmts;
9953 1206407 : vec<tree> remain = vNULL;
9954 1206407 : stmts.create (chain.length ());
9955 1206407 : if (remain_cnt > 0)
9956 114594 : remain.create (remain_cnt);
9957 3879226 : for (unsigned i = 0; i < chain.length (); ++i)
9958 : {
9959 2672819 : stmt_vec_info stmt_info;
9960 2672819 : if (chain[i].dt == vect_internal_def
9961 2633089 : && ((stmt_info = bb_vinfo->lookup_def (chain[i].op)),
9962 2633089 : gimple_get_lhs (stmt_info->stmt) == chain[i].op)
9963 5305824 : && (i != last_idx
9964 1206407 : || (stmts.length () & 1)))
9965 2546868 : stmts.quick_push (stmt_info);
9966 : else
9967 125951 : remain.quick_push (chain[i].op);
9968 : }
9969 1206407 : vec<stmt_vec_info> roots;
9970 1206407 : roots.create (chain_stmts.length ());
9971 2672819 : for (unsigned i = 0; i < chain_stmts.length (); ++i)
9972 1466412 : roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
9973 1206407 : bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
9974 1206407 : stmts, roots, remain));
9975 : }
9976 : }
9977 1848422 : }
9978 : }
9979 2190696 : }
9980 :
/* Walk the grouped store chains and replace entries with their
   pattern variant if any.  */

static void
vect_fixup_store_groups_with_patterns (vec_info *vinfo)
{
  stmt_vec_info first_element;
  unsigned i;

  FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
    {
      /* We also have CTORs in this array.  */
      if (!STMT_VINFO_GROUPED_ACCESS (first_element))
	continue;
      if (STMT_VINFO_IN_PATTERN_P (first_element))
	{
	  /* The group leader was replaced by a pattern stmt.  Transfer
	     the group meta-data (size, gap, chain link) to the pattern
	     stmt and make it the new recorded leader.  */
	  stmt_vec_info orig = first_element;
	  first_element = STMT_VINFO_RELATED_STMT (first_element);
	  DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
	  DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
	  DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
	  DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
	  vinfo->grouped_stores[i] = first_element;
	}
      /* Walk the rest of the chain, splicing in pattern variants and
	 re-pointing every element at the (possibly new) leader.  */
      stmt_vec_info prev = first_element;
      while (DR_GROUP_NEXT_ELEMENT (prev))
	{
	  stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
	  if (STMT_VINFO_IN_PATTERN_P (elt))
	    {
	      /* Replace ELT by its pattern stmt in the chain, keeping
		 the per-element gap.  */
	      stmt_vec_info orig = elt;
	      elt = STMT_VINFO_RELATED_STMT (elt);
	      DR_GROUP_NEXT_ELEMENT (prev) = elt;
	      DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
	      DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
	    }
	  DR_GROUP_FIRST_ELEMENT (elt) = first_element;
	  prev = elt;
	}
    }
}
10022 :
/* Check if the region described by BB_VINFO can be vectorized, returning
   true if so.  When returning false, set FATAL to true if the same failure
   would prevent vectorization at other vector sizes, false if it is still
   worth trying other sizes.  N_STMTS is the number of statements in the
   region.  DATAREF_GROUPS assigns a group number to each dataref stmt
   as computed by the caller.  */

static bool
vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
		       vec<int> *dataref_groups)
{
  DUMP_VECT_SCOPE ("vect_slp_analyze_bb");

  slp_instance instance;
  int i;

  /* The first group of checks is independent of the vector size.  */
  fatal = true;

  /* Analyze the data references.  */

  if (!vect_analyze_data_refs (bb_vinfo, NULL))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: unhandled data-ref in basic "
			 "block.\n");
      return false;
    }

  if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: unhandled data access in "
			 "basic block.\n");
      return false;
    }

  /* Detect SLP roots (vector CTORs, BB reductions, ...) in the region.  */
  vect_slp_check_for_roots (bb_vinfo);

  /* If there are no grouped stores and no constructors in the region
     there is no need to continue with pattern recog as vect_analyze_slp
     will fail anyway.  */
  if (bb_vinfo->grouped_stores.is_empty ()
      && bb_vinfo->roots.is_empty ())
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: no grouped stores in "
			 "basic block.\n");
      return false;
    }

  /* While the rest of the analysis below depends on it in some way.  */
  fatal = false;

  vect_pattern_recog (bb_vinfo);

  /* Update store groups from pattern processing.  */
  vect_fixup_store_groups_with_patterns (bb_vinfo);

  /* Check the SLP opportunities in the basic block, analyze and build SLP
     trees.  */
  if (!vect_analyze_slp (bb_vinfo, n_stmts, false))
    {
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "Failed to SLP the basic block.\n");
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "not vectorized: failed to find SLP opportunities "
			   "in basic block.\n");
	}
      return false;
    }

  /* Optimize permutations.  */
  vect_optimize_slp (bb_vinfo);

  /* Gather the loads reachable from the SLP graph entries.  */
  vect_gather_slp_loads (bb_vinfo);

  /* Record base alignments for the datarefs in the region.  */
  vect_record_base_alignments (bb_vinfo);

  /* Analyze and verify the alignment of data references and the
     dependence in the SLP instances.  */
  for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
    {
      vect_location = instance->location ();
      if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
	  || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
	{
	  slp_tree node = SLP_INSTANCE_TREE (instance);
	  stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "removing SLP instance operations starting from: %G",
			     stmt_info->stmt);
	  vect_free_slp_instance (instance);
	  /* Note I is intentionally not incremented here; ordered_remove
	     shifts the next instance into slot I.  */
	  BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
	  continue;
	}

      /* Mark all the statements that we want to vectorize as relevant.  */
      vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));

      i++;
    }
  if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
    return false;

  if (!vect_slp_analyze_operations (bb_vinfo))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: bad operation in basic block.\n");
      return false;
    }

  /* Mark all the statements that we vectorize.  */
  vect_bb_slp_mark_stmts_vectorized (bb_vinfo);

  /* Compute vectorizable live stmts.  */
  vect_bb_slp_mark_live_stmts (bb_vinfo);

  /* Partition the SLP graph into independently costable/schedulable
     subgraphs.  */
  vect_bb_partition_graph (bb_vinfo);

  return true;
}
10152 :
/* Subroutine of vect_slp_bb.  Try to vectorize the statements for all
   basic blocks in BBS, returning true on success.
   The region has N_STMTS statements and has the datarefs given by DATAREFS.
   ORIG_LOOP, when non-NULL, is the not-if-converted loop body the single
   BB in BBS was derived from (used for costing).  Iterates over the
   target's advertised vector modes until one succeeds or all fail.  */

static bool
vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
		 vec<int> *dataref_groups, unsigned int n_stmts,
		 loop_p orig_loop)
{
  bb_vec_info bb_vinfo;
  auto_vector_modes vector_modes;

  /* Autodetect first vector size we try.  */
  machine_mode next_vector_mode = VOIDmode;
  targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
  unsigned int mode_i = 0;

  vec_info_shared shared;

  machine_mode autodetected_vector_mode = VOIDmode;
  while (1)
    {
      bool vectorized = false;
      bool fatal = false;
      bb_vinfo = new _bb_vec_info (bbs, &shared);

      /* On the first iteration snapshot the datarefs; later iterations
	 verify re-analysis did not change them.  */
      bool first_time_p = shared.datarefs.is_empty ();
      BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
      if (first_time_p)
	bb_vinfo->shared->save_datarefs ();
      else
	bb_vinfo->shared->check_datarefs ();
      bb_vinfo->vector_mode = next_vector_mode;

      if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
	{
	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "***** Analysis succeeded with vector mode"
			       " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
	      dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
	    }

	  bb_vinfo->shared->check_datarefs ();

	  /* Cost the subgraphs and collect the profitable ones.  */
	  bool force_clear = false;
	  auto_vec<slp_instance> profitable_subgraphs;
	  for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
	    {
	      if (instance->subgraph_entries.is_empty ())
		continue;

	      dump_user_location_t saved_vect_location = vect_location;
	      vect_location = instance->location ();
	      if (!unlimited_cost_model (NULL)
		  && !param_vect_allow_possibly_not_worthwhile_vectorizations
		  && !vect_bb_vectorization_profitable_p
			(bb_vinfo, instance->subgraph_entries, orig_loop))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "not vectorized: vectorization is not "
				     "profitable.\n");
		  vect_location = saved_vect_location;
		  continue;
		}

	      vect_location = saved_vect_location;
	      if (!dbg_cnt (vect_slp))
		{
		  /* Debug counter rejected this subgraph; still need the
		     visited-flag cleanup pass below.  */
		  force_clear = true;
		  continue;
		}

	      profitable_subgraphs.safe_push (instance);
	    }

	  /* When we're vectorizing an if-converted loop body make sure
	     we vectorized all if-converted code.  */
	  if ((!profitable_subgraphs.is_empty () || force_clear) && orig_loop)
	    {
	      gcc_assert (bb_vinfo->nbbs == 1);
	      for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
		   !gsi_end_p (gsi); gsi_next (&gsi))
		{
		  /* The costing above left us with DCEable vectorized scalar
		     stmts having the visited flag set on profitable
		     subgraphs.  Do the delayed clearing of the flag here.  */
		  if (gimple_visited_p (gsi_stmt (gsi)))
		    {
		      gimple_set_visited (gsi_stmt (gsi), false);
		      continue;
		    }
		  if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
		    continue;

		  /* A remaining (un-vectorized) COND_EXPR means if-converted
		     scalar code survives; give up on the whole region.  */
		  if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
		    if (gimple_assign_rhs_code (ass) == COND_EXPR)
		      {
			if (!profitable_subgraphs.is_empty ()
			    && dump_enabled_p ())
			  dump_printf_loc (MSG_NOTE, vect_location,
					   "not profitable because of "
					   "unprofitable if-converted scalar "
					   "code\n");
			profitable_subgraphs.truncate (0);
		      }
		}
	    }

	  /* Finally schedule the profitable subgraphs.  */
	  for (slp_instance instance : profitable_subgraphs)
	    {
	      if (!vectorized && dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Basic block will be vectorized "
				 "using SLP\n");
	      vectorized = true;

	      /* Dump before scheduling as store vectorization will remove
		 the original stores and mess with the instance tree
		 so querying its location will eventually ICE.  */
	      if (flag_checking)
		for (slp_instance sub : instance->subgraph_entries)
		  gcc_assert (SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub)));
	      unsigned HOST_WIDE_INT bytes;
	      if (dump_enabled_p ())
		for (slp_instance sub : instance->subgraph_entries)
		  {
		    tree vtype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub));
		    if (GET_MODE_SIZE (TYPE_MODE (vtype)).is_constant (&bytes))
		      dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
				       sub->location (),
				       "basic block part vectorized using %wu "
				       "byte vectors\n", bytes);
		    else
		      dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
				       sub->location (),
				       "basic block part vectorized using "
				       "variable length vectors\n");
		  }

	      dump_user_location_t saved_vect_location = vect_location;
	      vect_location = instance->location ();

	      vect_schedule_slp (bb_vinfo, instance->subgraph_entries);

	      vect_location = saved_vect_location;
	    }


	  /* Generate the invariant statements.  */
	  if (!gimple_seq_empty_p (bb_vinfo->inv_pattern_def_seq))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "------>generating invariant statements\n");

	      bb_vinfo->insert_seq_on_entry (NULL,
					     bb_vinfo->inv_pattern_def_seq);
	    }
	}
      else
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "***** Analysis failed with vector mode %s\n",
			     GET_MODE_NAME (bb_vinfo->vector_mode));
	}

      /* Remember the mode the first (autodetect) round settled on.  */
      if (mode_i == 0)
	autodetected_vector_mode = bb_vinfo->vector_mode;

      /* Skip candidate modes that provably lead to the same choices.  */
      if (!fatal)
	while (mode_i < vector_modes.length ()
	       && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
	  {
	    if (dump_enabled_p ())
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "***** The result for vector mode %s would"
			       " be the same\n",
			       GET_MODE_NAME (vector_modes[mode_i]));
	    mode_i += 1;
	  }

      delete bb_vinfo;

      /* Also skip a candidate mode that is mutually related to the
	 autodetected mode — analyzing it would just repeat the work.  */
      if (mode_i < vector_modes.length ()
	  && VECTOR_MODE_P (autodetected_vector_mode)
	  && (related_vector_mode (vector_modes[mode_i],
				   GET_MODE_INNER (autodetected_vector_mode))
	      == autodetected_vector_mode)
	  && (related_vector_mode (autodetected_vector_mode,
				   GET_MODE_INNER (vector_modes[mode_i]))
	      == vector_modes[mode_i]))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "***** Skipping vector mode %s, which would"
			     " repeat the analysis for %s\n",
			     GET_MODE_NAME (vector_modes[mode_i]),
			     GET_MODE_NAME (autodetected_vector_mode));
	  mode_i += 1;
	}

      if (vectorized
	  || mode_i == vector_modes.length ()
	  || autodetected_vector_mode == VOIDmode
	  /* If vect_slp_analyze_bb_1 signaled that analysis for all
	     vector sizes will fail do not bother iterating.  */
	  || fatal)
	return vectorized;

      /* Try the next biggest vector size.  */
      next_vector_mode = vector_modes[mode_i++];
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "***** Re-trying analysis with vector mode %s\n",
			 GET_MODE_NAME (next_vector_mode));
    }
}
10375 :
10376 :
10377 : /* Main entry for the BB vectorizer. Analyze and transform BBS, returns
10378 : true if anything in the basic-block was vectorized. */
10379 :
10380 : static bool
10381 1871569 : vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
10382 : {
10383 1871569 : vec<data_reference_p> datarefs = vNULL;
10384 1871569 : auto_vec<int> dataref_groups;
10385 1871569 : int insns = 0;
10386 1871569 : int current_group = 0;
10387 :
10388 12392371 : for (unsigned i = 0; i < bbs.length (); i++)
10389 : {
10390 10520802 : basic_block bb = bbs[i];
10391 88140584 : for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
10392 77619782 : gsi_next (&gsi))
10393 : {
10394 77619782 : gimple *stmt = gsi_stmt (gsi);
10395 77619782 : if (is_gimple_debug (stmt))
10396 48154919 : continue;
10397 :
10398 29464863 : insns++;
10399 :
10400 29464863 : if (gimple_location (stmt) != UNKNOWN_LOCATION)
10401 26432853 : vect_location = stmt;
10402 :
10403 29464863 : if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
10404 : &dataref_groups, current_group))
10405 5078363 : ++current_group;
10406 : }
10407 : /* New BBs always start a new DR group. */
10408 10520802 : ++current_group;
10409 : }
10410 :
10411 1871569 : return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
10412 1871569 : }
10413 :
10414 : /* Special entry for the BB vectorizer. Analyze and transform a single
10415 : if-converted BB with ORIG_LOOPs body being the not if-converted
10416 : representation. Returns true if anything in the basic-block was
10417 : vectorized. */
10418 :
10419 : bool
10420 19219 : vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
10421 : {
10422 19219 : auto_vec<basic_block> bbs;
10423 19219 : bbs.safe_push (bb);
10424 19219 : return vect_slp_bbs (bbs, orig_loop);
10425 19219 : }
10426 :
/* Main entry for the BB vectorizer.  Analyze and transform BB, returns
   true if anything in the basic-block was vectorized.
   FUN is the function to process; its CFG is chopped into single-entry
   regions which are vectorized independently via vect_slp_bbs.  */

bool
vect_slp_function (function *fun)
{
  bool r = false;
  int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
  auto_bitmap exit_bbs;
  bitmap_set_bit (exit_bbs, EXIT_BLOCK);
  edge entry = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (fun));
  unsigned n = rev_post_order_and_mark_dfs_back_seme (fun, entry, exit_bbs,
						      true, rpo, NULL);

  /* For the moment split the function into pieces to avoid making
     the iteration on the vector mode moot.  Split at points we know
     to not handle well which is CFG merges (SLP discovery doesn't
     handle non-loop-header PHIs) and loop exits.  Since pattern
     recog requires reverse iteration to visit uses before defs
     simply chop RPO into pieces.  */
  auto_vec<basic_block> bbs;
  for (unsigned i = 0; i < n; i++)
    {
      basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
      bool split = false;

      /* Split when a BB is not dominated by the first block.  */
      if (!bbs.is_empty ()
	  && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "splitting region at dominance boundary bb%d\n",
			     bb->index);
	  split = true;
	}
      /* Split when the loop determined by the first block
	 is exited.  This is because we eventually insert
	 invariants at region begin.  */
      else if (!bbs.is_empty ()
	       && bbs[0]->loop_father != bb->loop_father
	       && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "splitting region at loop %d exit at bb%d\n",
			     bbs[0]->loop_father->num, bb->index);
	  split = true;
	}
      /* Split at entries of loops marked dont_vectorize so their bodies
	 stay out of the region.  */
      else if (!bbs.is_empty ()
	       && bb->loop_father->header == bb
	       && bb->loop_father->dont_vectorize)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "splitting region at dont-vectorize loop %d "
			     "entry at bb%d\n",
			     bb->loop_father->num, bb->index);
	  split = true;
	}

      if (split && !bbs.is_empty ())
	{
	  r |= vect_slp_bbs (bbs, NULL);
	  bbs.truncate (0);
	}

      if (bbs.is_empty ())
	{
	  /* We need to be able to insert at the head of the region which
	     we cannot for region starting with a returns-twice call.  */
	  if (gcall *first = safe_dyn_cast <gcall *> (first_stmt (bb)))
	    if (gimple_call_flags (first) & ECF_RETURNS_TWICE)
	      {
		if (dump_enabled_p ())
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "skipping bb%d as start of region as it "
				   "starts with returns-twice call\n",
				   bb->index);
		continue;
	      }
	  /* If the loop this BB belongs to is marked as not to be vectorized
	     honor that also for BB vectorization.  */
	  if (bb->loop_father->dont_vectorize)
	    continue;
	}

      bbs.safe_push (bb);

      /* When we have a stmt ending this block and defining a
	 value we have to insert on edges when inserting after it for
	 a vector containing its definition.  Avoid this for now.  */
      if (gimple *last = *gsi_last_bb (bb))
	if (gimple_get_lhs (last)
	    && is_ctrl_altering_stmt (last))
	  {
	    if (dump_enabled_p ())
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "splitting region at control altering "
			       "definition %G", last);
	    r |= vect_slp_bbs (bbs, NULL);
	    bbs.truncate (0);
	  }
    }

  /* Process the trailing region, if any.  */
  if (!bbs.is_empty ())
    r |= vect_slp_bbs (bbs, NULL);

  free (rpo);

  return r;
}
10539 :
/* Build a variable-length vector in which the elements in ELTS are repeated
   to a fill NRESULTS vectors of type VECTOR_TYPE.  Store the vectors in
   RESULTS and add any new instructions to SEQ.

   The approach we use is:

   (1) Find a vector mode VM with integer elements of mode IM.

   (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
       ELTS' has mode IM.  This involves creating NELTS' VIEW_CONVERT_EXPRs
       from small vectors to IM.

   (3) Duplicate each ELTS'[I] into a vector of mode VM.

   (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
       correct byte contents.

   (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.

   We try to find the largest IM for which this sequence works, in order
   to cut down on the number of interleaves.  */

void
duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
			  const vec<tree> &elts, unsigned int nresults,
			  vec<tree> &results)
{
  unsigned int nelts = elts.length ();
  tree element_type = TREE_TYPE (vector_type);

  /* (1) Find a vector mode VM with integer elements of mode IM.  */
  unsigned int nvectors = 1;
  tree new_vector_type;
  tree permutes[2];
  if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
				       &nvectors, &new_vector_type,
				       permutes))
    gcc_unreachable ();

  /* Get a vector type that holds ELTS[0:NELTS/NELTS'].  */
  unsigned int partial_nelts = nelts / nvectors;
  tree partial_vector_type = build_vector_type (element_type, partial_nelts);

  /* PIECES is used double-buffered: slots [0,nvectors) and
     [nvectors,2*nvectors) alternate as input and output of each
     interleaving round below.  */
  tree_vector_builder partial_elts;
  auto_vec<tree, 32> pieces (nvectors * 2);
  pieces.quick_grow_cleared (nvectors * 2);
  for (unsigned int i = 0; i < nvectors; ++i)
    {
      /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
	     ELTS' has mode IM.  */
      partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
      for (unsigned int j = 0; j < partial_nelts; ++j)
	partial_elts.quick_push (elts[i * partial_nelts + j]);
      tree t = gimple_build_vector (seq, &partial_elts);
      t = gimple_build (seq, VIEW_CONVERT_EXPR,
			TREE_TYPE (new_vector_type), t);

      /* (3) Duplicate each ELTS'[I] into a vector of mode VM.  */
      pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
    }

  /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
	 correct byte contents.

     Conceptually, we need to repeat the following operation log2(nvectors)
     times, where hi_start = nvectors / 2:

	out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
	out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);

     However, if each input repeats every N elements and the VF is
     a multiple of N * 2, the HI result is the same as the LO result.
     This will be true for the first N1 iterations of the outer loop,
     followed by N2 iterations for which both the LO and HI results
     are needed.  I.e.:

	N1 + N2 = log2(nvectors)

     Each "N1 iteration" doubles the number of redundant vectors and the
     effect of the process as a whole is to have a sequence of nvectors/2**N1
     vectors that repeats 2**N1 times.  Rather than generate these redundant
     vectors, we halve the number of vectors for each N1 iteration.  */
  unsigned int in_start = 0;
  unsigned int out_start = nvectors;
  unsigned int new_nvectors = nvectors;
  for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
    {
      unsigned int hi_start = new_nvectors / 2;
      /* OUT_I counts the non-redundant outputs produced this round.  */
      unsigned int out_i = 0;
      for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
	{
	  /* Skip the HI permute when it would equal the LO result
	     (the redundancy described above).  */
	  if ((in_i & 1) != 0
	      && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
			     2 * in_repeat))
	    continue;

	  tree output = make_ssa_name (new_vector_type);
	  tree input1 = pieces[in_start + (in_i / 2)];
	  tree input2 = pieces[in_start + (in_i / 2) + hi_start];
	  gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
					       input1, input2,
					       permutes[in_i & 1]);
	  gimple_seq_add_stmt (seq, stmt);
	  pieces[out_start + out_i] = output;
	  out_i += 1;
	}
      std::swap (in_start, out_start);
      new_nvectors = out_i;
    }

  /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type.
	 The final round's outputs repeat, so results beyond NEW_NVECTORS
	 simply reuse earlier ones.  */
  results.reserve (nresults);
  for (unsigned int i = 0; i < nresults; ++i)
    if (i < new_nvectors)
      results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
					pieces[in_start + i]));
    else
      results.quick_push (results[i - new_nvectors]);
}
10659 :
10660 :
/* For constant and loop invariant defs in OP_NODE this function creates
   vector defs that will be used in the vectorized stmts and stores them
   to SLP_TREE_VEC_DEFS of OP_NODE.  */

static void
vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
{
  unsigned HOST_WIDE_INT nunits;
  tree vec_cst;
  unsigned j, number_of_places_left_in_vector;
  tree vector_type;
  tree vop;
  int group_size = op_node->ops.length ();
  unsigned int vec_num, i;
  unsigned number_of_copies = 1;
  bool constant_p;
  gimple_seq ctor_seq = NULL;
  auto_vec<tree, 16> permute_results;

  /* We always want SLP_TREE_VECTYPE (op_node) here correctly set.  */
  vector_type = SLP_TREE_VECTYPE (op_node);

  unsigned int number_of_vectors = vect_get_num_copies (vinfo, op_node);
  SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
  auto_vec<tree> voprnds (number_of_vectors);

  /* NUMBER_OF_COPIES is the number of times we need to use the same values in
     created vectors.  It is greater than 1 if unrolling is performed.

     For example, we have two scalar operands, s1 and s2 (e.g., group of
     strided accesses of size two), while NUNITS is four (i.e., four scalars
     of this type can be packed in a vector).  The output vector will contain
     two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
     will be 2).

     If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
     containing the operands.

     For example, NUNITS is four as before, and the group size is 8
     (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
     {s5, s6, s7, s8}.  */

  /* When using duplicate_and_interleave, we just need one element for
     each scalar statement.  */
  if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
    nunits = group_size;

  number_of_copies = nunits * number_of_vectors / group_size;

  number_of_places_left_in_vector = nunits;
  constant_p = true;
  /* UNIFORM_ELT tracks whether every element placed into the vector
     currently being built compares equal (operand_equal_p); if it stays
     non-NULL at vector completion we can emit a cheap splat instead of a
     general CTOR.  */
  tree uniform_elt = NULL_TREE;
  tree_vector_builder elts (vector_type, nunits, 1);
  elts.quick_grow (nunits);
  /* For BB vectorization, the latest in-region definition the generated
     constructor statements must follow; NULL means all defs are outside
     the region and we can insert on region entry.  */
  stmt_vec_info insert_after = NULL;
  /* Walk the scalar operands NUMBER_OF_COPIES times; the inner loop
     visits the group back-to-front and fills vector elements from the
     highest index downwards, which is why the completed vectors are
     pushed to VOPRNDS in reverse and inverted afterwards.  */
  for (j = 0; j < number_of_copies; j++)
    {
      tree op;
      for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
	{
	  /* Create 'vect_ = {op0,op1,...,opn}'.  */
	  tree orig_op = op;
	  if (number_of_places_left_in_vector == nunits)
	    uniform_elt = op;
	  else if (uniform_elt && operand_equal_p (uniform_elt, op))
	    /* Re-use the already-converted element.  */
	    op = elts[number_of_places_left_in_vector];
	  else
	    uniform_elt = NULL_TREE;
	  number_of_places_left_in_vector--;
	  /* Convert OP to the element type of the vector if needed.  */
	  if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
	    {
	      if (CONSTANT_CLASS_P (op))
		{
		  if (VECTOR_BOOLEAN_TYPE_P (vector_type))
		    {
		      /* Can't use VIEW_CONVERT_EXPR for booleans because
			 of possibly different sizes of scalar value and
			 vector element.  */
		      if (integer_zerop (op))
			op = build_int_cst (TREE_TYPE (vector_type), 0);
		      else if (integer_onep (op))
			op = build_all_ones_cst (TREE_TYPE (vector_type));
		      else
			gcc_unreachable ();
		    }
		  else
		    op = fold_unary (VIEW_CONVERT_EXPR,
				     TREE_TYPE (vector_type), op);
		  gcc_assert (op && CONSTANT_CLASS_P (op));
		}
	      else
		{
		  /* Non-constant: emit a conversion statement into
		     CTOR_SEQ and use its result.  */
		  tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
		  gimple *init_stmt;
		  if (VECTOR_BOOLEAN_TYPE_P (vector_type))
		    {
		      tree true_val
			= build_all_ones_cst (TREE_TYPE (vector_type));
		      tree false_val
			= build_zero_cst (TREE_TYPE (vector_type));
		      gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
		      init_stmt = gimple_build_assign (new_temp, COND_EXPR,
						       op, true_val,
						       false_val);
		    }
		  else
		    {
		      op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
				   op);
		      init_stmt
			= gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
					       op);
		    }
		  gimple_seq_add_stmt (&ctor_seq, init_stmt);
		  op = new_temp;
		}
	    }
	  elts[number_of_places_left_in_vector] = op;
	  if (!CONSTANT_CLASS_P (op))
	    constant_p = false;
	  /* For BB vectorization we have to compute an insert location
	     when a def is inside the analyzed region since we cannot
	     simply insert at the BB start in this case.  */
	  stmt_vec_info opdef;
	  if (TREE_CODE (orig_op) == SSA_NAME
	      && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
	      && is_a <bb_vec_info> (vinfo)
	      && (opdef = vinfo->lookup_def (orig_op)))
	    {
	      if (!insert_after)
		insert_after = opdef;
	      else
		insert_after = get_later_stmt (insert_after, opdef);
	    }

	  /* A vector is full: materialize it.  */
	  if (number_of_places_left_in_vector == 0)
	    {
	      auto type_nunits = TYPE_VECTOR_SUBPARTS (vector_type);
	      if (uniform_elt)
		/* All elements equal - emit a splat.  */
		vec_cst = gimple_build_vector_from_val (&ctor_seq, vector_type,
							elts[0]);
	      else if (constant_p
		       ? multiple_p (type_nunits, nunits)
		       : known_eq (type_nunits, nunits))
		vec_cst = gimple_build_vector (&ctor_seq, &elts);
	      else
		{
		  /* Variable-length fallback; the permuted results are
		     computed once and indexed per copy.  */
		  if (permute_results.is_empty ())
		    duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
					      elts, number_of_vectors,
					      permute_results);
		  vec_cst = permute_results[number_of_vectors - j - 1];
		}
	      /* Flush any conversion/constructor statements to the
		 computed insertion point.  */
	      if (!gimple_seq_empty_p (ctor_seq))
		{
		  if (insert_after)
		    {
		      gimple_stmt_iterator gsi;
		      if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
			{
			  /* Cannot insert after a PHI; insert at the start
			     of its block instead.  */
			  gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
			  gsi_insert_seq_before (&gsi, ctor_seq,
						 GSI_CONTINUE_LINKING);
			}
		      else if (!stmt_ends_bb_p (insert_after->stmt))
			{
			  gsi = gsi_for_stmt (insert_after->stmt);
			  gsi_insert_seq_after (&gsi, ctor_seq,
						GSI_CONTINUE_LINKING);
			}
		      else
			{
			  /* When we want to insert after a def where the
			     defining stmt throws then insert on the fallthru
			     edge.  */
			  edge e = find_fallthru_edge
				     (gimple_bb (insert_after->stmt)->succs);
			  basic_block new_bb
			    = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
			  gcc_assert (!new_bb);
			}
		    }
		  else
		    vinfo->insert_seq_on_entry (NULL, ctor_seq);
		  ctor_seq = NULL;
		}
	      voprnds.quick_push (vec_cst);
	      /* Reset per-vector state for the next vector.  */
	      insert_after = NULL;
	      number_of_places_left_in_vector = nunits;
	      constant_p = true;
	      elts.new_vector (vector_type, nunits, 1);
	      elts.quick_grow (nunits);
	    }
	}
    }

  /* Since the vectors are created in the reverse order, we should invert
     them.  */
  vec_num = voprnds.length ();
  for (j = vec_num; j != 0; j--)
    {
      vop = voprnds[j - 1];
      SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
    }

  /* In case that VF is greater than the unrolling factor needed for the SLP
     group of stmts, NUMBER_OF_VECTORS to be created is greater than
     NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
     to replicate the vectors.  */
  while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
    for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
	 i++)
      SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
}
10875 :
10876 : /* Get the scalar definition of the Nth lane from SLP_NODE or NULL_TREE
10877 : if there is no definition for it in the scalar IL or it is not known. */
10878 :
10879 : tree
10880 2665 : vect_get_slp_scalar_def (slp_tree slp_node, unsigned n)
10881 : {
10882 2665 : if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
10883 : {
10884 2653 : if (!SLP_TREE_SCALAR_STMTS (slp_node).exists ())
10885 : return NULL_TREE;
10886 2653 : stmt_vec_info def = SLP_TREE_SCALAR_STMTS (slp_node)[n];
10887 2653 : if (!def)
10888 : return NULL_TREE;
10889 2653 : return gimple_get_lhs (STMT_VINFO_STMT (def));
10890 : }
10891 : else
10892 12 : return SLP_TREE_SCALAR_OPS (slp_node)[n];
10893 : }
10894 :
10895 : /* Get the Ith vectorized definition from SLP_NODE. */
10896 :
10897 : tree
10898 145197 : vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
10899 : {
10900 145197 : return SLP_TREE_VEC_DEFS (slp_node)[i];
10901 : }
10902 :
10903 : /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
10904 :
10905 : void
10906 926415 : vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
10907 : {
10908 1852830 : vec_defs->create (SLP_TREE_VEC_DEFS (slp_node).length ());
10909 926415 : vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
10910 926415 : }
10911 :
10912 : /* Get N vectorized definitions for SLP_NODE. */
10913 :
10914 : void
10915 2965 : vect_get_slp_defs (vec_info *,
10916 : slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
10917 : {
10918 2965 : if (n == -1U)
10919 2965 : n = SLP_TREE_CHILDREN (slp_node).length ();
10920 :
10921 10681 : for (unsigned i = 0; i < n; ++i)
10922 : {
10923 7716 : slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
10924 7716 : vec<tree> vec_defs = vNULL;
10925 7716 : vect_get_slp_defs (child, &vec_defs);
10926 7716 : vec_oprnds->quick_push (vec_defs);
10927 : }
10928 2965 : }
10929 :
/* A subroutine of vect_transform_slp_perm_load with two extra arguments:
   - PERM gives the permutation that the caller wants to use for NODE,
   which might be different from SLP_LOAD_PERMUTATION.
   - DUMP_P controls whether the function dumps information.  */

static bool
vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
				load_permutation_t &perm,
				const vec<tree> &dr_chain,
				gimple_stmt_iterator *gsi, poly_uint64 vf,
				bool analyze_only, bool dump_p,
				unsigned *n_perms, unsigned int *n_loads,
				bool dce_chain)
{
  stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
  int vec_index = 0;
  tree vectype = SLP_TREE_VECTYPE (node);
  unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
  unsigned int mask_element;
  unsigned dr_group_size;
  machine_mode mode;

  /* Determine the size of the source group the permutation indexes into.  */
  if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      /* We have both splats of the same non-grouped load and groups
	 of distinct invariant loads entering here.  Derive the group
	 size from the largest permutation index used.  */
      unsigned max_idx = 0;
      for (auto idx : perm)
	max_idx = idx > max_idx ? idx : max_idx;
      dr_group_size = max_idx + 1;
    }
  else
    {
      stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
      dr_group_size = DR_GROUP_SIZE (stmt_info);
    }

  mode = TYPE_MODE (vectype);
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  unsigned int nstmts = vect_get_num_copies (vinfo, node);

  /* Initialize the vect stmts of NODE to properly insert the generated
     stmts later.  */
  if (! analyze_only)
    for (unsigned i = SLP_TREE_VEC_DEFS (node).length (); i < nstmts; i++)
      SLP_TREE_VEC_DEFS (node).quick_push (NULL_TREE);

  /* Generate permutation masks for every NODE. Number of masks for each NODE
     is equal to GROUP_SIZE.
     E.g., we have a group of three nodes with three loads from the same
     location in each node, and the vector size is 4. I.e., we have a
     a0b0c0a1b1c1... sequence and we need to create the following vectors:
     for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
     for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
     ...

     The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
     The last mask is illegal since we assume two operands for permute
     operation, and the mask element values can't be outside that range.
     Hence, the last mask must be converted into {2,5,5,5}.
     For the first two permutations we need the first and the second input
     vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
     we need the second and the third vectors: {b1,c1,a2,b2} and
     {c2,a3,b3,c3}.  */

  int vect_stmts_counter = 0;
  unsigned int index = 0;
  /* Indices into DR_CHAIN of the (at most two) input vectors of the
     permute currently being built; -1 means not yet chosen.  */
  int first_vec_index = -1;
  int second_vec_index = -1;
  /* True while the mask built so far is the identity.  */
  bool noop_p = true;
  *n_perms = 0;

  vec_perm_builder mask;
  unsigned int nelts_to_build;
  unsigned int nvectors_per_build;
  unsigned int in_nlanes;
  bool repeating_p = (group_size == dr_group_size
		      && multiple_p (nunits, group_size));
  if (repeating_p)
    {
      /* A single vector contains a whole number of copies of the node, so:
	 (a) all permutes can use the same mask; and
	 (b) the permutes only need a single vector input.  */
      mask.new_vector (nunits, group_size, 3);
      nelts_to_build = mask.encoded_nelts ();
      /* It's possible to obtain zero nstmts during analyze_only, so make
	 it at least one to ensure the later computation for n_perms
	 proceed.  */
      nvectors_per_build = nstmts > 0 ? nstmts : 1;
      in_nlanes = dr_group_size * 3;
    }
  else
    {
      /* We need to construct a separate mask for each vector statement.  */
      unsigned HOST_WIDE_INT const_nunits, const_vf;
      if (!nunits.is_constant (&const_nunits)
	  || !vf.is_constant (&const_vf))
	return false;
      mask.new_vector (const_nunits, const_nunits, 1);
      nelts_to_build = const_vf * group_size;
      nvectors_per_build = 1;
      in_nlanes = const_vf * dr_group_size;
    }
  /* Bit I set means input lane I is referenced by some mask element;
     used below to compute the number of loads actually needed.  */
  auto_sbitmap used_in_lanes (in_nlanes);
  bitmap_clear (used_in_lanes);
  /* Bit I set means DR_CHAIN[I] was consumed by an emitted permute;
     unset entries are dead-code-eliminated at the end when DCE_CHAIN.  */
  auto_bitmap used_defs;

  unsigned int count = mask.encoded_nelts ();
  mask.quick_grow (count);
  vec_perm_indices indices;

  /* Build the masks element by element; every COUNT elements completes
     one mask and (when !ANALYZE_ONLY) emits the corresponding permutes.  */
  for (unsigned int j = 0; j < nelts_to_build; j++)
    {
      unsigned int iter_num = j / group_size;
      unsigned int stmt_num = j % group_size;
      unsigned int i = (iter_num * dr_group_size + perm[stmt_num]);
      bitmap_set_bit (used_in_lanes, i);
      if (repeating_p)
	{
	  first_vec_index = 0;
	  mask_element = i;
	}
      else
	{
	  /* Enforced before the loop when !repeating_p.  */
	  unsigned int const_nunits = nunits.to_constant ();
	  vec_index = i / const_nunits;
	  mask_element = i % const_nunits;
	  if (vec_index == first_vec_index
	      || first_vec_index == -1)
	    {
	      first_vec_index = vec_index;
	    }
	  else if (vec_index == second_vec_index
		   || second_vec_index == -1)
	    {
	      second_vec_index = vec_index;
	      mask_element += const_nunits;
	    }
	  else
	    {
	      /* A two-input VEC_PERM_EXPR cannot select from a third
		 vector - fail (only legal during analysis).  */
	      if (dump_p)
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "permutation requires at "
				 "least three vectors %G",
				 stmt_info->stmt);
	      gcc_assert (analyze_only);
	      return false;
	    }

	  gcc_assert (mask_element < 2 * const_nunits);
	}

      if (mask_element != index)
	noop_p = false;
      mask[index++] = mask_element;

      if (index == count)
	{
	  /* One full mask has been built.  */
	  if (!noop_p)
	    {
	      indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
	      if (!can_vec_perm_const_p (mode, mode, indices))
		{
		  if (dump_p)
		    {
		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				       "unsupported vect permute { ");
		      for (i = 0; i < count; ++i)
			{
			  dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
			  dump_printf (MSG_MISSED_OPTIMIZATION, " ");
			}
		      dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
		    }
		  gcc_assert (analyze_only);
		  return false;
		}

	      tree mask_vec = NULL_TREE;
	      if (!analyze_only)
		mask_vec = vect_gen_perm_mask_checked (vectype, indices);

	      /* Single-input permute: use the first vector twice.  */
	      if (second_vec_index == -1)
		second_vec_index = first_vec_index;

	      for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
		{
		  ++*n_perms;
		  if (analyze_only)
		    continue;
		  /* Generate the permute statement if necessary.  */
		  tree first_vec = dr_chain[first_vec_index + ri];
		  tree second_vec = dr_chain[second_vec_index + ri];
		  gassign *stmt = as_a<gassign *> (stmt_info->stmt);
		  tree perm_dest
		    = vect_create_destination_var (gimple_assign_lhs (stmt),
						   vectype);
		  perm_dest = make_ssa_name (perm_dest);
		  gimple *perm_stmt
		    = gimple_build_assign (perm_dest, VEC_PERM_EXPR, first_vec,
					   second_vec, mask_vec);
		  vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
					       gsi);
		  if (dce_chain)
		    {
		      bitmap_set_bit (used_defs, first_vec_index + ri);
		      bitmap_set_bit (used_defs, second_vec_index + ri);
		    }

		  /* Store the vector statement in NODE.  */
		  SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = perm_dest;
		}
	    }
	  else if (!analyze_only)
	    {
	      for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
		{
		  tree first_vec = dr_chain[first_vec_index + ri];
		  /* If mask was NULL_TREE generate the requested
		     identity transform.  */
		  if (dce_chain)
		    bitmap_set_bit (used_defs, first_vec_index + ri);

		  /* Store the vector statement in NODE.  */
		  SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = first_vec;
		}
	    }

	  /* Reset state for the next mask.  */
	  index = 0;
	  first_vec_index = -1;
	  second_vec_index = -1;
	  noop_p = true;
	}
    }

  /* Count how many input vectors are actually touched: one load per
     NUNITS-sized window of lanes containing at least one used lane.  */
  if (n_loads)
    {
      if (repeating_p)
	*n_loads = nstmts;
      else
	{
	  /* Enforced above when !repeating_p.  */
	  unsigned int const_nunits = nunits.to_constant ();
	  *n_loads = 0;
	  bool load_seen = false;
	  for (unsigned i = 0; i < in_nlanes; ++i)
	    {
	      if (i % const_nunits == 0)
		{
		  if (load_seen)
		    *n_loads += 1;
		  load_seen = false;
		}
	      if (bitmap_bit_p (used_in_lanes, i))
		load_seen = true;
	    }
	  if (load_seen)
	    *n_loads += 1;
	}
    }

  /* Remove unused chain defs, following feeding VIEW_CONVERT_EXPR /
     CONSTRUCTOR statements so their single operands die too.  */
  if (dce_chain)
    for (unsigned i = 0; i < dr_chain.length (); ++i)
      if (!bitmap_bit_p (used_defs, i))
	{
	  tree def = dr_chain[i];
	  do
	    {
	      gimple *stmt = SSA_NAME_DEF_STMT (def);
	      if (is_gimple_assign (stmt)
		  && (gimple_assign_rhs_code (stmt) == VIEW_CONVERT_EXPR
		      || gimple_assign_rhs_code (stmt) == CONSTRUCTOR))
		def = single_ssa_tree_operand (stmt, SSA_OP_USE);
	      else
		def = NULL;
	      gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
	      gsi_remove (&rgsi, true);
	      release_defs (stmt);
	    }
	  while (def);
	}

  return true;
}
11215 :
11216 : /* Generate vector permute statements from a list of loads in DR_CHAIN.
11217 : If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
11218 : permute statements for the SLP node NODE. Store the number of vector
11219 : permute instructions in *N_PERMS and the number of vector load
11220 : instructions in *N_LOADS. If DCE_CHAIN is true, remove all definitions
11221 : that were not needed. */
11222 :
11223 : bool
11224 90312 : vect_transform_slp_perm_load (vec_info *vinfo,
11225 : slp_tree node, const vec<tree> &dr_chain,
11226 : gimple_stmt_iterator *gsi, poly_uint64 vf,
11227 : bool analyze_only, unsigned *n_perms,
11228 : unsigned int *n_loads, bool dce_chain)
11229 : {
11230 90312 : return vect_transform_slp_perm_load_1 (vinfo, node,
11231 90312 : SLP_TREE_LOAD_PERMUTATION (node),
11232 : dr_chain, gsi, vf, analyze_only,
11233 : dump_enabled_p (), n_perms, n_loads,
11234 90312 : dce_chain);
11235 : }
11236 :
/* Produce the next vector result for SLP permutation NODE by adding a vector
   statement at GSI.  If MASK_VEC is nonnull, add:

      <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>

   otherwise add:

      <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF,
				      { N, N+1, N+2, ... }>

   where N == IDENTITY_OFFSET which is either zero or equal to the
   number of elements of the result.  */

static void
vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
			  slp_tree node, tree first_def, tree second_def,
			  tree mask_vec, poly_uint64 identity_offset)
{
  tree vectype = SLP_TREE_VECTYPE (node);

  /* ??? We SLP match existing vector element extracts but
     allow punning which we need to re-instantiate at uses
     but have no good way of explicitly representing.
     Re-pun FIRST_DEF to VECTYPE when the sizes match but the
     types do not.  */
  if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)), TYPE_SIZE (vectype))
      && !types_compatible_p (TREE_TYPE (first_def), vectype))
    {
      gassign *conv_stmt
	= gimple_build_assign (make_ssa_name (vectype),
			       build1 (VIEW_CONVERT_EXPR, vectype, first_def));
      vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
      first_def = gimple_assign_lhs (conv_stmt);
    }
  gassign *perm_stmt;
  tree perm_dest = make_ssa_name (vectype);
  if (mask_vec)
    {
      /* Same punning for SECOND_DEF.
	 NOTE(review): the size test reads FIRST_DEF's type while the
	 compatibility test reads SECOND_DEF's - verify this asymmetry
	 is intentional rather than a typo for SECOND_DEF's size.  */
      if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)),
			   TYPE_SIZE (vectype))
	  && !types_compatible_p (TREE_TYPE (second_def), vectype))
	{
	  gassign *conv_stmt
	    = gimple_build_assign (make_ssa_name (vectype),
				   build1 (VIEW_CONVERT_EXPR,
					   vectype, second_def));
	  vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
	  second_def = gimple_assign_lhs (conv_stmt);
	}
      perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
				       first_def, second_def,
				       mask_vec);
    }
  else
    {
      /* Identity permute: pick the input that IDENTITY_OFFSET points
	 into (even multiples of the input width select FIRST_DEF, odd
	 select SECOND_DEF) and copy or extract from it.  */
      auto def_nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (first_def));
      unsigned HOST_WIDE_INT vecno;
      poly_uint64 eltno;
      if (!can_div_trunc_p (poly_uint64 (identity_offset), def_nunits,
			    &vecno, &eltno))
	gcc_unreachable ();
      tree def = vecno & 1 ? second_def : first_def;
      if (!types_compatible_p (TREE_TYPE (def), vectype))
	{
	  /* For identity permutes we still need to handle the case
	     of offsetted extracts or concats.  */
	  unsigned HOST_WIDE_INT c;
	  if (known_le (TYPE_VECTOR_SUBPARTS (vectype), def_nunits))
	    {
	      /* The result is a (possibly offset) sub-vector of DEF:
		 extract it with a BIT_FIELD_REF.  */
	      unsigned HOST_WIDE_INT elsz
		= tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (def))));
	      tree lowpart = build3 (BIT_FIELD_REF, vectype, def,
				     TYPE_SIZE (vectype),
				     bitsize_int (eltno * elsz));
	      perm_stmt = gimple_build_assign (perm_dest, lowpart);
	    }
	  else if (constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype),
					def_nunits, &c) && c == 2)
	    {
	      /* The result is exactly the concatenation of the two
		 inputs: build a CONSTRUCTOR.  */
	      gcc_assert (known_eq (identity_offset, 0U));
	      tree ctor = build_constructor_va (vectype, 2,
						NULL_TREE, first_def,
						NULL_TREE, second_def);
	      perm_stmt = gimple_build_assign (perm_dest, ctor);
	    }
	  else
	    gcc_unreachable ();
	}
      else
	{
	  /* We need a copy here in case the def was external.  */
	  gcc_assert (known_eq (eltno, 0U));
	  perm_stmt = gimple_build_assign (perm_dest, def);
	}
    }
  vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
  /* Store the vector statement in NODE.  */
  node->push_vec_def (perm_stmt);
}
11334 :
11335 : /* Subroutine of vectorizable_slp_permutation. Check whether the target
11336 : can perform permutation PERM on the (1 or 2) input nodes in CHILDREN.
11337 : If GSI is nonnull, emit the permutation there.
11338 :
11339 : When GSI is null, the only purpose of NODE is to give properties
11340 : of the result, such as the vector type and number of SLP lanes.
11341 : The node does not need to be a VEC_PERM_EXPR.
11342 :
11343 : If the target supports the operation, return the number of individual
11344 : VEC_PERM_EXPRs needed, otherwise return -1. Print information to the
11345 : dump file if DUMP_P is true. */
11346 :
11347 : static int
11348 488709 : vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
11349 : slp_tree node, lane_permutation_t &perm,
11350 : vec<slp_tree> &children, bool dump_p)
11351 : {
11352 488709 : tree vectype = SLP_TREE_VECTYPE (node);
11353 :
11354 : /* ??? We currently only support all same vector input types
11355 : while the SLP IL should really do a concat + select and thus accept
11356 : arbitrary mismatches. */
11357 488709 : slp_tree child;
11358 488709 : unsigned i;
11359 488709 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
11360 488709 : bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
11361 : /* True if we're permuting a single input of 2N vectors down
11362 : to N vectors. This case doesn't generalize beyond 2 since
11363 : VEC_PERM_EXPR only takes 2 inputs. */
11364 488709 : bool pack_p = false;
11365 : /* If we're permuting inputs of N vectors each into X*N outputs,
11366 : this is the value of X, otherwise it is 1. */
11367 488709 : unsigned int unpack_factor = 1;
11368 488709 : tree op_vectype = NULL_TREE;
11369 490276 : FOR_EACH_VEC_ELT (children, i, child)
11370 490201 : if (SLP_TREE_VECTYPE (child))
11371 : {
11372 : op_vectype = SLP_TREE_VECTYPE (child);
11373 : break;
11374 : }
11375 488709 : if (!op_vectype)
11376 75 : op_vectype = vectype;
11377 1061557 : FOR_EACH_VEC_ELT (children, i, child)
11378 : {
11379 572848 : if ((SLP_TREE_DEF_TYPE (child) != vect_internal_def
11380 10464 : && !vect_maybe_update_slp_op_vectype (child, op_vectype))
11381 572848 : || !types_compatible_p (SLP_TREE_VECTYPE (child), op_vectype)
11382 1145696 : || !types_compatible_p (TREE_TYPE (vectype), TREE_TYPE (op_vectype)))
11383 : {
11384 0 : if (dump_p)
11385 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11386 : "Unsupported vector types in lane permutation\n");
11387 0 : return -1;
11388 : }
11389 572848 : auto op_nunits = TYPE_VECTOR_SUBPARTS (op_vectype);
11390 572848 : unsigned int this_unpack_factor;
11391 : /* Detect permutations of external, pre-existing vectors. The external
11392 : node's SLP_TREE_LANES stores the total number of units in the vector,
11393 : or zero if the vector has variable length.
11394 :
11395 : We are expected to keep the original VEC_PERM_EXPR for such cases.
11396 : There is no repetition to model. */
11397 572848 : if (SLP_TREE_DEF_TYPE (child) == vect_external_def
11398 572848 : && SLP_TREE_SCALAR_OPS (child).is_empty ())
11399 : repeating_p = false;
11400 : /* Check whether the input has twice as many lanes per vector. */
11401 564947 : else if (children.length () == 1
11402 564947 : && known_eq (SLP_TREE_LANES (child) * nunits,
11403 : SLP_TREE_LANES (node) * op_nunits * 2))
11404 : pack_p = true;
11405 : /* Check whether the output has N times as many lanes per vector. */
11406 572848 : else if (constant_multiple_p (SLP_TREE_LANES (node) * op_nunits,
11407 521147 : SLP_TREE_LANES (child) * nunits,
11408 : &this_unpack_factor)
11409 486186 : && (i == 0 || unpack_factor == this_unpack_factor))
11410 : unpack_factor = this_unpack_factor;
11411 : else
11412 : repeating_p = false;
11413 : }
11414 :
11415 977418 : gcc_assert (perm.length () == SLP_TREE_LANES (node));
11416 :
11417 : /* Load-lanes permute. This permute only acts as a forwarder to
11418 : select the correct vector def of the load-lanes load which
11419 : has the permuted vectors in its vector defs like
11420 : { v0, w0, r0, v1, w1, r1 ... } for a ld3. All costs are
11421 : accounted for in the costing for the actual load so we
11422 : return zero here. */
11423 488709 : if (node->ldst_lanes)
11424 : {
11425 0 : gcc_assert (children.length () == 1);
11426 0 : if (!gsi)
11427 : /* This is a trivial op always supported. */
11428 : return 0;
11429 0 : slp_tree child = children[0];
11430 0 : unsigned vec_idx = (SLP_TREE_LANE_PERMUTATION (node)[0].second
11431 0 : / SLP_TREE_LANES (node));
11432 0 : unsigned vec_num = SLP_TREE_LANES (child) / SLP_TREE_LANES (node);
11433 0 : unsigned nvectors = vect_get_num_copies (vinfo, node);
11434 0 : for (unsigned i = 0; i < nvectors; ++i)
11435 : {
11436 0 : tree def = SLP_TREE_VEC_DEFS (child)[i * vec_num + vec_idx];
11437 0 : node->push_vec_def (def);
11438 : }
11439 : return 0;
11440 : }
11441 :
11442 : /* Set REPEATING_P to true if the permutations are cyclical wrt UNPACK_FACTOR
11443 : and if we can generate the vectors in a vector-length agnostic way.
11444 : This requires UNPACK_STEP == NUNITS / UNPACK_FACTOR to be known at
11445 : compile time.
11446 :
11447 : The significance of UNPACK_STEP is that, when PACK_P is false,
11448 : output vector I operates on a window of UNPACK_STEP elements from each
11449 : input, starting at lane UNPACK_STEP * (I % UNPACK_FACTOR). For example,
11450 : when UNPACK_FACTOR is 2, the first output vector operates on lanes
11451 : [0, NUNITS / 2 - 1] of each input vector and the second output vector
11452 : operates on lanes [NUNITS / 2, NUNITS - 1] of each input vector.
11453 :
11454 : When REPEATING_P is true, NOUTPUTS holds the total number of outputs
11455 : that we actually need to generate. */
11456 488709 : uint64_t noutputs = 0;
11457 488709 : poly_uint64 unpack_step = 0;
11458 488709 : loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo);
11459 182184 : if (!linfo
11460 527855 : || !multiple_p (nunits, unpack_factor, &unpack_step)
11461 181247 : || !constant_multiple_p (LOOP_VINFO_VECT_FACTOR (linfo)
11462 181247 : * SLP_TREE_LANES (node), nunits, &noutputs))
11463 : repeating_p = false;
11464 :
11465 : /* We can handle the conditions described for REPEATING_P above for
11466 : both variable- and constant-length vectors. The fallback requires
11467 : us to generate every element of every permute vector explicitly,
11468 : which is only possible for constant-length permute vectors.
11469 :
11470 : Set:
11471 :
11472 : - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
11473 : mask vectors that we want to build.
11474 :
11475 : - NCOPIES to the number of copies of PERM that we need in order
11476 : to build the necessary permute mask vectors. */
11477 181247 : uint64_t npatterns;
11478 181247 : unsigned nelts_per_pattern;
11479 181247 : uint64_t ncopies;
11480 181247 : if (repeating_p)
11481 : {
11482 : /* We need permute mask vectors that have the form:
11483 :
11484 : { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
11485 :
11486 : In other words, the original n-element permute in PERM is
11487 : "unrolled" to fill a full vector. The stepped vector encoding
11488 : that we use for permutes requires 3n elements. */
11489 142101 : npatterns = SLP_TREE_LANES (node);
11490 142101 : nelts_per_pattern = ncopies = 3;
11491 : }
11492 : else
11493 : {
11494 : /* Calculate every element of every permute mask vector explicitly,
11495 : instead of relying on the pattern described above. */
11496 346608 : if (!nunits.is_constant (&npatterns)
11497 346608 : || !TYPE_VECTOR_SUBPARTS (op_vectype).is_constant ())
11498 : {
11499 : if (dump_p)
11500 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11501 : "unsupported permutation %p on variable-length"
11502 : " vectors\n", (void *) node);
11503 : return -1;
11504 : }
11505 346608 : nelts_per_pattern = ncopies = 1;
11506 346608 : if (linfo && !LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
11507 : {
11508 : if (dump_p)
11509 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11510 : "unsupported permutation %p for variable VF\n",
11511 : (void *) node);
11512 : return -1;
11513 : }
11514 : pack_p = false;
11515 : unpack_factor = 1;
11516 : }
11517 488709 : unsigned olanes = unpack_factor * ncopies * SLP_TREE_LANES (node);
11518 488709 : gcc_assert (repeating_p || multiple_p (olanes, nunits));
11519 :
11520 : /* Compute the { { SLP operand, vector index}, lane } permutation sequence
11521 : from the { SLP operand, scalar lane } permutation as recorded in the
11522 : SLP node as intermediate step. This part should already work
11523 : with SLP children with arbitrary number of lanes. */
11524 488709 : auto_vec<std::pair<std::pair<unsigned, unsigned>, poly_uint64>> vperm;
11525 488709 : auto_vec<poly_uint64> active_lane;
11526 488709 : vperm.create (olanes);
11527 488709 : active_lane.safe_grow_cleared (children.length (), true);
11528 985645 : for (unsigned int ui = 0; ui < unpack_factor; ++ui)
11529 : {
11530 2172428 : for (unsigned j = 0; j < children.length (); ++j)
11531 589278 : active_lane[j] = ui * unpack_step;
11532 1394498 : for (unsigned i = 0; i < ncopies; ++i)
11533 : {
11534 5588748 : for (unsigned pi = 0; pi < perm.length (); ++pi)
11535 : {
11536 1896812 : std::pair<unsigned, unsigned> p = perm[pi];
11537 1896812 : tree vtype = SLP_TREE_VECTYPE (children[p.first]);
11538 1896812 : if (repeating_p)
11539 827706 : vperm.quick_push ({{p.first, 0},
11540 827706 : p.second + active_lane[p.first]});
11541 : else
11542 : {
11543 : /* We checked above that the vectors are constant-length. */
11544 1069106 : unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype)
11545 1069106 : .to_constant ();
11546 1069106 : unsigned lane = active_lane[p.first].to_constant ();
11547 1069106 : unsigned vi = (lane + p.second) / vnunits;
11548 1069106 : unsigned vl = (lane + p.second) % vnunits;
11549 1069106 : vperm.quick_push ({{p.first, vi}, vl});
11550 : }
11551 : }
11552 : /* Advance to the next group. */
11553 1951336 : for (unsigned j = 0; j < children.length (); ++j)
11554 1053774 : active_lane[j] += SLP_TREE_LANES (children[j]);
11555 : }
11556 : }
11557 :
11558 488709 : if (dump_p)
11559 : {
11560 8909 : dump_printf_loc (MSG_NOTE, vect_location,
11561 : "vectorizing permutation %p", (void *)node);
11562 32209 : for (unsigned i = 0; i < perm.length (); ++i)
11563 23300 : dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
11564 8909 : if (repeating_p)
11565 7502 : dump_printf (MSG_NOTE, " (repeat %d)", SLP_TREE_LANES (node));
11566 8909 : dump_printf (MSG_NOTE, "\n");
11567 8909 : dump_printf_loc (MSG_NOTE, vect_location, "as");
11568 89301 : for (unsigned i = 0; i < vperm.length (); ++i)
11569 : {
11570 80392 : if (i != 0
11571 80392 : && (repeating_p
11572 54232 : ? multiple_p (i, npatterns)
11573 59784 : : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
11574 24113 : dump_printf (MSG_NOTE, ",");
11575 80392 : dump_printf (MSG_NOTE, " vops%u[%u][",
11576 80392 : vperm[i].first.first, vperm[i].first.second);
11577 80392 : dump_dec (MSG_NOTE, vperm[i].second);
11578 80392 : dump_printf (MSG_NOTE, "]");
11579 : }
11580 8909 : dump_printf (MSG_NOTE, "\n");
11581 : }
11582 :
11583 : /* We can only handle two-vector permutes, everything else should
11584 : be lowered on the SLP level. The following is closely inspired
11585 : by vect_transform_slp_perm_load and is supposed to eventually
11586 : replace it.
11587 : ??? As intermediate step do code-gen in the SLP tree representation
11588 : somehow? */
11589 488709 : std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
11590 488709 : std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
11591 488709 : unsigned int index = 0;
11592 488709 : poly_uint64 mask_element;
11593 488709 : vec_perm_builder mask;
11594 488709 : mask.new_vector (nunits, npatterns, nelts_per_pattern);
11595 488709 : unsigned int count = mask.encoded_nelts ();
11596 488709 : mask.quick_grow (count);
11597 488709 : vec_perm_indices indices;
11598 488709 : unsigned nperms = 0;
11599 : /* When REPEATING_P is true, we only have UNPACK_FACTOR unique permute
11600 : vectors to check during analysis, but we need to generate NOUTPUTS
11601 : vectors during transformation. */
11602 488709 : unsigned total_nelts = olanes;
11603 488709 : unsigned process_nelts = olanes;
11604 488709 : if (repeating_p)
11605 : {
11606 142101 : total_nelts = (total_nelts / unpack_factor) * noutputs;
11607 142101 : if (gsi)
11608 9805 : process_nelts = total_nelts;
11609 : }
11610 488709 : unsigned last_ei = (total_nelts - 1) % process_nelts;
11611 2394807 : for (unsigned i = 0; i < process_nelts; ++i)
11612 : {
11613 : /* VI is the input vector index when generating code for REPEATING_P. */
11614 1913436 : unsigned vi = i / olanes * (pack_p ? 2 : 1);
11615 1913436 : unsigned ei = i % olanes;
11616 1913436 : mask_element = vperm[ei].second;
11617 1913436 : if (pack_p)
11618 : {
11619 : /* In this case, we have N outputs and the single child provides 2N
11620 : inputs. Output X permutes inputs 2X and 2X+1.
11621 :
11622 : The mask indices are taken directly from the SLP permutation node.
11623 : Index X selects from the first vector if (X / NUNITS) % 2 == 0;
11624 : X selects from the second vector otherwise. These conditions
11625 : are only known at compile time for constant-length vectors. */
11626 : first_vec = std::make_pair (0, 0);
11627 : second_vec = std::make_pair (0, 1);
11628 : }
11629 1744719 : else if (first_vec.first == -1U
11630 1744719 : || first_vec == vperm[ei].first)
11631 1512510 : first_vec = vperm[ei].first;
11632 232209 : else if (second_vec.first == -1U
11633 232209 : || second_vec == vperm[ei].first)
11634 : {
11635 231812 : second_vec = vperm[ei].first;
11636 231812 : mask_element += nunits;
11637 : }
11638 : else
11639 : {
11640 397 : if (dump_p)
11641 7 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11642 : "permutation requires at "
11643 : "least three vectors\n");
11644 397 : gcc_assert (!gsi);
11645 : return -1;
11646 : }
11647 :
11648 1913039 : mask[index++] = mask_element;
11649 :
11650 1913039 : if (index == count)
11651 : {
11652 805465 : indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
11653 : TYPE_VECTOR_SUBPARTS (op_vectype));
11654 631258 : bool identity_p = (indices.series_p (0, 1, mask[0], 1)
11655 965662 : && constant_multiple_p (mask[0], nunits));
11656 631258 : machine_mode vmode = TYPE_MODE (vectype);
11657 631258 : machine_mode op_vmode = TYPE_MODE (op_vectype);
11658 631258 : unsigned HOST_WIDE_INT c;
11659 631258 : if ((!identity_p
11660 587934 : && !can_vec_perm_const_p (vmode, op_vmode, indices))
11661 631258 : || (identity_p
11662 43324 : && !known_le (nunits,
11663 : TYPE_VECTOR_SUBPARTS (op_vectype))
11664 6949 : && (!constant_multiple_p (nunits,
11665 8 : TYPE_VECTOR_SUBPARTS (op_vectype),
11666 8 : &c) || c != 2)))
11667 : {
11668 6941 : if (dump_p)
11669 : {
11670 152 : dump_printf_loc (MSG_MISSED_OPTIMIZATION,
11671 : vect_location,
11672 : "unsupported vect permute { ");
11673 1586 : for (i = 0; i < count; ++i)
11674 : {
11675 1434 : dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
11676 1434 : dump_printf (MSG_MISSED_OPTIMIZATION, " ");
11677 : }
11678 152 : dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
11679 : }
11680 6941 : gcc_assert (!gsi);
11681 7338 : return -1;
11682 : }
11683 :
11684 624317 : if (!identity_p)
11685 580993 : nperms += CEIL (total_nelts, process_nelts) - (ei > last_ei);
11686 624317 : if (gsi)
11687 : {
11688 31239 : if (second_vec.first == -1U)
11689 7055 : second_vec = first_vec;
11690 :
11691 31239 : slp_tree
11692 31239 : first_node = children[first_vec.first],
11693 31239 : second_node = children[second_vec.first];
11694 :
11695 31239 : tree mask_vec = NULL_TREE;
11696 31239 : if (!identity_p)
11697 27933 : mask_vec = vect_gen_perm_mask_checked (vectype, indices);
11698 :
11699 31239 : tree first_def
11700 31239 : = vect_get_slp_vect_def (first_node, first_vec.second + vi);
11701 31239 : tree second_def
11702 31239 : = vect_get_slp_vect_def (second_node, second_vec.second + vi);
11703 31239 : vect_add_slp_permutation (vinfo, gsi, node, first_def,
11704 31239 : second_def, mask_vec, mask[0]);
11705 : }
11706 :
11707 : index = 0;
11708 : first_vec = std::make_pair (-1U, -1U);
11709 : second_vec = std::make_pair (-1U, -1U);
11710 : }
11711 : }
11712 :
11713 481371 : return nperms;
11714 488709 : }
11715 :
11716 : /* Vectorize the SLP permutations in NODE as specified
11717 : in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
11718 : child number and lane number.
11719 : Interleaving of two two-lane two-child SLP subtrees (not supported):
11720 : [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
11721 : A blend of two four-lane two-child SLP subtrees:
11722 : [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
11723 : Highpart of a four-lane one-child SLP subtree (not supported):
11724 : [ { 0, 2 }, { 0, 3 } ]
11725 : Where currently only a subset is supported by code generating below. */
11726 :
11727 : bool
11728 139180 : vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
11729 : slp_tree node, stmt_vector_for_cost *cost_vec)
11730 : {
- : /* Analyze (GSI == NULL) or transform (GSI != NULL) the lane permutation
- : of NODE. vectorizable_slp_permutation_1 returns the number of vector
- : permute stmts required, or -1 when the permutation is unsupported. */
11731 139180 : tree vectype = SLP_TREE_VECTYPE (node);
11732 139180 : lane_permutation_t &perm = SLP_TREE_LANE_PERMUTATION (node);
11733 139180 : int nperms = vectorizable_slp_permutation_1 (vinfo, gsi, node, perm,
11734 139180 : SLP_TREE_CHILDREN (node),
11735 : dump_enabled_p ());
11736 139180 : if (nperms < 0)
11737 : return false;
11738 :
- : /* During analysis record the permute cost; nperms == 0 means the
- : permutation degenerates to a no-op (identity / pure forwarding). */
11739 137853 : if (!gsi && nperms != 0)
11740 115894 : record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body);
11741 :
11742 : return true;
11743 : }
11744 :
11745 : /* Vectorize SLP NODE. */
11746 :
11747 : static void
11748 1464892 : vect_schedule_slp_node (vec_info *vinfo,
11749 : slp_tree node, slp_instance instance)
11750 : {
- : /* First pick the insertion iterator SI appropriate for the kind of
- : NODE, then delegate the actual code generation to
- : vect_transform_stmt at the end. */
11751 1464892 : gimple_stmt_iterator si;
11752 1464892 : int i;
11753 1464892 : slp_tree child;
11754 :
11755 : /* Vectorize externals and constants. */
11756 1464892 : if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
11757 1464892 : || SLP_TREE_DEF_TYPE (node) == vect_external_def)
11758 : {
11759 : /* ??? vectorizable_shift can end up using a scalar operand which is
11760 : currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
11761 : node in this case. */
11762 496520 : if (!SLP_TREE_VECTYPE (node))
11763 496520 : return;
11764 :
11765 : /* There are two reasons vector defs might already exist. The first
11766 : is that we are vectorizing an existing vector def. The second is
11767 : when performing BB vectorization shared constant/external nodes
11768 : are not split apart during partitioning so during the code-gen
11769 : DFS walk we can end up visiting them twice. */
11770 489644 : if (! SLP_TREE_VEC_DEFS (node).exists ())
11771 488821 : vect_create_constant_vectors (vinfo, node);
11772 489644 : return;
11773 : }
11774 :
11775 968372 : stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
11776 :
11777 968372 : gcc_assert (SLP_TREE_VEC_DEFS (node).is_empty ());
11778 968372 : if (SLP_TREE_VECTYPE (node))
11779 968366 : SLP_TREE_VEC_DEFS (node).create (vect_get_num_copies (vinfo, node));
11780 :
11781 968372 : if (!SLP_TREE_PERMUTE_P (node) && STMT_VINFO_DATA_REF (stmt_info))
11782 : {
11783 : /* Vectorized loads go before the first scalar load to make it
11784 : ready early, vectorized stores go before the last scalar
11785 : stmt which is where all uses are ready. */
11786 708467 : stmt_vec_info last_stmt_info = NULL;
11787 708467 : if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
11788 165285 : last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
11789 : else /* DR_IS_WRITE */
11790 543182 : last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
11791 708467 : si = gsi_for_stmt (last_stmt_info->stmt);
11792 708467 : }
11793 259905 : else if (!SLP_TREE_PERMUTE_P (node)
11794 243462 : && (SLP_TREE_TYPE (node) == cycle_phi_info_type
11795 : || SLP_TREE_TYPE (node) == induc_vec_info_type
11796 : || SLP_TREE_TYPE (node) == phi_info_type))
11797 : {
11798 : /* For PHI node vectorization we do not use the insertion iterator. */
11799 53814 : si = gsi_none ();
11800 : }
11801 : else
11802 : {
11803 : /* Emit other stmts after the children vectorized defs which is
11804 : earliest possible. */
- : /* LAST_STMT tracks the dominance-wise latest def among all children;
- : the vectorized stmts for NODE are inserted right after it. */
11805 : gimple *last_stmt = NULL;
11806 : bool seen_vector_def = false;
11807 573176 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
11808 367085 : if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
11809 : {
11810 : /* For fold-left reductions we are retaining the scalar
11811 : reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
11812 : set so the representation isn't perfect. Resort to the
11813 : last scalar def here. */
11814 294343 : if (SLP_TREE_VEC_DEFS (child).is_empty ())
11815 : {
11816 862 : gcc_assert (SLP_TREE_TYPE (child) == cycle_phi_info_type);
11817 862 : gphi *phi = as_a <gphi *>
11818 862 : (vect_find_last_scalar_stmt_in_slp (child)->stmt);
11819 862 : if (!last_stmt)
11820 : last_stmt = phi;
11821 647 : else if (vect_stmt_dominates_stmt_p (last_stmt, phi))
11822 : last_stmt = phi;
11823 636 : else if (vect_stmt_dominates_stmt_p (phi, last_stmt))
11824 : ;
11825 : else
11826 0 : gcc_unreachable ();
11827 : }
11828 : /* We are emitting all vectorized stmts in the same place and
11829 : the last one is the last.
11830 : ??? Unless we have a load permutation applied and that
11831 : figures to re-use an earlier generated load. */
11832 : unsigned j;
11833 : tree vdef;
11834 696343 : FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
11835 : {
11836 402000 : gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
11837 402000 : if (!last_stmt)
11838 : last_stmt = vstmt;
11839 206596 : else if (vect_stmt_dominates_stmt_p (last_stmt, vstmt))
11840 : last_stmt = vstmt;
11841 45172 : else if (vect_stmt_dominates_stmt_p (vstmt, last_stmt))
11842 : ;
11843 : else
11844 0 : gcc_unreachable ();
11845 : }
11846 : }
11847 72742 : else if (!SLP_TREE_VECTYPE (child))
11848 : {
11849 : /* For externals we use unvectorized at all scalar defs. */
11850 : unsigned j;
11851 : tree def;
11852 14631 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
11853 8391 : if (TREE_CODE (def) == SSA_NAME
11854 8391 : && !SSA_NAME_IS_DEFAULT_DEF (def))
11855 : {
11856 295 : gimple *stmt = SSA_NAME_DEF_STMT (def);
11857 295 : if (gimple_uid (stmt) == -1u)
11858 : /* If the stmt is not inside the region do not
11859 : use it as possible insertion point. */
11860 : ;
11861 285 : else if (!last_stmt)
11862 : last_stmt = stmt;
11863 261 : else if (vect_stmt_dominates_stmt_p (last_stmt, stmt))
11864 : last_stmt = stmt;
11865 159 : else if (vect_stmt_dominates_stmt_p (stmt, last_stmt))
11866 : ;
11867 : else
11868 0 : gcc_unreachable ();
11869 : }
11870 : }
11871 : else
11872 : {
11873 : /* For externals we have to look at all defs since their
11874 : insertion place is decided per vector. But beware
11875 : of pre-existing vectors where we need to make sure
11876 : we do not insert before the region boundary. */
11877 66502 : if (SLP_TREE_SCALAR_OPS (child).is_empty ()
11878 650 : && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
11879 : seen_vector_def = true;
11880 : else
11881 : {
11882 : unsigned j;
11883 : tree vdef;
11884 527887 : FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
11885 94417 : if (TREE_CODE (vdef) == SSA_NAME
11886 94417 : && !SSA_NAME_IS_DEFAULT_DEF (vdef))
11887 : {
11888 19631 : gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
11889 19631 : if (!last_stmt)
11890 : last_stmt = vstmt;
11891 10906 : else if (vect_stmt_dominates_stmt_p (last_stmt, vstmt))
11892 : last_stmt = vstmt;
11893 8718 : else if (vect_stmt_dominates_stmt_p (vstmt, last_stmt))
11894 : ;
11895 : else
11896 0 : gcc_unreachable ();
11897 : }
11898 : }
11899 : }
11900 : /* This can happen when all children are pre-existing vectors or
11901 : constants. */
11902 206091 : if (!last_stmt)
11903 1723 : last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
11904 1723 : if (!last_stmt)
11905 : {
11906 0 : gcc_assert (seen_vector_def);
11907 0 : si = gsi_after_labels (vinfo->bbs[0]);
11908 : }
11909 206091 : else if (is_ctrl_altering_stmt (last_stmt))
11910 : {
11911 : /* We split regions to vectorize at control altering stmts
11912 : with a definition so this must be an external which
11913 : we can insert at the start of the region. */
11914 0 : si = gsi_after_labels (vinfo->bbs[0]);
11915 : }
11916 206091 : else if (is_a <bb_vec_info> (vinfo)
11917 18117 : && !SLP_TREE_PERMUTE_P (node)
11918 16655 : && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
11919 207430 : && gimple_could_trap_p (stmt_info->stmt))
11920 : {
11921 : /* We've constrained possibly trapping operations to all come
11922 : from the same basic-block, if vectorized defs would allow earlier
11923 : scheduling still force vectorized stmts to the original block.
11924 : This is only necessary for BB vectorization since for loop vect
11925 : all operations are in a single BB and scalar stmt based
11926 : placement doesn't play well with epilogue vectorization. */
11927 53 : gcc_assert (dominated_by_p (CDI_DOMINATORS,
11928 : gimple_bb (stmt_info->stmt),
11929 : gimple_bb (last_stmt)));
11930 53 : si = gsi_after_labels (gimple_bb (stmt_info->stmt));
11931 : }
11932 206038 : else if (is_a <gphi *> (last_stmt))
11933 14410 : si = gsi_after_labels (gimple_bb (last_stmt));
11934 : else
11935 : {
11936 191628 : si = gsi_for_stmt (last_stmt);
11937 191628 : gsi_next (&si);
11938 :
11939 191628 : if (auto loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
11940 : {
11941 : /* Avoid scheduling stmts to random places in the CFG, any
11942 : stmt dominance check we performed is possibly wrong as UIDs
11943 : are not initialized for all of the function for loop
11944 : vectorization. Instead append to the loop preheader. */
11945 173780 : if ((LOOP_VINFO_LOOP (loop_vinfo)->header
11946 173780 : != gimple_bb (last_stmt))
11947 176997 : && dominated_by_p (CDI_DOMINATORS,
11948 : LOOP_VINFO_LOOP (loop_vinfo)->header,
11949 3217 : gimple_bb (last_stmt)))
11950 1406 : si = gsi_end_bb (loop_preheader_edge
11951 703 : (LOOP_VINFO_LOOP (loop_vinfo))->src);
11952 : /* Avoid scheduling internal defs outside of the loop when
11953 : we might have only implicitly tracked loop mask/len defs. */
11954 74 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
11955 173780 : || LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
11956 : {
11957 74 : gimple_stmt_iterator si2
11958 74 : = gsi_after_labels (LOOP_VINFO_LOOP (loop_vinfo)->header);
11959 74 : if ((gsi_end_p (si2)
11960 0 : && (LOOP_VINFO_LOOP (loop_vinfo)->header
11961 0 : != gimple_bb (last_stmt))
11962 0 : && dominated_by_p (CDI_DOMINATORS,
11963 : LOOP_VINFO_LOOP (loop_vinfo)->header,
11964 0 : gimple_bb (last_stmt)))
11965 74 : || (!gsi_end_p (si2)
11966 74 : && last_stmt != *si2
11967 72 : && vect_stmt_dominates_stmt_p (last_stmt, *si2)))
11968 3 : si = si2;
11969 : }
11970 : }
11971 : }
11972 : }
11973 :
11974 968372 : if (dump_enabled_p ())
11975 : {
11976 71382 : if (stmt_info)
11977 71329 : dump_printf_loc (MSG_NOTE, vect_location,
11978 : "------>vectorizing SLP node starting from: %G",
11979 : stmt_info->stmt);
11980 : else
11981 : {
11982 53 : dump_printf_loc (MSG_NOTE, vect_location,
11983 : "------>vectorizing SLP node:\n");
11984 53 : vect_print_slp_tree (MSG_NOTE, vect_location, node);
11985 : }
11986 : }
- : /* All placement decisions made; generate the vector stmts at SI. */
11987 968372 : vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
11988 : }
11989 :
11990 : /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
11991 : For loop vectorization this is done in vectorizable_call, but for SLP
11992 : it needs to be deferred until end of vect_schedule_slp, because multiple
11993 : SLP instances may refer to the same scalar stmt. */
11994 :
11995 : static void
11996 597687 : vect_remove_slp_scalar_calls (vec_info *vinfo,
11997 : slp_tree node, hash_set<slp_tree> &visited)
11998 : {
11999 597687 : gimple *new_stmt;
12000 597687 : gimple_stmt_iterator gsi;
12001 597687 : int i;
12002 597687 : slp_tree child;
12003 597687 : tree lhs;
12004 597687 : stmt_vec_info stmt_info;
12005 :
12006 597687 : if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
12007 187200 : return;
12008 :
- : /* Shared subtrees can be reached from multiple parents; VISITED
- : makes sure each node is processed only once. */
12009 453745 : if (visited.add (node))
12010 : return;
12011 :
12012 918687 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
12013 508200 : vect_remove_slp_scalar_calls (vinfo, child, visited);
12014 :
- : /* Replace each scalar call: a call with a LHS becomes LHS = 0 (the
- : SSA def must be kept alive for any remaining scalar uses), a call
- : without a LHS becomes a no-op; calls already removed from the IL
- : (NULL gimple_bb) are skipped. */
12015 1299795 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
12016 : {
12017 482953 : if (!stmt_info)
12018 3974 : continue;
12019 478979 : stmt_info = vect_orig_stmt (stmt_info);
12020 5231 : gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
12021 5231 : if (!stmt || gimple_bb (stmt) == NULL)
12022 473786 : continue;
12023 5193 : lhs = gimple_call_lhs (stmt);
12024 5193 : if (lhs)
12025 4585 : new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
12026 : else
12027 608 : new_stmt = gimple_build_nop ();
12028 5193 : unlink_stmt_vdef (stmt_info->stmt);
12029 5193 : gsi = gsi_for_stmt (stmt);
12030 5193 : vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
12031 5193 : if (lhs)
12032 4585 : SSA_NAME_DEF_STMT (lhs) = new_stmt;
12033 : }
12034 : }
12035 :
12036 : static void
12037 89487 : vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
12038 : {
- : /* Convenience overload: walk NODE's subtree with a fresh visited set. */
12039 89487 : hash_set<slp_tree> visited;
12040 89487 : vect_remove_slp_scalar_calls (vinfo, node, visited);
12041 : }
12042 :
12043 : /* Vectorize the instance root. */
12044 :
12045 : void
12046 10896 : vectorize_slp_instance_root_stmt (vec_info *vinfo, slp_tree node, slp_instance instance)
12047 : {
- : /* Dispatch on the instance kind: CTOR roots are replaced by an
- : assignment from the vectorized defs, bb_reduc roots get a reduction
- : epilogue, gcond roots are handled by vectorizable_early_exit; any
- : other kind is a bug. */
12048 10896 : gassign *rstmt = NULL;
12049 :
12050 10896 : if (instance->kind == slp_inst_kind_ctor)
12051 : {
12052 5206 : if (SLP_TREE_VEC_DEFS (node).length () == 1)
12053 : {
12054 5169 : tree vect_lhs = SLP_TREE_VEC_DEFS (node)[0];
12055 5169 : tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
12056 5169 : if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
12057 5169 : TREE_TYPE (vect_lhs)))
12058 0 : vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
12059 : vect_lhs);
12060 5169 : rstmt = gimple_build_assign (root_lhs, vect_lhs);
12061 : }
12062 : else
12063 : {
12064 37 : gcc_assert (SLP_TREE_VEC_DEFS (node).length () > 1);
12065 37 : tree child_def;
12066 37 : int j;
12067 37 : vec<constructor_elt, va_gc> *v;
12068 37 : vec_alloc (v, SLP_TREE_VEC_DEFS (node).length ());
12069 :
12070 : /* A CTOR can handle V16HI composition from VNx8HI so we
12071 : do not need to convert vector elements if the types
12072 : do not match. */
12073 111 : FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (node), j, child_def)
12074 74 : CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, child_def);
12075 37 : tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
12076 37 : tree rtype
12077 37 : = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
12078 37 : tree r_constructor = build_constructor (rtype, v);
12079 37 : rstmt = gimple_build_assign (lhs, r_constructor);
12080 : }
12081 : }
12082 5690 : else if (instance->kind == slp_inst_kind_bb_reduc)
12083 : {
12084 : /* Largely inspired by reduction chain epilogue handling in
12085 : vect_create_epilog_for_reduction. */
12086 4126 : vec<tree> vec_defs = vNULL;
12087 4126 : vect_get_slp_defs (node, &vec_defs);
12088 4126 : enum tree_code reduc_code
12089 4126 : = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
12090 : /* ??? We actually have to reflect signs somewhere. */
12091 4126 : if (reduc_code == MINUS_EXPR)
12092 0 : reduc_code = PLUS_EXPR;
12093 4126 : gimple_seq epilogue = NULL;
12094 : /* We may end up with more than one vector result, reduce them
12095 : to one vector. */
12096 4126 : tree vec_def = vec_defs[0];
12097 4126 : tree vectype = TREE_TYPE (vec_def);
12098 4126 : tree compute_vectype = vectype;
- : /* Reassociating the reduction in a type with undefined overflow
- : could introduce UB, so compute in the unsigned variant then. */
12099 4126 : bool pun_for_overflow_p = (ANY_INTEGRAL_TYPE_P (vectype)
12100 3927 : && TYPE_OVERFLOW_UNDEFINED (vectype)
12101 6886 : && operation_can_overflow (reduc_code));
12102 2619 : if (pun_for_overflow_p)
12103 : {
12104 2619 : compute_vectype = unsigned_type_for (vectype);
12105 2619 : vec_def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
12106 : compute_vectype, vec_def);
12107 : }
12108 6514 : for (unsigned i = 1; i < vec_defs.length (); ++i)
12109 : {
12110 2388 : tree def = vec_defs[i];
12111 2388 : if (pun_for_overflow_p)
12112 2285 : def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
12113 : compute_vectype, def);
12114 2388 : vec_def = gimple_build (&epilogue, reduc_code, compute_vectype,
12115 : vec_def, def);
12116 : }
12117 4126 : vec_defs.release ();
12118 : /* ??? Support other schemes than direct internal fn. */
12119 4126 : internal_fn reduc_fn;
12120 4126 : if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
12121 4126 : || reduc_fn == IFN_LAST)
12122 0 : gcc_unreachable ();
12123 4126 : tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
12124 4126 : TREE_TYPE (compute_vectype), vec_def);
- : /* Fold any scalar defs that remained outside the SLP graph into
- : the reduction result as well. */
12125 4126 : if (!SLP_INSTANCE_REMAIN_DEFS (instance).is_empty ())
12126 : {
12127 2565 : tree rem_def = NULL_TREE;
12128 11907 : for (auto def : SLP_INSTANCE_REMAIN_DEFS (instance))
12129 : {
12130 9342 : def = gimple_convert (&epilogue, TREE_TYPE (scalar_def), def);
12131 9342 : if (!rem_def)
12132 : rem_def = def;
12133 : else
12134 6777 : rem_def = gimple_build (&epilogue, reduc_code,
12135 6777 : TREE_TYPE (scalar_def),
12136 : rem_def, def);
12137 : }
12138 2565 : scalar_def = gimple_build (&epilogue, reduc_code,
12139 2565 : TREE_TYPE (scalar_def),
12140 : scalar_def, rem_def);
12141 : }
12142 4126 : scalar_def = gimple_convert (&epilogue,
12143 4126 : TREE_TYPE (vectype), scalar_def);
12144 4126 : gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
12145 4126 : gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
12146 4126 : gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
12147 4126 : update_stmt (gsi_stmt (rgsi));
12148 4126 : return;
12149 : }
12150 1564 : else if (instance->kind == slp_inst_kind_gcond)
12151 : {
12152 : /* Only support a single root for now as we can't codegen CFG yet and so we
12153 : can't support lane > 1 at this time. */
12154 1564 : gcc_assert (instance->root_stmts.length () == 1);
12155 1564 : auto root_stmt_info = instance->root_stmts[0];
12156 1564 : auto last_stmt = STMT_VINFO_STMT (vect_orig_stmt (root_stmt_info));
12157 1564 : gimple_stmt_iterator rgsi = gsi_for_stmt (last_stmt);
12158 1564 : gcc_assert (!SLP_TREE_VEC_DEFS (node).is_empty ());
12159 1564 : bool res = vectorizable_early_exit (as_a <loop_vec_info> (vinfo),
12160 : root_stmt_info, &rgsi, node, NULL);
12161 1564 : gcc_assert (res);
12162 1564 : return;
12163 : }
12164 : else
12165 0 : gcc_unreachable ();
12166 :
- : /* The CTOR paths fall through to here with the replacement stmt. */
12167 5206 : gcc_assert (rstmt);
12168 :
12169 5206 : gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
12170 5206 : gsi_replace (&rgsi, rstmt, true);
12171 : }
12172 :
12173 : struct slp_scc_info
12174 : {
- : /* Whether the node is currently on the SCC walk's stack. */
12175 : bool on_stack;
- : /* DFS pre-order number assigned when the node is first visited. */
12176 : int dfs;
- : /* Smallest DFS number reachable from this node (Tarjan's lowlink);
- : lowlink == dfs identifies the node as an SCC root. */
12177 : int lowlink;
12178 : };
12179 :
12180 : /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
12181 :
12182 : static void
12183 1464892 : vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
12184 : hash_map<slp_tree, slp_scc_info> &scc_info,
12185 : int &maxdfs, vec<slp_tree> &stack)
12186 : {
12187 1464892 : bool existed_p;
12188 1464892 : slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
12189 1464892 : gcc_assert (!existed_p);
12190 1464892 : info->dfs = maxdfs;
12191 1464892 : info->lowlink = maxdfs;
12192 1464892 : maxdfs++;
12193 :
12194 : /* Leaf. */
12195 1464892 : if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
12196 : {
12197 496520 : info->on_stack = false;
12198 496520 : vect_schedule_slp_node (vinfo, node, instance);
12199 1024465 : return;
12200 : }
12201 :
12202 968372 : info->on_stack = true;
12203 968372 : stack.safe_push (node);
12204 :
12205 968372 : unsigned i;
12206 968372 : slp_tree child;
12207 : /* DFS recurse. */
12208 1998077 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
12209 : {
12210 1029705 : if (!child)
12211 54894 : continue;
12212 974811 : slp_scc_info *child_info = scc_info.get (child);
12213 974811 : if (!child_info)
12214 : {
12215 885754 : vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
12216 : /* Recursion might have re-allocated the node. */
12217 885754 : info = scc_info.get (node);
12218 885754 : child_info = scc_info.get (child);
12219 885754 : info->lowlink = MIN (info->lowlink, child_info->lowlink);
12220 : }
12221 89057 : else if (child_info->on_stack)
12222 25273 : info->lowlink = MIN (info->lowlink, child_info->dfs);
12223 : }
12224 968372 : if (info->lowlink != info->dfs)
12225 : return;
12226 :
12227 936947 : auto_vec<slp_tree, 4> phis_to_fixup;
12228 :
12229 : /* Singleton. */
12230 936947 : if (stack.last () == node)
12231 : {
12232 913341 : stack.pop ();
12233 913341 : info->on_stack = false;
12234 913341 : vect_schedule_slp_node (vinfo, node, instance);
12235 913341 : if (!SLP_TREE_PERMUTE_P (node)
12236 913341 : && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
12237 30335 : phis_to_fixup.quick_push (node);
12238 : }
12239 : else
12240 : {
12241 : /* SCC. */
12242 23606 : int last_idx = stack.length () - 1;
12243 55031 : while (stack[last_idx] != node)
12244 31425 : last_idx--;
12245 : /* We can break the cycle at PHIs who have at least one child
12246 : code generated. Then we could re-start the DFS walk until
12247 : all nodes in the SCC are covered (we might have new entries
12248 : for only back-reachable nodes). But it's simpler to just
12249 : iterate and schedule those that are ready. */
12250 23606 : unsigned todo = stack.length () - last_idx;
12251 23945 : do
12252 : {
12253 104705 : for (int idx = stack.length () - 1; idx >= last_idx; --idx)
12254 : {
12255 56815 : slp_tree entry = stack[idx];
12256 56815 : if (!entry)
12257 956 : continue;
12258 55859 : bool phi = (!SLP_TREE_PERMUTE_P (entry)
12259 55859 : && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
12260 55859 : bool ready = !phi;
12261 141335 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
12262 110326 : if (!child)
12263 : {
12264 22728 : gcc_assert (phi);
12265 : ready = true;
12266 : break;
12267 : }
12268 87598 : else if (scc_info.get (child)->on_stack)
12269 : {
12270 23823 : if (!phi)
12271 : {
12272 : ready = false;
12273 : break;
12274 : }
12275 : }
12276 : else
12277 : {
12278 63775 : if (phi)
12279 : {
12280 : ready = true;
12281 : break;
12282 : }
12283 : }
12284 33131 : if (ready)
12285 : {
12286 55031 : vect_schedule_slp_node (vinfo, entry, instance);
12287 55031 : scc_info.get (entry)->on_stack = false;
12288 55031 : stack[idx] = NULL;
12289 55031 : todo--;
12290 55031 : if (phi)
12291 24052 : phis_to_fixup.safe_push (entry);
12292 : }
12293 : }
12294 : }
12295 23945 : while (todo != 0);
12296 :
12297 : /* Pop the SCC. */
12298 23606 : stack.truncate (last_idx);
12299 : }
12300 :
12301 : /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
12302 : slp_tree phi_node;
12303 1928281 : FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
12304 : {
12305 54387 : gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
12306 54387 : edge_iterator ei;
12307 54387 : edge e;
12308 171407 : FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
12309 : {
12310 117020 : unsigned dest_idx = e->dest_idx;
12311 117020 : child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
12312 117020 : if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
12313 65868 : continue;
12314 51152 : unsigned n = SLP_TREE_VEC_DEFS (phi_node).length ();
12315 : /* Simply fill all args. */
12316 51152 : if (STMT_VINFO_DEF_TYPE (SLP_TREE_REPRESENTATIVE (phi_node))
12317 : != vect_first_order_recurrence)
12318 109985 : for (unsigned i = 0; i < n; ++i)
12319 : {
12320 58876 : tree phidef = SLP_TREE_VEC_DEFS (phi_node)[i];
12321 58876 : gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
12322 58876 : add_phi_arg (phi, vect_get_slp_vect_def (child, i),
12323 : e, gimple_phi_arg_location (phi, dest_idx));
12324 : }
12325 : else
12326 : {
12327 : /* Unless it is a first order recurrence which needs
12328 : args filled in for both the PHI node and the permutes. */
12329 43 : gimple *perm
12330 43 : = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[0]);
12331 43 : gimple *rphi = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (perm));
12332 43 : add_phi_arg (as_a <gphi *> (rphi),
12333 : vect_get_slp_vect_def (child, n - 1),
12334 : e, gimple_phi_arg_location (phi, dest_idx));
12335 123 : for (unsigned i = 0; i < n; ++i)
12336 : {
12337 80 : gimple *perm
12338 80 : = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[i]);
12339 80 : if (i > 0)
12340 37 : gimple_assign_set_rhs1 (perm,
12341 : vect_get_slp_vect_def (child, i - 1));
12342 80 : gimple_assign_set_rhs2 (perm,
12343 : vect_get_slp_vect_def (child, i));
12344 80 : update_stmt (perm);
12345 : }
12346 : }
12347 : }
12348 : }
12349 936947 : }
12350 :
12351 : /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
12352 :
12353 : void
12354 540591 : vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
12355 : {
12356 540591 : slp_instance instance;
12357 540591 : unsigned int i;
12358 :
12359 540591 : hash_map<slp_tree, slp_scc_info> scc_info;
12360 540591 : int maxdfs = 0;
12361 1119836 : FOR_EACH_VEC_ELT (slp_instances, i, instance)
12362 : {
12363 579245 : slp_tree node = SLP_INSTANCE_TREE (instance);
12364 579245 : if (dump_enabled_p ())
12365 : {
12366 15999 : dump_printf_loc (MSG_NOTE, vect_location,
12367 : "Vectorizing SLP tree:\n");
12368 : /* ??? Dump all? */
12369 15999 : if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
12370 449 : dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
12371 449 : SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
12372 15999 : vect_print_slp_graph (MSG_NOTE, vect_location,
12373 : SLP_INSTANCE_TREE (instance));
12374 : }
12375 : /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
12376 : have a PHI be the node breaking the cycle. */
12377 579245 : auto_vec<slp_tree> stack;
12378 579245 : if (!scc_info.get (node))
12379 579138 : vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
12380 :
12381 579245 : if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
12382 10896 : vectorize_slp_instance_root_stmt (vinfo, node, instance);
12383 :
12384 579245 : if (dump_enabled_p ())
12385 15999 : dump_printf_loc (MSG_NOTE, vect_location,
12386 : "vectorizing stmts using SLP.\n");
12387 579245 : }
12388 :
12389 1660427 : FOR_EACH_VEC_ELT (slp_instances, i, instance)
12390 : {
12391 579245 : slp_tree root = SLP_INSTANCE_TREE (instance);
12392 579245 : stmt_vec_info store_info;
12393 579245 : unsigned int j;
12394 :
12395 : /* Remove scalar call stmts. Do not do this for basic-block
12396 : vectorization as not all uses may be vectorized.
12397 : ??? Why should this be necessary? DCE should be able to
12398 : remove the stmts itself.
12399 : ??? For BB vectorization we can as well remove scalar
12400 : stmts starting from the SLP tree root if they have no
12401 : uses. */
12402 579245 : if (is_a <loop_vec_info> (vinfo))
12403 89487 : vect_remove_slp_scalar_calls (vinfo, root);
12404 :
12405 : /* Remove vectorized stores original scalar stmts. */
12406 2586703 : for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
12407 : {
12408 1464276 : if (!store_info
12409 1464262 : || !STMT_VINFO_DATA_REF (store_info)
12410 1436786 : || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
12411 : break;
12412 :
12413 1428213 : store_info = vect_orig_stmt (store_info);
12414 : /* Free the attached stmt_vec_info and remove the stmt. */
12415 1428213 : vinfo->remove_stmt (store_info);
12416 :
12417 : /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it
12418 : to not crash in vect_free_slp_tree later. */
12419 1428213 : if (SLP_TREE_REPRESENTATIVE (root) == store_info)
12420 542851 : SLP_TREE_REPRESENTATIVE (root) = NULL;
12421 : }
12422 : }
12423 540591 : }
|